zephyr-7b-dpo-lora-r16-20k / trainer_state.json
LaoRay's picture
Model save
8c58f70 verified
raw
history blame
138 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008,
"grad_norm": 1.3400768041610718,
"learning_rate": 4e-08,
"logits/chosen": -2.951728105545044,
"logits/rejected": -3.0115513801574707,
"logps/chosen": -261.50799560546875,
"logps/rejected": -337.26708984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 1.3155320882797241,
"learning_rate": 2.0000000000000002e-07,
"logits/chosen": -2.8931193351745605,
"logits/rejected": -2.8665506839752197,
"logps/chosen": -327.18511962890625,
"logps/rejected": -271.54595947265625,
"loss": 0.6934,
"rewards/accuracies": 0.359375,
"rewards/chosen": -0.0003679850487969816,
"rewards/margins": -0.0005117338732816279,
"rewards/rejected": 0.000143748868140392,
"step": 5
},
{
"epoch": 0.008,
"grad_norm": 1.4168583154678345,
"learning_rate": 4.0000000000000003e-07,
"logits/chosen": -2.8454272747039795,
"logits/rejected": -2.8244102001190186,
"logps/chosen": -278.81390380859375,
"logps/rejected": -225.78091430664062,
"loss": 0.6932,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0003080188180319965,
"rewards/margins": -0.00016189362213481218,
"rewards/rejected": -0.00014612523955293,
"step": 10
},
{
"epoch": 0.012,
"grad_norm": 1.4461805820465088,
"learning_rate": 6.000000000000001e-07,
"logits/chosen": -2.941542387008667,
"logits/rejected": -2.919604539871216,
"logps/chosen": -338.14361572265625,
"logps/rejected": -264.4473876953125,
"loss": 0.6931,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -9.753620543051511e-05,
"rewards/margins": 0.00017976768140215427,
"rewards/rejected": -0.0002773039450403303,
"step": 15
},
{
"epoch": 0.016,
"grad_norm": 1.218361735343933,
"learning_rate": 8.000000000000001e-07,
"logits/chosen": -2.844390392303467,
"logits/rejected": -2.8012917041778564,
"logps/chosen": -284.53179931640625,
"logps/rejected": -265.3224792480469,
"loss": 0.693,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.00011823275417555124,
"rewards/margins": 0.00027608583332039416,
"rewards/rejected": -0.00015785309369675815,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 1.0622657537460327,
"learning_rate": 1.0000000000000002e-06,
"logits/chosen": -2.919724941253662,
"logits/rejected": -2.8841071128845215,
"logps/chosen": -282.7057800292969,
"logps/rejected": -250.56005859375,
"loss": 0.693,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 3.2701333111617714e-05,
"rewards/margins": 0.00029431647271849215,
"rewards/rejected": -0.0002616152632981539,
"step": 25
},
{
"epoch": 0.024,
"grad_norm": 1.2840291261672974,
"learning_rate": 1.2000000000000002e-06,
"logits/chosen": -2.8690571784973145,
"logits/rejected": -2.8205409049987793,
"logps/chosen": -248.4199981689453,
"logps/rejected": -239.7508544921875,
"loss": 0.6933,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0002605341433081776,
"rewards/margins": -0.0002968462067656219,
"rewards/rejected": 0.0005573804955929518,
"step": 30
},
{
"epoch": 0.028,
"grad_norm": 1.4659631252288818,
"learning_rate": 1.4000000000000001e-06,
"logits/chosen": -2.819516181945801,
"logits/rejected": -2.8284599781036377,
"logps/chosen": -260.5746765136719,
"logps/rejected": -252.26657104492188,
"loss": 0.6933,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.00032235420076176524,
"rewards/margins": -0.00027956519625149667,
"rewards/rejected": -4.278900451026857e-05,
"step": 35
},
{
"epoch": 0.032,
"grad_norm": 1.641062617301941,
"learning_rate": 1.6000000000000001e-06,
"logits/chosen": -2.8422532081604004,
"logits/rejected": -2.8213276863098145,
"logps/chosen": -225.60000610351562,
"logps/rejected": -254.83389282226562,
"loss": 0.6925,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0003634319291450083,
"rewards/margins": 0.001209324225783348,
"rewards/rejected": -0.0008458923548460007,
"step": 40
},
{
"epoch": 0.036,
"grad_norm": 1.2335015535354614,
"learning_rate": 1.8000000000000001e-06,
"logits/chosen": -2.8927934169769287,
"logits/rejected": -2.895987033843994,
"logps/chosen": -262.75616455078125,
"logps/rejected": -258.01776123046875,
"loss": 0.6926,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -8.163524034898728e-05,
"rewards/margins": 0.0012018559500575066,
"rewards/rejected": -0.0012834911467507482,
"step": 45
},
{
"epoch": 0.04,
"grad_norm": 1.3828603029251099,
"learning_rate": 2.0000000000000003e-06,
"logits/chosen": -2.8105452060699463,
"logits/rejected": -2.766021966934204,
"logps/chosen": -246.88064575195312,
"logps/rejected": -221.18325805664062,
"loss": 0.6929,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.0005901859840378165,
"rewards/margins": 0.0004752330423798412,
"rewards/rejected": 0.00011495289800222963,
"step": 50
},
{
"epoch": 0.044,
"grad_norm": 1.1620622873306274,
"learning_rate": 2.2e-06,
"logits/chosen": -2.8504276275634766,
"logits/rejected": -2.830573558807373,
"logps/chosen": -289.9104919433594,
"logps/rejected": -304.9803771972656,
"loss": 0.6934,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0006987753440625966,
"rewards/margins": -0.0004663577419705689,
"rewards/rejected": -0.00023241760209202766,
"step": 55
},
{
"epoch": 0.048,
"grad_norm": 1.2738378047943115,
"learning_rate": 2.4000000000000003e-06,
"logits/chosen": -2.893800735473633,
"logits/rejected": -2.874782085418701,
"logps/chosen": -265.0617370605469,
"logps/rejected": -274.21246337890625,
"loss": 0.6922,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0009635955793783069,
"rewards/margins": 0.0018142672488465905,
"rewards/rejected": -0.0008506716112606227,
"step": 60
},
{
"epoch": 0.052,
"grad_norm": 1.0527548789978027,
"learning_rate": 2.6e-06,
"logits/chosen": -2.8645272254943848,
"logits/rejected": -2.840221881866455,
"logps/chosen": -242.2287139892578,
"logps/rejected": -258.8787536621094,
"loss": 0.6922,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.0008619900909252465,
"rewards/margins": 0.0019651155453175306,
"rewards/rejected": -0.0011031257454305887,
"step": 65
},
{
"epoch": 0.056,
"grad_norm": 1.2068345546722412,
"learning_rate": 2.8000000000000003e-06,
"logits/chosen": -2.8592796325683594,
"logits/rejected": -2.856304168701172,
"logps/chosen": -256.22979736328125,
"logps/rejected": -239.5323944091797,
"loss": 0.6919,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0025476592127233744,
"rewards/margins": 0.002555294893682003,
"rewards/rejected": -7.636076588823926e-06,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 1.4119035005569458,
"learning_rate": 3e-06,
"logits/chosen": -2.922961473464966,
"logits/rejected": -2.861196517944336,
"logps/chosen": -286.9571838378906,
"logps/rejected": -258.0143737792969,
"loss": 0.6918,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0024884731974452734,
"rewards/margins": 0.0027703498490154743,
"rewards/rejected": -0.00028187656425870955,
"step": 75
},
{
"epoch": 0.064,
"grad_norm": 1.4012054204940796,
"learning_rate": 3.2000000000000003e-06,
"logits/chosen": -2.8784067630767822,
"logits/rejected": -2.8731160163879395,
"logps/chosen": -257.916259765625,
"logps/rejected": -248.72305297851562,
"loss": 0.6912,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0049202474765479565,
"rewards/margins": 0.0038819201290607452,
"rewards/rejected": 0.0010383275803178549,
"step": 80
},
{
"epoch": 0.068,
"grad_norm": 1.4234269857406616,
"learning_rate": 3.4000000000000005e-06,
"logits/chosen": -2.9202146530151367,
"logits/rejected": -2.8719019889831543,
"logps/chosen": -307.8442077636719,
"logps/rejected": -268.5364990234375,
"loss": 0.6917,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.005258677992969751,
"rewards/margins": 0.0030069469939917326,
"rewards/rejected": 0.0022517309989780188,
"step": 85
},
{
"epoch": 0.072,
"grad_norm": 1.1752641201019287,
"learning_rate": 3.6000000000000003e-06,
"logits/chosen": -2.8795719146728516,
"logits/rejected": -2.8445372581481934,
"logps/chosen": -238.05691528320312,
"logps/rejected": -238.66940307617188,
"loss": 0.6924,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.00510067492723465,
"rewards/margins": 0.0015365609433501959,
"rewards/rejected": 0.003564114449545741,
"step": 90
},
{
"epoch": 0.076,
"grad_norm": 1.273596167564392,
"learning_rate": 3.8000000000000005e-06,
"logits/chosen": -2.8963799476623535,
"logits/rejected": -2.899864673614502,
"logps/chosen": -268.2062683105469,
"logps/rejected": -242.0111083984375,
"loss": 0.6902,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.007857006974518299,
"rewards/margins": 0.005892972461879253,
"rewards/rejected": 0.0019640345126390457,
"step": 95
},
{
"epoch": 0.08,
"grad_norm": 1.2360827922821045,
"learning_rate": 4.000000000000001e-06,
"logits/chosen": -2.871992588043213,
"logits/rejected": -2.8619043827056885,
"logps/chosen": -292.5274353027344,
"logps/rejected": -255.6526641845703,
"loss": 0.6899,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.009717302396893501,
"rewards/margins": 0.006645149551331997,
"rewards/rejected": 0.0030721533112227917,
"step": 100
},
{
"epoch": 0.08,
"eval_logits/chosen": -2.889031171798706,
"eval_logits/rejected": -2.8468213081359863,
"eval_logps/chosen": -282.2605285644531,
"eval_logps/rejected": -247.75430297851562,
"eval_loss": 0.6897016167640686,
"eval_rewards/accuracies": 0.6666666865348816,
"eval_rewards/chosen": 0.009775782003998756,
"eval_rewards/margins": 0.007023118901997805,
"eval_rewards/rejected": 0.0027526640333235264,
"eval_runtime": 166.8346,
"eval_samples_per_second": 2.997,
"eval_steps_per_second": 0.378,
"step": 100
},
{
"epoch": 0.084,
"grad_norm": 1.3187963962554932,
"learning_rate": 4.2000000000000004e-06,
"logits/chosen": -2.8663318157196045,
"logits/rejected": -2.8205113410949707,
"logps/chosen": -272.2581481933594,
"logps/rejected": -261.42620849609375,
"loss": 0.6907,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.00842782761901617,
"rewards/margins": 0.005024894140660763,
"rewards/rejected": 0.0034029334783554077,
"step": 105
},
{
"epoch": 0.088,
"grad_norm": 1.2544078826904297,
"learning_rate": 4.4e-06,
"logits/chosen": -2.9372715950012207,
"logits/rejected": -2.9057135581970215,
"logps/chosen": -251.8219757080078,
"logps/rejected": -246.0946044921875,
"loss": 0.6914,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.009438835084438324,
"rewards/margins": 0.0036361501552164555,
"rewards/rejected": 0.0058026849292218685,
"step": 110
},
{
"epoch": 0.092,
"grad_norm": 3.0026137828826904,
"learning_rate": 4.600000000000001e-06,
"logits/chosen": -2.827087879180908,
"logits/rejected": -2.816584348678589,
"logps/chosen": -225.01516723632812,
"logps/rejected": -294.75274658203125,
"loss": 0.6868,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.012459425255656242,
"rewards/margins": 0.012965649366378784,
"rewards/rejected": -0.0005062236450612545,
"step": 115
},
{
"epoch": 0.096,
"grad_norm": 3.147055149078369,
"learning_rate": 4.800000000000001e-06,
"logits/chosen": -2.7388131618499756,
"logits/rejected": -2.748465061187744,
"logps/chosen": -275.8075866699219,
"logps/rejected": -249.1244659423828,
"loss": 0.6885,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.011224482208490372,
"rewards/margins": 0.009682848118245602,
"rewards/rejected": 0.0015416343230754137,
"step": 120
},
{
"epoch": 0.1,
"grad_norm": 1.534111738204956,
"learning_rate": 5e-06,
"logits/chosen": -2.941195487976074,
"logits/rejected": -2.924978494644165,
"logps/chosen": -310.167236328125,
"logps/rejected": -280.0481262207031,
"loss": 0.6872,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.011911705136299133,
"rewards/margins": 0.012096909806132317,
"rewards/rejected": -0.00018520592129789293,
"step": 125
},
{
"epoch": 0.104,
"grad_norm": 1.4911248683929443,
"learning_rate": 4.999756310023261e-06,
"logits/chosen": -2.8891565799713135,
"logits/rejected": -2.896601915359497,
"logps/chosen": -286.2426452636719,
"logps/rejected": -309.3197021484375,
"loss": 0.6869,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.009015440009534359,
"rewards/margins": 0.012755987234413624,
"rewards/rejected": -0.003740546526387334,
"step": 130
},
{
"epoch": 0.108,
"grad_norm": 1.4180755615234375,
"learning_rate": 4.999025287600886e-06,
"logits/chosen": -2.8916049003601074,
"logits/rejected": -2.9071428775787354,
"logps/chosen": -274.48236083984375,
"logps/rejected": -265.49786376953125,
"loss": 0.6811,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.011777431704103947,
"rewards/margins": 0.024640800431370735,
"rewards/rejected": -0.012863369658589363,
"step": 135
},
{
"epoch": 0.112,
"grad_norm": 1.3050034046173096,
"learning_rate": 4.997807075247147e-06,
"logits/chosen": -2.8806416988372803,
"logits/rejected": -2.8594279289245605,
"logps/chosen": -247.0726776123047,
"logps/rejected": -236.9187774658203,
"loss": 0.6891,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.005051865242421627,
"rewards/margins": 0.008825790137052536,
"rewards/rejected": -0.013877655379474163,
"step": 140
},
{
"epoch": 0.116,
"grad_norm": 1.3001933097839355,
"learning_rate": 4.996101910454953e-06,
"logits/chosen": -2.903634548187256,
"logits/rejected": -2.859711170196533,
"logps/chosen": -273.8101806640625,
"logps/rejected": -244.11074829101562,
"loss": 0.6801,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": 0.0007423794595524669,
"rewards/margins": 0.026909640058875084,
"rewards/rejected": -0.026167264208197594,
"step": 145
},
{
"epoch": 0.12,
"grad_norm": 1.671297550201416,
"learning_rate": 4.993910125649561e-06,
"logits/chosen": -2.891292095184326,
"logits/rejected": -2.856261968612671,
"logps/chosen": -293.83563232421875,
"logps/rejected": -247.8043975830078,
"loss": 0.6803,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.01020820252597332,
"rewards/margins": 0.026513541117310524,
"rewards/rejected": -0.016305336728692055,
"step": 150
},
{
"epoch": 0.124,
"grad_norm": 1.5288795232772827,
"learning_rate": 4.9912321481237616e-06,
"logits/chosen": -2.778376340866089,
"logits/rejected": -2.774121046066284,
"logps/chosen": -231.49319458007812,
"logps/rejected": -290.89337158203125,
"loss": 0.6837,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.00030907365726307034,
"rewards/margins": 0.019828204065561295,
"rewards/rejected": -0.019519129768013954,
"step": 155
},
{
"epoch": 0.128,
"grad_norm": 1.5330896377563477,
"learning_rate": 4.988068499954578e-06,
"logits/chosen": -2.889814853668213,
"logits/rejected": -2.888610601425171,
"logps/chosen": -316.81927490234375,
"logps/rejected": -312.25006103515625,
"loss": 0.6715,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.031255967915058136,
"rewards/margins": 0.04561912640929222,
"rewards/rejected": -0.01436315942555666,
"step": 160
},
{
"epoch": 0.132,
"grad_norm": 1.637596607208252,
"learning_rate": 4.984419797901491e-06,
"logits/chosen": -2.922788143157959,
"logits/rejected": -2.911243438720703,
"logps/chosen": -311.63836669921875,
"logps/rejected": -282.0634765625,
"loss": 0.6705,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.031035322695970535,
"rewards/margins": 0.04701067879796028,
"rewards/rejected": -0.015975359827280045,
"step": 165
},
{
"epoch": 0.136,
"grad_norm": 1.531761884689331,
"learning_rate": 4.980286753286196e-06,
"logits/chosen": -2.9153621196746826,
"logits/rejected": -2.9075608253479004,
"logps/chosen": -275.50396728515625,
"logps/rejected": -273.3793029785156,
"loss": 0.6759,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.0289138313382864,
"rewards/margins": 0.0373816192150116,
"rewards/rejected": -0.008467786945402622,
"step": 170
},
{
"epoch": 0.14,
"grad_norm": 1.562333106994629,
"learning_rate": 4.975670171853926e-06,
"logits/chosen": -2.881091833114624,
"logits/rejected": -2.8206849098205566,
"logps/chosen": -268.7303161621094,
"logps/rejected": -241.11801147460938,
"loss": 0.6727,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.01539912074804306,
"rewards/margins": 0.04456932842731476,
"rewards/rejected": -0.02917020581662655,
"step": 175
},
{
"epoch": 0.144,
"grad_norm": 1.5452988147735596,
"learning_rate": 4.970570953616383e-06,
"logits/chosen": -2.870706558227539,
"logits/rejected": -2.846757173538208,
"logps/chosen": -271.70098876953125,
"logps/rejected": -250.15017700195312,
"loss": 0.6579,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": 0.027496661990880966,
"rewards/margins": 0.07495806366205215,
"rewards/rejected": -0.04746139422059059,
"step": 180
},
{
"epoch": 0.148,
"grad_norm": 1.711881160736084,
"learning_rate": 4.964990092676263e-06,
"logits/chosen": -2.8256664276123047,
"logits/rejected": -2.8229262828826904,
"logps/chosen": -272.4619140625,
"logps/rejected": -226.0482177734375,
"loss": 0.6773,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.000382797239581123,
"rewards/margins": 0.03519537299871445,
"rewards/rejected": -0.0355781726539135,
"step": 185
},
{
"epoch": 0.152,
"grad_norm": 1.8593279123306274,
"learning_rate": 4.958928677033465e-06,
"logits/chosen": -2.8317179679870605,
"logits/rejected": -2.820038318634033,
"logps/chosen": -276.53924560546875,
"logps/rejected": -289.26007080078125,
"loss": 0.6639,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.008795881643891335,
"rewards/margins": 0.06358983367681503,
"rewards/rejected": -0.05479395389556885,
"step": 190
},
{
"epoch": 0.156,
"grad_norm": 1.802320957183838,
"learning_rate": 4.9523878883729794e-06,
"logits/chosen": -2.876426935195923,
"logits/rejected": -2.851534128189087,
"logps/chosen": -288.3893737792969,
"logps/rejected": -255.09683227539062,
"loss": 0.6564,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.004248014185577631,
"rewards/margins": 0.07926348596811295,
"rewards/rejected": -0.07501547038555145,
"step": 195
},
{
"epoch": 0.16,
"grad_norm": 1.8666610717773438,
"learning_rate": 4.9453690018345144e-06,
"logits/chosen": -2.852238416671753,
"logits/rejected": -2.8293018341064453,
"logps/chosen": -255.58560180664062,
"logps/rejected": -257.3184814453125,
"loss": 0.6532,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0025687548331916332,
"rewards/margins": 0.08859384059906006,
"rewards/rejected": -0.09116257727146149,
"step": 200
},
{
"epoch": 0.16,
"eval_logits/chosen": -2.878232717514038,
"eval_logits/rejected": -2.8385584354400635,
"eval_logps/chosen": -284.5143127441406,
"eval_logps/rejected": -257.5306091308594,
"eval_loss": 0.6568659543991089,
"eval_rewards/accuracies": 0.6884920597076416,
"eval_rewards/chosen": -0.012762677855789661,
"eval_rewards/margins": 0.08224756270647049,
"eval_rewards/rejected": -0.09501024335622787,
"eval_runtime": 166.7797,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 0.378,
"step": 200
},
{
"epoch": 0.164,
"grad_norm": 1.9332078695297241,
"learning_rate": 4.937873385763909e-06,
"logits/chosen": -2.8655571937561035,
"logits/rejected": -2.8335084915161133,
"logps/chosen": -287.10076904296875,
"logps/rejected": -284.3404846191406,
"loss": 0.6582,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.033930666744709015,
"rewards/margins": 0.07983305305242538,
"rewards/rejected": -0.1137637123465538,
"step": 205
},
{
"epoch": 0.168,
"grad_norm": 1.877467393875122,
"learning_rate": 4.9299025014463665e-06,
"logits/chosen": -2.879312038421631,
"logits/rejected": -2.862196445465088,
"logps/chosen": -248.899169921875,
"logps/rejected": -245.27310180664062,
"loss": 0.6704,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.04507671296596527,
"rewards/margins": 0.054625023156404495,
"rewards/rejected": -0.09970173239707947,
"step": 210
},
{
"epoch": 0.172,
"grad_norm": 1.8854491710662842,
"learning_rate": 4.921457902821578e-06,
"logits/chosen": -2.8618056774139404,
"logits/rejected": -2.8050172328948975,
"logps/chosen": -316.2086181640625,
"logps/rejected": -286.01239013671875,
"loss": 0.6661,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.017241844907402992,
"rewards/margins": 0.06764046102762222,
"rewards/rejected": -0.08488230407238007,
"step": 215
},
{
"epoch": 0.176,
"grad_norm": 2.2946383953094482,
"learning_rate": 4.912541236180779e-06,
"logits/chosen": -2.7987911701202393,
"logits/rejected": -2.76237154006958,
"logps/chosen": -325.50177001953125,
"logps/rejected": -316.89739990234375,
"loss": 0.642,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.016762247309088707,
"rewards/margins": 0.11744923889636993,
"rewards/rejected": -0.1342114955186844,
"step": 220
},
{
"epoch": 0.18,
"grad_norm": 1.7292786836624146,
"learning_rate": 4.903154239845798e-06,
"logits/chosen": -2.8847053050994873,
"logits/rejected": -2.825892210006714,
"logps/chosen": -271.9214172363281,
"logps/rejected": -247.08193969726562,
"loss": 0.6482,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06987594068050385,
"rewards/margins": 0.10394857078790665,
"rewards/rejected": -0.1738245040178299,
"step": 225
},
{
"epoch": 0.184,
"grad_norm": 2.149097204208374,
"learning_rate": 4.893298743830168e-06,
"logits/chosen": -2.792332172393799,
"logits/rejected": -2.80527925491333,
"logps/chosen": -302.56390380859375,
"logps/rejected": -302.79840087890625,
"loss": 0.6306,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.04528792202472687,
"rewards/margins": 0.1434146910905838,
"rewards/rejected": -0.18870261311531067,
"step": 230
},
{
"epoch": 0.188,
"grad_norm": 2.5504279136657715,
"learning_rate": 4.882976669482368e-06,
"logits/chosen": -2.8090176582336426,
"logits/rejected": -2.7789652347564697,
"logps/chosen": -274.94342041015625,
"logps/rejected": -279.92120361328125,
"loss": 0.6433,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0906161516904831,
"rewards/margins": 0.124458909034729,
"rewards/rejected": -0.2150750607252121,
"step": 235
},
{
"epoch": 0.192,
"grad_norm": 2.925840377807617,
"learning_rate": 4.8721900291112415e-06,
"logits/chosen": -2.8581314086914062,
"logits/rejected": -2.837096691131592,
"logps/chosen": -290.9739685058594,
"logps/rejected": -275.4525451660156,
"loss": 0.6432,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.015353793278336525,
"rewards/margins": 0.11724452674388885,
"rewards/rejected": -0.13259831070899963,
"step": 240
},
{
"epoch": 0.196,
"grad_norm": 2.329665184020996,
"learning_rate": 4.860940925593703e-06,
"logits/chosen": -2.878603458404541,
"logits/rejected": -2.8466429710388184,
"logps/chosen": -288.50494384765625,
"logps/rejected": -274.239990234375,
"loss": 0.6374,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.012451673857867718,
"rewards/margins": 0.13850674033164978,
"rewards/rejected": -0.1260550618171692,
"step": 245
},
{
"epoch": 0.2,
"grad_norm": 2.1521079540252686,
"learning_rate": 4.849231551964771e-06,
"logits/chosen": -2.8556203842163086,
"logits/rejected": -2.82784104347229,
"logps/chosen": -254.6340789794922,
"logps/rejected": -242.66726684570312,
"loss": 0.6548,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.016969427466392517,
"rewards/margins": 0.09558813273906708,
"rewards/rejected": -0.112557552754879,
"step": 250
},
{
"epoch": 0.204,
"grad_norm": 3.7580788135528564,
"learning_rate": 4.837064190990036e-06,
"logits/chosen": -2.7907662391662598,
"logits/rejected": -2.8043570518493652,
"logps/chosen": -287.1578674316406,
"logps/rejected": -284.9073181152344,
"loss": 0.6471,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.06225704029202461,
"rewards/margins": 0.11668694019317627,
"rewards/rejected": -0.17894400656223297,
"step": 255
},
{
"epoch": 0.208,
"grad_norm": 3.2057318687438965,
"learning_rate": 4.824441214720629e-06,
"logits/chosen": -2.8258254528045654,
"logits/rejected": -2.838768720626831,
"logps/chosen": -331.3340148925781,
"logps/rejected": -295.8611145019531,
"loss": 0.6624,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12479463964700699,
"rewards/margins": 0.08991553634405136,
"rewards/rejected": -0.21471016108989716,
"step": 260
},
{
"epoch": 0.212,
"grad_norm": 5.250330448150635,
"learning_rate": 4.811365084030784e-06,
"logits/chosen": -2.788186550140381,
"logits/rejected": -2.737650156021118,
"logps/chosen": -240.7392578125,
"logps/rejected": -258.4285583496094,
"loss": 0.6295,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.1400509476661682,
"rewards/margins": 0.14637869596481323,
"rewards/rejected": -0.28642964363098145,
"step": 265
},
{
"epoch": 0.216,
"grad_norm": 4.085949420928955,
"learning_rate": 4.7978383481380865e-06,
"logits/chosen": -2.8263564109802246,
"logits/rejected": -2.82792592048645,
"logps/chosen": -284.7565002441406,
"logps/rejected": -326.1434020996094,
"loss": 0.6284,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.12767954170703888,
"rewards/margins": 0.17385998368263245,
"rewards/rejected": -0.30153951048851013,
"step": 270
},
{
"epoch": 0.22,
"grad_norm": 3.1048779487609863,
"learning_rate": 4.783863644106502e-06,
"logits/chosen": -2.881112575531006,
"logits/rejected": -2.87247896194458,
"logps/chosen": -279.97052001953125,
"logps/rejected": -273.80780029296875,
"loss": 0.6366,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.1319071650505066,
"rewards/margins": 0.14382150769233704,
"rewards/rejected": -0.2757287323474884,
"step": 275
},
{
"epoch": 0.224,
"grad_norm": 3.2117135524749756,
"learning_rate": 4.769443696332272e-06,
"logits/chosen": -2.8749935626983643,
"logits/rejected": -2.844726085662842,
"logps/chosen": -292.9136657714844,
"logps/rejected": -294.96185302734375,
"loss": 0.629,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09047095477581024,
"rewards/margins": 0.1656198650598526,
"rewards/rejected": -0.25609081983566284,
"step": 280
},
{
"epoch": 0.228,
"grad_norm": 3.0179073810577393,
"learning_rate": 4.754581316012785e-06,
"logits/chosen": -2.8741941452026367,
"logits/rejected": -2.799834728240967,
"logps/chosen": -323.2701721191406,
"logps/rejected": -299.8788146972656,
"loss": 0.5984,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.11364835500717163,
"rewards/margins": 0.2376987189054489,
"rewards/rejected": -0.35134708881378174,
"step": 285
},
{
"epoch": 0.232,
"grad_norm": 3.1179349422454834,
"learning_rate": 4.7392794005985324e-06,
"logits/chosen": -2.801036834716797,
"logits/rejected": -2.793466329574585,
"logps/chosen": -293.9784240722656,
"logps/rejected": -272.1210021972656,
"loss": 0.5962,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.23425976932048798,
"rewards/margins": 0.23078274726867676,
"rewards/rejected": -0.4650425314903259,
"step": 290
},
{
"epoch": 0.236,
"grad_norm": 3.5481436252593994,
"learning_rate": 4.723540933228245e-06,
"logits/chosen": -2.8212785720825195,
"logits/rejected": -2.7978832721710205,
"logps/chosen": -327.6084899902344,
"logps/rejected": -320.15106201171875,
"loss": 0.6612,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.32292693853378296,
"rewards/margins": 0.1046195775270462,
"rewards/rejected": -0.42754650115966797,
"step": 295
},
{
"epoch": 0.24,
"grad_norm": 4.389492034912109,
"learning_rate": 4.707368982147318e-06,
"logits/chosen": -2.8768062591552734,
"logits/rejected": -2.8266239166259766,
"logps/chosen": -329.6361083984375,
"logps/rejected": -282.6575927734375,
"loss": 0.6372,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2786393463611603,
"rewards/margins": 0.155739888548851,
"rewards/rejected": -0.4343792498111725,
"step": 300
},
{
"epoch": 0.24,
"eval_logits/chosen": -2.84016752243042,
"eval_logits/rejected": -2.803346872329712,
"eval_logps/chosen": -307.0444030761719,
"eval_logps/rejected": -292.0920715332031,
"eval_loss": 0.6181342005729675,
"eval_rewards/accuracies": 0.682539701461792,
"eval_rewards/chosen": -0.2380632609128952,
"eval_rewards/margins": 0.20256145298480988,
"eval_rewards/rejected": -0.4406247138977051,
"eval_runtime": 166.7743,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 0.378,
"step": 300
},
{
"epoch": 0.244,
"grad_norm": 6.237858295440674,
"learning_rate": 4.690766700109659e-06,
"logits/chosen": -2.813170909881592,
"logits/rejected": -2.765450954437256,
"logps/chosen": -251.8035430908203,
"logps/rejected": -226.10787963867188,
"loss": 0.6377,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.28489404916763306,
"rewards/margins": 0.15168778598308563,
"rewards/rejected": -0.4365817904472351,
"step": 305
},
{
"epoch": 0.248,
"grad_norm": 2.941599130630493,
"learning_rate": 4.673737323763048e-06,
"logits/chosen": -2.8621535301208496,
"logits/rejected": -2.883449077606201,
"logps/chosen": -323.72625732421875,
"logps/rejected": -309.9552307128906,
"loss": 0.5975,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.2496490776538849,
"rewards/margins": 0.24661684036254883,
"rewards/rejected": -0.4962659478187561,
"step": 310
},
{
"epoch": 0.252,
"grad_norm": 2.81584095954895,
"learning_rate": 4.656284173018144e-06,
"logits/chosen": -2.7917304039001465,
"logits/rejected": -2.771953821182251,
"logps/chosen": -305.7306213378906,
"logps/rejected": -337.7940979003906,
"loss": 0.6245,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.24593646824359894,
"rewards/margins": 0.17785824835300446,
"rewards/rejected": -0.423794686794281,
"step": 315
},
{
"epoch": 0.256,
"grad_norm": 3.657536029815674,
"learning_rate": 4.638410650401267e-06,
"logits/chosen": -2.863358974456787,
"logits/rejected": -2.8709418773651123,
"logps/chosen": -308.0937805175781,
"logps/rejected": -322.3440246582031,
"loss": 0.6189,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.19167309999465942,
"rewards/margins": 0.20849671959877014,
"rewards/rejected": -0.40016984939575195,
"step": 320
},
{
"epoch": 0.26,
"grad_norm": 3.2525851726531982,
"learning_rate": 4.620120240391065e-06,
"logits/chosen": -2.8361878395080566,
"logits/rejected": -2.8604865074157715,
"logps/chosen": -331.04949951171875,
"logps/rejected": -306.60662841796875,
"loss": 0.612,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.15142570436000824,
"rewards/margins": 0.23161661624908447,
"rewards/rejected": -0.3830423355102539,
"step": 325
},
{
"epoch": 0.264,
"grad_norm": 3.2161409854888916,
"learning_rate": 4.601416508739211e-06,
"logits/chosen": -2.765329360961914,
"logits/rejected": -2.731293201446533,
"logps/chosen": -294.65509033203125,
"logps/rejected": -288.2440185546875,
"loss": 0.6113,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1889929473400116,
"rewards/margins": 0.23026308417320251,
"rewards/rejected": -0.4192560315132141,
"step": 330
},
{
"epoch": 0.268,
"grad_norm": 4.34539270401001,
"learning_rate": 4.582303101775249e-06,
"logits/chosen": -2.773538112640381,
"logits/rejected": -2.750394582748413,
"logps/chosen": -301.92291259765625,
"logps/rejected": -276.76275634765625,
"loss": 0.6137,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.25806164741516113,
"rewards/margins": 0.237727090716362,
"rewards/rejected": -0.49578872323036194,
"step": 335
},
{
"epoch": 0.272,
"grad_norm": 2.9809610843658447,
"learning_rate": 4.562783745695738e-06,
"logits/chosen": -2.7601351737976074,
"logits/rejected": -2.805574893951416,
"logps/chosen": -213.38693237304688,
"logps/rejected": -248.6228790283203,
"loss": 0.6131,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.2839365601539612,
"rewards/margins": 0.2204209268093109,
"rewards/rejected": -0.5043575167655945,
"step": 340
},
{
"epoch": 0.276,
"grad_norm": 3.7868945598602295,
"learning_rate": 4.542862245837821e-06,
"logits/chosen": -2.862086296081543,
"logits/rejected": -2.80869722366333,
"logps/chosen": -326.58392333984375,
"logps/rejected": -329.5889587402344,
"loss": 0.5811,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.23613891005516052,
"rewards/margins": 0.30714207887649536,
"rewards/rejected": -0.5432809591293335,
"step": 345
},
{
"epoch": 0.28,
"grad_norm": 4.723974227905273,
"learning_rate": 4.522542485937369e-06,
"logits/chosen": -2.723212242126465,
"logits/rejected": -2.6936533451080322,
"logps/chosen": -267.9949951171875,
"logps/rejected": -286.03448486328125,
"loss": 0.6194,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3070460557937622,
"rewards/margins": 0.2262255847454071,
"rewards/rejected": -0.5332716703414917,
"step": 350
},
{
"epoch": 0.284,
"grad_norm": 3.612205982208252,
"learning_rate": 4.501828427371834e-06,
"logits/chosen": -2.8160369396209717,
"logits/rejected": -2.7678263187408447,
"logps/chosen": -276.889892578125,
"logps/rejected": -262.567138671875,
"loss": 0.6269,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2459622323513031,
"rewards/margins": 0.21559634804725647,
"rewards/rejected": -0.4615585207939148,
"step": 355
},
{
"epoch": 0.288,
"grad_norm": 4.156825065612793,
"learning_rate": 4.4807241083879774e-06,
"logits/chosen": -2.8309903144836426,
"logits/rejected": -2.848707914352417,
"logps/chosen": -298.7277526855469,
"logps/rejected": -328.09912109375,
"loss": 0.6177,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.27550002932548523,
"rewards/margins": 0.23323087394237518,
"rewards/rejected": -0.508730947971344,
"step": 360
},
{
"epoch": 0.292,
"grad_norm": 4.314282417297363,
"learning_rate": 4.4592336433146e-06,
"logits/chosen": -2.811722755432129,
"logits/rejected": -2.807515859603882,
"logps/chosen": -309.4399719238281,
"logps/rejected": -314.6178283691406,
"loss": 0.6153,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3844314217567444,
"rewards/margins": 0.22192268073558807,
"rewards/rejected": -0.6063541173934937,
"step": 365
},
{
"epoch": 0.296,
"grad_norm": 4.635516166687012,
"learning_rate": 4.437361221760449e-06,
"logits/chosen": -2.850919485092163,
"logits/rejected": -2.8320136070251465,
"logps/chosen": -316.4634704589844,
"logps/rejected": -295.4265441894531,
"loss": 0.5943,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.32734400033950806,
"rewards/margins": 0.2893194556236267,
"rewards/rejected": -0.61666339635849,
"step": 370
},
{
"epoch": 0.3,
"grad_norm": 4.108780384063721,
"learning_rate": 4.415111107797445e-06,
"logits/chosen": -2.763192892074585,
"logits/rejected": -2.6753077507019043,
"logps/chosen": -304.7072448730469,
"logps/rejected": -296.2711486816406,
"loss": 0.6325,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.277075856924057,
"rewards/margins": 0.20892643928527832,
"rewards/rejected": -0.48600226640701294,
"step": 375
},
{
"epoch": 0.304,
"grad_norm": 2.956279993057251,
"learning_rate": 4.3924876391293915e-06,
"logits/chosen": -2.7950615882873535,
"logits/rejected": -2.7592384815216064,
"logps/chosen": -273.1991271972656,
"logps/rejected": -275.79229736328125,
"loss": 0.6129,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.13601061701774597,
"rewards/margins": 0.22439420223236084,
"rewards/rejected": -0.3604048192501068,
"step": 380
},
{
"epoch": 0.308,
"grad_norm": 4.006164073944092,
"learning_rate": 4.36949522624633e-06,
"logits/chosen": -2.830416202545166,
"logits/rejected": -2.8047218322753906,
"logps/chosen": -323.8509826660156,
"logps/rejected": -308.55230712890625,
"loss": 0.5875,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.10837619006633759,
"rewards/margins": 0.28125640749931335,
"rewards/rejected": -0.38963261246681213,
"step": 385
},
{
"epoch": 0.312,
"grad_norm": 6.139017581939697,
"learning_rate": 4.346138351564711e-06,
"logits/chosen": -2.8317887783050537,
"logits/rejected": -2.7582955360412598,
"logps/chosen": -362.7658996582031,
"logps/rejected": -310.333984375,
"loss": 0.6309,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.20606637001037598,
"rewards/margins": 0.2002502977848053,
"rewards/rejected": -0.4063166677951813,
"step": 390
},
{
"epoch": 0.316,
"grad_norm": 4.839846134185791,
"learning_rate": 4.322421568553529e-06,
"logits/chosen": -2.848759174346924,
"logits/rejected": -2.7962448596954346,
"logps/chosen": -382.81048583984375,
"logps/rejected": -339.45672607421875,
"loss": 0.6138,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1714310199022293,
"rewards/margins": 0.22842545807361603,
"rewards/rejected": -0.39985641837120056,
"step": 395
},
{
"epoch": 0.32,
"grad_norm": 3.8282814025878906,
"learning_rate": 4.2983495008466285e-06,
"logits/chosen": -2.8639044761657715,
"logits/rejected": -2.8238472938537598,
"logps/chosen": -317.0664367675781,
"logps/rejected": -313.5345458984375,
"loss": 0.5699,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1875368058681488,
"rewards/margins": 0.35046523809432983,
"rewards/rejected": -0.538002073764801,
"step": 400
},
{
"epoch": 0.32,
"eval_logits/chosen": -2.831890344619751,
"eval_logits/rejected": -2.795173168182373,
"eval_logps/chosen": -309.8138427734375,
"eval_logps/rejected": -301.8563232421875,
"eval_loss": 0.6034325957298279,
"eval_rewards/accuracies": 0.6964285969734192,
"eval_rewards/chosen": -0.2657574713230133,
"eval_rewards/margins": 0.27250993251800537,
"eval_rewards/rejected": -0.5382674336433411,
"eval_runtime": 166.7653,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 0.378,
"step": 400
},
{
"epoch": 0.324,
"grad_norm": 4.6939778327941895,
"learning_rate": 4.273926841341303e-06,
"logits/chosen": -2.8153679370880127,
"logits/rejected": -2.797407388687134,
"logps/chosen": -267.6861877441406,
"logps/rejected": -296.2272033691406,
"loss": 0.6146,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2948915362358093,
"rewards/margins": 0.27376502752304077,
"rewards/rejected": -0.5686565637588501,
"step": 405
},
{
"epoch": 0.328,
"grad_norm": 5.867495059967041,
"learning_rate": 4.249158351283414e-06,
"logits/chosen": -2.8131103515625,
"logits/rejected": -2.7752747535705566,
"logps/chosen": -296.8202209472656,
"logps/rejected": -309.84991455078125,
"loss": 0.6198,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3828127086162567,
"rewards/margins": 0.27066582441329956,
"rewards/rejected": -0.6534786224365234,
"step": 410
},
{
"epoch": 0.332,
"grad_norm": 3.4944844245910645,
"learning_rate": 4.224048859339175e-06,
"logits/chosen": -2.7919559478759766,
"logits/rejected": -2.7731316089630127,
"logps/chosen": -320.2292175292969,
"logps/rejected": -313.79827880859375,
"loss": 0.5827,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.28928855061531067,
"rewards/margins": 0.3156905770301819,
"rewards/rejected": -0.6049790978431702,
"step": 415
},
{
"epoch": 0.336,
"grad_norm": 6.125190734863281,
"learning_rate": 4.198603260653792e-06,
"logits/chosen": -2.8130970001220703,
"logits/rejected": -2.7901930809020996,
"logps/chosen": -317.51165771484375,
"logps/rejected": -293.3788146972656,
"loss": 0.6275,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.2976795434951782,
"rewards/margins": 0.21939225494861603,
"rewards/rejected": -0.5170717239379883,
"step": 420
},
{
"epoch": 0.34,
"grad_norm": 4.455983638763428,
"learning_rate": 4.172826515897146e-06,
"logits/chosen": -2.8195388317108154,
"logits/rejected": -2.780494451522827,
"logps/chosen": -283.6789245605469,
"logps/rejected": -300.58563232421875,
"loss": 0.572,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.23819419741630554,
"rewards/margins": 0.36092180013656616,
"rewards/rejected": -0.5991159677505493,
"step": 425
},
{
"epoch": 0.344,
"grad_norm": 3.734440326690674,
"learning_rate": 4.146723650296701e-06,
"logits/chosen": -2.8214731216430664,
"logits/rejected": -2.80680775642395,
"logps/chosen": -305.1948547363281,
"logps/rejected": -301.62579345703125,
"loss": 0.603,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.25884318351745605,
"rewards/margins": 0.2811453640460968,
"rewards/rejected": -0.5399885773658752,
"step": 430
},
{
"epoch": 0.348,
"grad_norm": 3.1842479705810547,
"learning_rate": 4.120299752657828e-06,
"logits/chosen": -2.799774169921875,
"logits/rejected": -2.790123462677002,
"logps/chosen": -309.83477783203125,
"logps/rejected": -303.22332763671875,
"loss": 0.5892,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20615491271018982,
"rewards/margins": 0.3112487494945526,
"rewards/rejected": -0.5174037218093872,
"step": 435
},
{
"epoch": 0.352,
"grad_norm": 6.865662574768066,
"learning_rate": 4.093559974371725e-06,
"logits/chosen": -2.806478977203369,
"logits/rejected": -2.8173699378967285,
"logps/chosen": -312.33978271484375,
"logps/rejected": -340.7373046875,
"loss": 0.5898,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.22585222125053406,
"rewards/margins": 0.3283933103084564,
"rewards/rejected": -0.5542455911636353,
"step": 440
},
{
"epoch": 0.356,
"grad_norm": 5.557225704193115,
"learning_rate": 4.066509528411151e-06,
"logits/chosen": -2.7204031944274902,
"logits/rejected": -2.679771900177002,
"logps/chosen": -277.78057861328125,
"logps/rejected": -310.7375183105469,
"loss": 0.5563,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.30937278270721436,
"rewards/margins": 0.39451706409454346,
"rewards/rejected": -0.7038899660110474,
"step": 445
},
{
"epoch": 0.36,
"grad_norm": 6.662594795227051,
"learning_rate": 4.039153688314146e-06,
"logits/chosen": -2.8505125045776367,
"logits/rejected": -2.7928390502929688,
"logps/chosen": -344.05902099609375,
"logps/rejected": -324.29962158203125,
"loss": 0.596,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.43934765458106995,
"rewards/margins": 0.31155315041542053,
"rewards/rejected": -0.7509008049964905,
"step": 450
},
{
"epoch": 0.364,
"grad_norm": 5.561422348022461,
"learning_rate": 4.011497787155938e-06,
"logits/chosen": -2.759361743927002,
"logits/rejected": -2.697282314300537,
"logps/chosen": -333.2874755859375,
"logps/rejected": -329.25225830078125,
"loss": 0.5838,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.529011607170105,
"rewards/margins": 0.3510381579399109,
"rewards/rejected": -0.8800498247146606,
"step": 455
},
{
"epoch": 0.368,
"grad_norm": 4.668981075286865,
"learning_rate": 3.983547216509254e-06,
"logits/chosen": -2.8310632705688477,
"logits/rejected": -2.7877674102783203,
"logps/chosen": -384.951904296875,
"logps/rejected": -339.9574890136719,
"loss": 0.571,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5116842985153198,
"rewards/margins": 0.3813716769218445,
"rewards/rejected": -0.8930560946464539,
"step": 460
},
{
"epoch": 0.372,
"grad_norm": 3.6945154666900635,
"learning_rate": 3.955307425393224e-06,
"logits/chosen": -2.860947370529175,
"logits/rejected": -2.817092180252075,
"logps/chosen": -368.18890380859375,
"logps/rejected": -373.40155029296875,
"loss": 0.5221,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3717382550239563,
"rewards/margins": 0.5002824664115906,
"rewards/rejected": -0.8720208406448364,
"step": 465
},
{
"epoch": 0.376,
"grad_norm": 5.232949256896973,
"learning_rate": 3.92678391921108e-06,
"logits/chosen": -2.7127513885498047,
"logits/rejected": -2.689349889755249,
"logps/chosen": -373.66119384765625,
"logps/rejected": -391.1875305175781,
"loss": 0.5644,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.5153363347053528,
"rewards/margins": 0.4165223240852356,
"rewards/rejected": -0.9318585395812988,
"step": 470
},
{
"epoch": 0.38,
"grad_norm": 5.3401899337768555,
"learning_rate": 3.897982258676867e-06,
"logits/chosen": -2.75883150100708,
"logits/rejected": -2.7517054080963135,
"logps/chosen": -315.7061462402344,
"logps/rejected": -348.54168701171875,
"loss": 0.5835,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4366493821144104,
"rewards/margins": 0.3299176096916199,
"rewards/rejected": -0.7665671110153198,
"step": 475
},
{
"epoch": 0.384,
"grad_norm": 6.390676498413086,
"learning_rate": 3.868908058731376e-06,
"logits/chosen": -2.77325701713562,
"logits/rejected": -2.7217469215393066,
"logps/chosen": -355.1070251464844,
"logps/rejected": -332.23199462890625,
"loss": 0.6723,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.558080792427063,
"rewards/margins": 0.1657411754131317,
"rewards/rejected": -0.7238219380378723,
"step": 480
},
{
"epoch": 0.388,
"grad_norm": 8.37760066986084,
"learning_rate": 3.839566987447492e-06,
"logits/chosen": -2.7543792724609375,
"logits/rejected": -2.7266902923583984,
"logps/chosen": -345.9546813964844,
"logps/rejected": -343.42669677734375,
"loss": 0.5789,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.584365963935852,
"rewards/margins": 0.35950571298599243,
"rewards/rejected": -0.9438716173171997,
"step": 485
},
{
"epoch": 0.392,
"grad_norm": 4.165892124176025,
"learning_rate": 3.8099647649251984e-06,
"logits/chosen": -2.798603057861328,
"logits/rejected": -2.746656894683838,
"logps/chosen": -339.4115905761719,
"logps/rejected": -342.5590515136719,
"loss": 0.6252,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5836890935897827,
"rewards/margins": 0.2635918855667114,
"rewards/rejected": -0.8472809791564941,
"step": 490
},
{
"epoch": 0.396,
"grad_norm": 5.796815872192383,
"learning_rate": 3.780107162176429e-06,
"logits/chosen": -2.771759510040283,
"logits/rejected": -2.7516114711761475,
"logps/chosen": -340.46343994140625,
"logps/rejected": -309.8275451660156,
"loss": 0.5526,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5660332441329956,
"rewards/margins": 0.4165772497653961,
"rewards/rejected": -0.9826105237007141,
"step": 495
},
{
"epoch": 0.4,
"grad_norm": 10.051532745361328,
"learning_rate": 3.7500000000000005e-06,
"logits/chosen": -2.7455780506134033,
"logits/rejected": -2.723891496658325,
"logps/chosen": -364.03936767578125,
"logps/rejected": -377.3976135253906,
"loss": 0.5622,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5170733332633972,
"rewards/margins": 0.44108885526657104,
"rewards/rejected": -0.9581623077392578,
"step": 500
},
{
"epoch": 0.4,
"eval_logits/chosen": -2.732027053833008,
"eval_logits/rejected": -2.691253662109375,
"eval_logps/chosen": -338.8871765136719,
"eval_logps/rejected": -345.97265625,
"eval_loss": 0.5688419342041016,
"eval_rewards/accuracies": 0.7142857313156128,
"eval_rewards/chosen": -0.5564908385276794,
"eval_rewards/margins": 0.42293980717658997,
"eval_rewards/rejected": -0.9794306755065918,
"eval_runtime": 166.7904,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 0.378,
"step": 500
},
{
"epoch": 0.404,
"grad_norm": 7.153428554534912,
"learning_rate": 3.7196491478468322e-06,
"logits/chosen": -2.662764549255371,
"logits/rejected": -2.6799817085266113,
"logps/chosen": -346.8814392089844,
"logps/rejected": -386.2395935058594,
"loss": 0.5618,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6629476547241211,
"rewards/margins": 0.44316092133522034,
"rewards/rejected": -1.106108546257019,
"step": 505
},
{
"epoch": 0.408,
"grad_norm": 7.824460506439209,
"learning_rate": 3.689060522675689e-06,
"logits/chosen": -2.739622116088867,
"logits/rejected": -2.7229952812194824,
"logps/chosen": -341.31610107421875,
"logps/rejected": -361.16583251953125,
"loss": 0.6013,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5365445613861084,
"rewards/margins": 0.3803355097770691,
"rewards/rejected": -0.9168800115585327,
"step": 510
},
{
"epoch": 0.412,
"grad_norm": 6.43038272857666,
"learning_rate": 3.658240087799655e-06,
"logits/chosen": -2.6712212562561035,
"logits/rejected": -2.6851272583007812,
"logps/chosen": -314.61614990234375,
"logps/rejected": -371.5428466796875,
"loss": 0.5377,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5410436391830444,
"rewards/margins": 0.517443835735321,
"rewards/rejected": -1.0584874153137207,
"step": 515
},
{
"epoch": 0.416,
"grad_norm": 8.127975463867188,
"learning_rate": 3.627193851723577e-06,
"logits/chosen": -2.717458724975586,
"logits/rejected": -2.690972328186035,
"logps/chosen": -355.3538818359375,
"logps/rejected": -379.71417236328125,
"loss": 0.6306,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7785177230834961,
"rewards/margins": 0.36465466022491455,
"rewards/rejected": -1.143172264099121,
"step": 520
},
{
"epoch": 0.42,
"grad_norm": 8.312115669250488,
"learning_rate": 3.595927866972694e-06,
"logits/chosen": -2.6868720054626465,
"logits/rejected": -2.6861917972564697,
"logps/chosen": -292.2730407714844,
"logps/rejected": -340.75592041015625,
"loss": 0.5573,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6696837544441223,
"rewards/margins": 0.5235880613327026,
"rewards/rejected": -1.1932718753814697,
"step": 525
},
{
"epoch": 0.424,
"grad_norm": 9.0608491897583,
"learning_rate": 3.564448228912682e-06,
"logits/chosen": -2.6300408840179443,
"logits/rejected": -2.623781204223633,
"logps/chosen": -381.3360900878906,
"logps/rejected": -383.34161376953125,
"loss": 0.6091,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7431681752204895,
"rewards/margins": 0.39449232816696167,
"rewards/rejected": -1.1376605033874512,
"step": 530
},
{
"epoch": 0.428,
"grad_norm": 15.011635780334473,
"learning_rate": 3.532761074561355e-06,
"logits/chosen": -2.6198954582214355,
"logits/rejected": -2.571420192718506,
"logps/chosen": -389.4923095703125,
"logps/rejected": -430.41204833984375,
"loss": 0.5782,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7040005922317505,
"rewards/margins": 0.5225512385368347,
"rewards/rejected": -1.2265517711639404,
"step": 535
},
{
"epoch": 0.432,
"grad_norm": 6.0668768882751465,
"learning_rate": 3.5008725813922383e-06,
"logits/chosen": -2.7458198070526123,
"logits/rejected": -2.6754350662231445,
"logps/chosen": -353.69500732421875,
"logps/rejected": -393.26953125,
"loss": 0.529,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5991845726966858,
"rewards/margins": 0.5754216313362122,
"rewards/rejected": -1.1746060848236084,
"step": 540
},
{
"epoch": 0.436,
"grad_norm": 9.939155578613281,
"learning_rate": 3.4687889661302577e-06,
"logits/chosen": -2.648597240447998,
"logits/rejected": -2.6514642238616943,
"logps/chosen": -319.5657653808594,
"logps/rejected": -362.01885986328125,
"loss": 0.5329,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.6453290581703186,
"rewards/margins": 0.6017133593559265,
"rewards/rejected": -1.2470424175262451,
"step": 545
},
{
"epoch": 0.44,
"grad_norm": 7.438101768493652,
"learning_rate": 3.436516483539781e-06,
"logits/chosen": -2.679978847503662,
"logits/rejected": -2.667757034301758,
"logps/chosen": -354.5327453613281,
"logps/rejected": -376.2767333984375,
"loss": 0.6251,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8374980688095093,
"rewards/margins": 0.40907567739486694,
"rewards/rejected": -1.246573805809021,
"step": 550
},
{
"epoch": 0.444,
"grad_norm": 6.391237735748291,
"learning_rate": 3.4040614252052305e-06,
"logits/chosen": -2.672637939453125,
"logits/rejected": -2.6665103435516357,
"logps/chosen": -380.81195068359375,
"logps/rejected": -405.7100524902344,
"loss": 0.6038,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.939714252948761,
"rewards/margins": 0.42443543672561646,
"rewards/rejected": -1.364149808883667,
"step": 555
},
{
"epoch": 0.448,
"grad_norm": 9.154504776000977,
"learning_rate": 3.3714301183045382e-06,
"logits/chosen": -2.6303160190582275,
"logits/rejected": -2.572775363922119,
"logps/chosen": -331.26007080078125,
"logps/rejected": -356.6570129394531,
"loss": 0.6228,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9475865364074707,
"rewards/margins": 0.3488699793815613,
"rewards/rejected": -1.2964565753936768,
"step": 560
},
{
"epoch": 0.452,
"grad_norm": 5.733785629272461,
"learning_rate": 3.338628924375638e-06,
"logits/chosen": -2.7566773891448975,
"logits/rejected": -2.7174875736236572,
"logps/chosen": -315.8184509277344,
"logps/rejected": -379.3972473144531,
"loss": 0.5181,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7764695286750793,
"rewards/margins": 0.5681883096694946,
"rewards/rejected": -1.3446576595306396,
"step": 565
},
{
"epoch": 0.456,
"grad_norm": 5.354190349578857,
"learning_rate": 3.3056642380762783e-06,
"logits/chosen": -2.715209484100342,
"logits/rejected": -2.703580379486084,
"logps/chosen": -289.91387939453125,
"logps/rejected": -322.75933837890625,
"loss": 0.5887,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.5646545886993408,
"rewards/margins": 0.43763160705566406,
"rewards/rejected": -1.0022861957550049,
"step": 570
},
{
"epoch": 0.46,
"grad_norm": 6.91750955581665,
"learning_rate": 3.272542485937369e-06,
"logits/chosen": -2.6926093101501465,
"logits/rejected": -2.6333212852478027,
"logps/chosen": -300.4584045410156,
"logps/rejected": -308.9971008300781,
"loss": 0.5684,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.579850435256958,
"rewards/margins": 0.424421489238739,
"rewards/rejected": -1.0042719841003418,
"step": 575
},
{
"epoch": 0.464,
"grad_norm": 5.240443706512451,
"learning_rate": 3.2392701251101172e-06,
"logits/chosen": -2.745445966720581,
"logits/rejected": -2.6982531547546387,
"logps/chosen": -345.88031005859375,
"logps/rejected": -365.571533203125,
"loss": 0.5121,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.45542454719543457,
"rewards/margins": 0.6121553778648376,
"rewards/rejected": -1.067579984664917,
"step": 580
},
{
"epoch": 0.468,
"grad_norm": 5.993870258331299,
"learning_rate": 3.205853642107192e-06,
"logits/chosen": -2.679216146469116,
"logits/rejected": -2.660553216934204,
"logps/chosen": -300.2437744140625,
"logps/rejected": -327.10833740234375,
"loss": 0.6046,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5670121312141418,
"rewards/margins": 0.38394877314567566,
"rewards/rejected": -0.9509609341621399,
"step": 585
},
{
"epoch": 0.472,
"grad_norm": 9.442901611328125,
"learning_rate": 3.1722995515381644e-06,
"logits/chosen": -2.657310962677002,
"logits/rejected": -2.649341344833374,
"logps/chosen": -328.50543212890625,
"logps/rejected": -353.29852294921875,
"loss": 0.5305,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.47631463408470154,
"rewards/margins": 0.5425296425819397,
"rewards/rejected": -1.0188442468643188,
"step": 590
},
{
"epoch": 0.476,
"grad_norm": 6.075891971588135,
"learning_rate": 3.1386143948394764e-06,
"logits/chosen": -2.67082142829895,
"logits/rejected": -2.663555145263672,
"logps/chosen": -311.79595947265625,
"logps/rejected": -385.62750244140625,
"loss": 0.545,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5932539701461792,
"rewards/margins": 0.5186047554016113,
"rewards/rejected": -1.11185884475708,
"step": 595
},
{
"epoch": 0.48,
"grad_norm": 8.881017684936523,
"learning_rate": 3.1048047389991693e-06,
"logits/chosen": -2.6688761711120605,
"logits/rejected": -2.598611354827881,
"logps/chosen": -374.6273193359375,
"logps/rejected": -324.2472839355469,
"loss": 0.5826,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5414284467697144,
"rewards/margins": 0.4649595320224762,
"rewards/rejected": -1.0063880681991577,
"step": 600
},
{
"epoch": 0.48,
"eval_logits/chosen": -2.690697431564331,
"eval_logits/rejected": -2.6522767543792725,
"eval_logps/chosen": -337.7991638183594,
"eval_logps/rejected": -359.91156005859375,
"eval_loss": 0.5457041263580322,
"eval_rewards/accuracies": 0.7242063283920288,
"eval_rewards/chosen": -0.5456109642982483,
"eval_rewards/margins": 0.5732083916664124,
"eval_rewards/rejected": -1.1188193559646606,
"eval_runtime": 166.7663,
"eval_samples_per_second": 2.998,
"eval_steps_per_second": 0.378,
"step": 600
},
{
"epoch": 0.484,
"grad_norm": 8.860865592956543,
"learning_rate": 3.0708771752766397e-06,
"logits/chosen": -2.714979648590088,
"logits/rejected": -2.67262601852417,
"logps/chosen": -379.689697265625,
"logps/rejected": -389.07537841796875,
"loss": 0.5436,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.500022292137146,
"rewards/margins": 0.5223508477210999,
"rewards/rejected": -1.0223733186721802,
"step": 605
},
{
"epoch": 0.488,
"grad_norm": 7.4055280685424805,
"learning_rate": 3.0368383179176584e-06,
"logits/chosen": -2.648967981338501,
"logits/rejected": -2.5849640369415283,
"logps/chosen": -349.82244873046875,
"logps/rejected": -420.50494384765625,
"loss": 0.5099,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5480636954307556,
"rewards/margins": 0.6970816850662231,
"rewards/rejected": -1.2451454401016235,
"step": 610
},
{
"epoch": 0.492,
"grad_norm": 8.267292976379395,
"learning_rate": 3.002694802864912e-06,
"logits/chosen": -2.6305606365203857,
"logits/rejected": -2.6106619834899902,
"logps/chosen": -341.2841491699219,
"logps/rejected": -381.73297119140625,
"loss": 0.5663,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.48357224464416504,
"rewards/margins": 0.542944073677063,
"rewards/rejected": -1.026516318321228,
"step": 615
},
{
"epoch": 0.496,
"grad_norm": 6.144512176513672,
"learning_rate": 2.9684532864643123e-06,
"logits/chosen": -2.6233174800872803,
"logits/rejected": -2.6076509952545166,
"logps/chosen": -316.0887756347656,
"logps/rejected": -354.6174621582031,
"loss": 0.5047,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4337923526763916,
"rewards/margins": 0.6806012392044067,
"rewards/rejected": -1.1143935918807983,
"step": 620
},
{
"epoch": 0.5,
"grad_norm": 12.604687690734863,
"learning_rate": 2.9341204441673267e-06,
"logits/chosen": -2.668950319290161,
"logits/rejected": -2.6760809421539307,
"logps/chosen": -368.0353698730469,
"logps/rejected": -343.40869140625,
"loss": 0.658,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.606735110282898,
"rewards/margins": 0.32046255469322205,
"rewards/rejected": -0.9271975755691528,
"step": 625
},
{
"epoch": 0.504,
"grad_norm": 9.32141399383545,
"learning_rate": 2.8997029692295875e-06,
"logits/chosen": -2.6581954956054688,
"logits/rejected": -2.6276352405548096,
"logps/chosen": -291.1399841308594,
"logps/rejected": -327.74859619140625,
"loss": 0.6077,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5456727743148804,
"rewards/margins": 0.44013065099716187,
"rewards/rejected": -0.9858034253120422,
"step": 630
},
{
"epoch": 0.508,
"grad_norm": 8.439581871032715,
"learning_rate": 2.8652075714060296e-06,
"logits/chosen": -2.673593044281006,
"logits/rejected": -2.6902694702148438,
"logps/chosen": -301.8299560546875,
"logps/rejected": -353.7574157714844,
"loss": 0.5776,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.5664817094802856,
"rewards/margins": 0.4610595107078552,
"rewards/rejected": -1.027541160583496,
"step": 635
},
{
"epoch": 0.512,
"grad_norm": 6.46388053894043,
"learning_rate": 2.8306409756428067e-06,
"logits/chosen": -2.6250970363616943,
"logits/rejected": -2.595864772796631,
"logps/chosen": -283.65887451171875,
"logps/rejected": -289.2749938964844,
"loss": 0.5845,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.4361953139305115,
"rewards/margins": 0.3893643915653229,
"rewards/rejected": -0.8255597949028015,
"step": 640
},
{
"epoch": 0.516,
"grad_norm": 7.4935712814331055,
"learning_rate": 2.7960099207662535e-06,
"logits/chosen": -2.638918161392212,
"logits/rejected": -2.606156826019287,
"logps/chosen": -298.98980712890625,
"logps/rejected": -329.75079345703125,
"loss": 0.5492,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4644347131252289,
"rewards/margins": 0.5126577615737915,
"rewards/rejected": -0.9770925641059875,
"step": 645
},
{
"epoch": 0.52,
"grad_norm": 5.561440467834473,
"learning_rate": 2.761321158169134e-06,
"logits/chosen": -2.6843574047088623,
"logits/rejected": -2.687722682952881,
"logps/chosen": -330.052001953125,
"logps/rejected": -330.7695007324219,
"loss": 0.5886,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.48628172278404236,
"rewards/margins": 0.3754786252975464,
"rewards/rejected": -0.8617603182792664,
"step": 650
},
{
"epoch": 0.524,
"grad_norm": 5.8051862716674805,
"learning_rate": 2.726581450494451e-06,
"logits/chosen": -2.6416282653808594,
"logits/rejected": -2.6334142684936523,
"logps/chosen": -323.7715148925781,
"logps/rejected": -333.45831298828125,
"loss": 0.5329,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2815348505973816,
"rewards/margins": 0.5408404469490051,
"rewards/rejected": -0.8223752975463867,
"step": 655
},
{
"epoch": 0.528,
"grad_norm": 4.751335144042969,
"learning_rate": 2.6917975703170466e-06,
"logits/chosen": -2.691120147705078,
"logits/rejected": -2.6784567832946777,
"logps/chosen": -319.2125244140625,
"logps/rejected": -377.9074401855469,
"loss": 0.4832,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.34588542580604553,
"rewards/margins": 0.7417780160903931,
"rewards/rejected": -1.0876634120941162,
"step": 660
},
{
"epoch": 0.532,
"grad_norm": 4.525731086730957,
"learning_rate": 2.6569762988232838e-06,
"logits/chosen": -2.6197690963745117,
"logits/rejected": -2.6266191005706787,
"logps/chosen": -293.38787841796875,
"logps/rejected": -343.3548583984375,
"loss": 0.5703,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.4132465720176697,
"rewards/margins": 0.46936559677124023,
"rewards/rejected": -0.8826121091842651,
"step": 665
},
{
"epoch": 0.536,
"grad_norm": 13.704550743103027,
"learning_rate": 2.6221244244890336e-06,
"logits/chosen": -2.677114963531494,
"logits/rejected": -2.5927250385284424,
"logps/chosen": -350.24310302734375,
"logps/rejected": -378.822021484375,
"loss": 0.5502,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.596604585647583,
"rewards/margins": 0.5345416069030762,
"rewards/rejected": -1.1311461925506592,
"step": 670
},
{
"epoch": 0.54,
"grad_norm": 13.029091835021973,
"learning_rate": 2.587248741756253e-06,
"logits/chosen": -2.7029571533203125,
"logits/rejected": -2.6872310638427734,
"logps/chosen": -327.059326171875,
"logps/rejected": -376.5028991699219,
"loss": 0.5659,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.4712875485420227,
"rewards/margins": 0.5444241762161255,
"rewards/rejected": -1.0157115459442139,
"step": 675
},
{
"epoch": 0.544,
"grad_norm": 5.572321891784668,
"learning_rate": 2.5523560497083927e-06,
"logits/chosen": -2.7125682830810547,
"logits/rejected": -2.6776933670043945,
"logps/chosen": -343.49658203125,
"logps/rejected": -386.14215087890625,
"loss": 0.5567,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6705636978149414,
"rewards/margins": 0.5794155597686768,
"rewards/rejected": -1.2499791383743286,
"step": 680
},
{
"epoch": 0.548,
"grad_norm": 11.680253028869629,
"learning_rate": 2.517453150744904e-06,
"logits/chosen": -2.708914041519165,
"logits/rejected": -2.659080982208252,
"logps/chosen": -388.8927307128906,
"logps/rejected": -396.7651062011719,
"loss": 0.6039,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7607260942459106,
"rewards/margins": 0.5218645334243774,
"rewards/rejected": -1.2825905084609985,
"step": 685
},
{
"epoch": 0.552,
"grad_norm": 7.640016555786133,
"learning_rate": 2.482546849255096e-06,
"logits/chosen": -2.6931045055389404,
"logits/rejected": -2.6509275436401367,
"logps/chosen": -371.285400390625,
"logps/rejected": -441.174560546875,
"loss": 0.5044,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.866797924041748,
"rewards/margins": 0.8486925363540649,
"rewards/rejected": -1.7154903411865234,
"step": 690
},
{
"epoch": 0.556,
"grad_norm": 6.240846157073975,
"learning_rate": 2.447643950291608e-06,
"logits/chosen": -2.527269124984741,
"logits/rejected": -2.4740500450134277,
"logps/chosen": -335.0668640136719,
"logps/rejected": -334.4136657714844,
"loss": 0.5401,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7494308352470398,
"rewards/margins": 0.6046732068061829,
"rewards/rejected": -1.3541040420532227,
"step": 695
},
{
"epoch": 0.56,
"grad_norm": 10.70870590209961,
"learning_rate": 2.4127512582437486e-06,
"logits/chosen": -2.6449031829833984,
"logits/rejected": -2.6302623748779297,
"logps/chosen": -367.7524719238281,
"logps/rejected": -402.79736328125,
"loss": 0.5313,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.799649178981781,
"rewards/margins": 0.6352438926696777,
"rewards/rejected": -1.434893012046814,
"step": 700
},
{
"epoch": 0.56,
"eval_logits/chosen": -2.658555030822754,
"eval_logits/rejected": -2.617255449295044,
"eval_logps/chosen": -354.6570739746094,
"eval_logps/rejected": -381.07342529296875,
"eval_loss": 0.5387491583824158,
"eval_rewards/accuracies": 0.7242063283920288,
"eval_rewards/chosen": -0.7141901850700378,
"eval_rewards/margins": 0.616247832775116,
"eval_rewards/rejected": -1.3304380178451538,
"eval_runtime": 165.7154,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 0.38,
"step": 700
},
{
"epoch": 0.564,
"grad_norm": 16.324960708618164,
"learning_rate": 2.377875575510967e-06,
"logits/chosen": -2.5862739086151123,
"logits/rejected": -2.5246312618255615,
"logps/chosen": -361.58331298828125,
"logps/rejected": -368.5226745605469,
"loss": 0.5767,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8630898594856262,
"rewards/margins": 0.5553407669067383,
"rewards/rejected": -1.4184306859970093,
"step": 705
},
{
"epoch": 0.568,
"grad_norm": 9.928940773010254,
"learning_rate": 2.3430237011767166e-06,
"logits/chosen": -2.6898465156555176,
"logits/rejected": -2.6531174182891846,
"logps/chosen": -348.86614990234375,
"logps/rejected": -376.40582275390625,
"loss": 0.5582,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7559942007064819,
"rewards/margins": 0.5151509642601013,
"rewards/rejected": -1.2711451053619385,
"step": 710
},
{
"epoch": 0.572,
"grad_norm": 8.1253023147583,
"learning_rate": 2.3082024296829538e-06,
"logits/chosen": -2.6176774501800537,
"logits/rejected": -2.579152822494507,
"logps/chosen": -302.24481201171875,
"logps/rejected": -392.26226806640625,
"loss": 0.4689,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6992444396018982,
"rewards/margins": 0.8311125040054321,
"rewards/rejected": -1.530356764793396,
"step": 715
},
{
"epoch": 0.576,
"grad_norm": 13.087418556213379,
"learning_rate": 2.2734185495055503e-06,
"logits/chosen": -2.67484974861145,
"logits/rejected": -2.5980706214904785,
"logps/chosen": -360.56085205078125,
"logps/rejected": -359.30743408203125,
"loss": 0.5599,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7009941935539246,
"rewards/margins": 0.48396244645118713,
"rewards/rejected": -1.184956669807434,
"step": 720
},
{
"epoch": 0.58,
"grad_norm": 8.680456161499023,
"learning_rate": 2.238678841830867e-06,
"logits/chosen": -2.6140735149383545,
"logits/rejected": -2.5830371379852295,
"logps/chosen": -364.34942626953125,
"logps/rejected": -394.05047607421875,
"loss": 0.5686,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6779341697692871,
"rewards/margins": 0.5314933657646179,
"rewards/rejected": -1.2094275951385498,
"step": 725
},
{
"epoch": 0.584,
"grad_norm": 10.017196655273438,
"learning_rate": 2.2039900792337477e-06,
"logits/chosen": -2.644421339035034,
"logits/rejected": -2.6203815937042236,
"logps/chosen": -375.94244384765625,
"logps/rejected": -401.27056884765625,
"loss": 0.5818,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8054073452949524,
"rewards/margins": 0.5798496007919312,
"rewards/rejected": -1.3852570056915283,
"step": 730
},
{
"epoch": 0.588,
"grad_norm": 6.945478439331055,
"learning_rate": 2.1693590243571937e-06,
"logits/chosen": -2.6634681224823,
"logits/rejected": -2.6050820350646973,
"logps/chosen": -348.97540283203125,
"logps/rejected": -389.2197265625,
"loss": 0.5502,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.855374813079834,
"rewards/margins": 0.6680639982223511,
"rewards/rejected": -1.523438811302185,
"step": 735
},
{
"epoch": 0.592,
"grad_norm": 10.911724090576172,
"learning_rate": 2.134792428593971e-06,
"logits/chosen": -2.5928680896759033,
"logits/rejected": -2.563474416732788,
"logps/chosen": -328.8409729003906,
"logps/rejected": -379.95599365234375,
"loss": 0.5491,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.9474200010299683,
"rewards/margins": 0.49488845467567444,
"rewards/rejected": -1.4423085451126099,
"step": 740
},
{
"epoch": 0.596,
"grad_norm": 9.961637496948242,
"learning_rate": 2.1002970307704134e-06,
"logits/chosen": -2.7088496685028076,
"logits/rejected": -2.646193027496338,
"logps/chosen": -421.77178955078125,
"logps/rejected": -460.4210510253906,
"loss": 0.5682,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.875130295753479,
"rewards/margins": 0.6770623922348022,
"rewards/rejected": -1.5521926879882812,
"step": 745
},
{
"epoch": 0.6,
"grad_norm": 6.698659420013428,
"learning_rate": 2.0658795558326745e-06,
"logits/chosen": -2.6276183128356934,
"logits/rejected": -2.6407034397125244,
"logps/chosen": -371.37225341796875,
"logps/rejected": -429.2191467285156,
"loss": 0.5026,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8518401384353638,
"rewards/margins": 0.7702382206916809,
"rewards/rejected": -1.622078537940979,
"step": 750
},
{
"epoch": 0.604,
"grad_norm": 9.7139310836792,
"learning_rate": 2.031546713535688e-06,
"logits/chosen": -2.6299374103546143,
"logits/rejected": -2.572783946990967,
"logps/chosen": -366.63360595703125,
"logps/rejected": -413.1153259277344,
"loss": 0.5502,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7979942560195923,
"rewards/margins": 0.6386594772338867,
"rewards/rejected": -1.4366536140441895,
"step": 755
},
{
"epoch": 0.608,
"grad_norm": 12.361163139343262,
"learning_rate": 1.997305197135089e-06,
"logits/chosen": -2.554405689239502,
"logits/rejected": -2.564911365509033,
"logps/chosen": -300.50689697265625,
"logps/rejected": -360.9603271484375,
"loss": 0.5553,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8539615869522095,
"rewards/margins": 0.5313900709152222,
"rewards/rejected": -1.3853518962860107,
"step": 760
},
{
"epoch": 0.612,
"grad_norm": 8.68078899383545,
"learning_rate": 1.963161682082342e-06,
"logits/chosen": -2.5307528972625732,
"logits/rejected": -2.5801663398742676,
"logps/chosen": -357.2311706542969,
"logps/rejected": -383.4229431152344,
"loss": 0.567,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7676454186439514,
"rewards/margins": 0.5505531430244446,
"rewards/rejected": -1.3181986808776855,
"step": 765
},
{
"epoch": 0.616,
"grad_norm": 4.522902011871338,
"learning_rate": 1.9291228247233607e-06,
"logits/chosen": -2.5538437366485596,
"logits/rejected": -2.5088653564453125,
"logps/chosen": -338.6076965332031,
"logps/rejected": -382.578369140625,
"loss": 0.5448,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5785464644432068,
"rewards/margins": 0.5552471280097961,
"rewards/rejected": -1.133793592453003,
"step": 770
},
{
"epoch": 0.62,
"grad_norm": 8.719709396362305,
"learning_rate": 1.895195261000831e-06,
"logits/chosen": -2.619828462600708,
"logits/rejected": -2.574763298034668,
"logps/chosen": -361.0684814453125,
"logps/rejected": -423.12408447265625,
"loss": 0.5232,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5979502201080322,
"rewards/margins": 0.6523123383522034,
"rewards/rejected": -1.2502626180648804,
"step": 775
},
{
"epoch": 0.624,
"grad_norm": 7.265892505645752,
"learning_rate": 1.8613856051605242e-06,
"logits/chosen": -2.4674975872039795,
"logits/rejected": -2.4988579750061035,
"logps/chosen": -303.31695556640625,
"logps/rejected": -350.3982849121094,
"loss": 0.5336,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.65043705701828,
"rewards/margins": 0.534541666507721,
"rewards/rejected": -1.184978723526001,
"step": 780
},
{
"epoch": 0.628,
"grad_norm": 5.215295314788818,
"learning_rate": 1.827700448461836e-06,
"logits/chosen": -2.6682486534118652,
"logits/rejected": -2.595508098602295,
"logps/chosen": -380.72271728515625,
"logps/rejected": -402.666748046875,
"loss": 0.5612,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8000537157058716,
"rewards/margins": 0.5293024182319641,
"rewards/rejected": -1.3293559551239014,
"step": 785
},
{
"epoch": 0.632,
"grad_norm": 9.841978073120117,
"learning_rate": 1.7941463578928088e-06,
"logits/chosen": -2.592263698577881,
"logits/rejected": -2.559999942779541,
"logps/chosen": -422.0464782714844,
"logps/rejected": -429.20831298828125,
"loss": 0.58,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7715168595314026,
"rewards/margins": 0.5217434167861938,
"rewards/rejected": -1.2932603359222412,
"step": 790
},
{
"epoch": 0.636,
"grad_norm": 7.524374485015869,
"learning_rate": 1.7607298748898844e-06,
"logits/chosen": -2.6286463737487793,
"logits/rejected": -2.6194465160369873,
"logps/chosen": -354.8101501464844,
"logps/rejected": -398.53948974609375,
"loss": 0.588,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7351819276809692,
"rewards/margins": 0.5240973830223083,
"rewards/rejected": -1.2592793703079224,
"step": 795
},
{
"epoch": 0.64,
"grad_norm": 7.135324954986572,
"learning_rate": 1.7274575140626318e-06,
"logits/chosen": -2.50719952583313,
"logits/rejected": -2.4550704956054688,
"logps/chosen": -336.1716003417969,
"logps/rejected": -404.1689147949219,
"loss": 0.5332,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.632702648639679,
"rewards/margins": 0.5536705255508423,
"rewards/rejected": -1.1863731145858765,
"step": 800
},
{
"epoch": 0.64,
"eval_logits/chosen": -2.6166913509368896,
"eval_logits/rejected": -2.5759570598602295,
"eval_logps/chosen": -355.7965393066406,
"eval_logps/rejected": -381.5441589355469,
"eval_loss": 0.5385683178901672,
"eval_rewards/accuracies": 0.7182539701461792,
"eval_rewards/chosen": -0.7255847454071045,
"eval_rewards/margins": 0.6095607876777649,
"eval_rewards/rejected": -1.3351454734802246,
"eval_runtime": 165.673,
"eval_samples_per_second": 3.018,
"eval_steps_per_second": 0.38,
"step": 800
},
{
"epoch": 0.644,
"grad_norm": 6.823562145233154,
"learning_rate": 1.6943357619237227e-06,
"logits/chosen": -2.5743985176086426,
"logits/rejected": -2.563049793243408,
"logps/chosen": -344.70318603515625,
"logps/rejected": -374.0846862792969,
"loss": 0.4913,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7136049866676331,
"rewards/margins": 0.67967289686203,
"rewards/rejected": -1.393277883529663,
"step": 805
},
{
"epoch": 0.648,
"grad_norm": 9.05685806274414,
"learning_rate": 1.661371075624363e-06,
"logits/chosen": -2.6020989418029785,
"logits/rejected": -2.6505770683288574,
"logps/chosen": -347.4692687988281,
"logps/rejected": -473.5855407714844,
"loss": 0.5726,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9543269872665405,
"rewards/margins": 0.5893380641937256,
"rewards/rejected": -1.5436651706695557,
"step": 810
},
{
"epoch": 0.652,
"grad_norm": 10.01281452178955,
"learning_rate": 1.6285698816954626e-06,
"logits/chosen": -2.6235404014587402,
"logits/rejected": -2.5892868041992188,
"logps/chosen": -362.3233947753906,
"logps/rejected": -383.004638671875,
"loss": 0.5152,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6492418646812439,
"rewards/margins": 0.675209641456604,
"rewards/rejected": -1.3244515657424927,
"step": 815
},
{
"epoch": 0.656,
"grad_norm": 10.048011779785156,
"learning_rate": 1.5959385747947697e-06,
"logits/chosen": -2.5589287281036377,
"logits/rejected": -2.503087043762207,
"logps/chosen": -325.8083801269531,
"logps/rejected": -343.68316650390625,
"loss": 0.5628,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9260866045951843,
"rewards/margins": 0.5877935886383057,
"rewards/rejected": -1.5138801336288452,
"step": 820
},
{
"epoch": 0.66,
"grad_norm": 10.148550987243652,
"learning_rate": 1.56348351646022e-06,
"logits/chosen": -2.459164619445801,
"logits/rejected": -2.4178318977355957,
"logps/chosen": -334.05731201171875,
"logps/rejected": -384.0740661621094,
"loss": 0.5496,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9508736729621887,
"rewards/margins": 0.5749450922012329,
"rewards/rejected": -1.5258188247680664,
"step": 825
},
{
"epoch": 0.664,
"grad_norm": 11.33963680267334,
"learning_rate": 1.5312110338697427e-06,
"logits/chosen": -2.5506274700164795,
"logits/rejected": -2.476365566253662,
"logps/chosen": -354.90130615234375,
"logps/rejected": -414.8627014160156,
"loss": 0.5309,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1112221479415894,
"rewards/margins": 0.6936079263687134,
"rewards/rejected": -1.8048301935195923,
"step": 830
},
{
"epoch": 0.668,
"grad_norm": 9.498885154724121,
"learning_rate": 1.4991274186077632e-06,
"logits/chosen": -2.5485012531280518,
"logits/rejected": -2.539670467376709,
"logps/chosen": -386.7198181152344,
"logps/rejected": -442.57421875,
"loss": 0.524,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0074347257614136,
"rewards/margins": 0.6671037077903748,
"rewards/rejected": -1.6745383739471436,
"step": 835
},
{
"epoch": 0.672,
"grad_norm": 10.999540328979492,
"learning_rate": 1.467238925438646e-06,
"logits/chosen": -2.5577821731567383,
"logits/rejected": -2.5113046169281006,
"logps/chosen": -419.2740173339844,
"logps/rejected": -448.5807189941406,
"loss": 0.6084,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9763299226760864,
"rewards/margins": 0.62681645154953,
"rewards/rejected": -1.6031463146209717,
"step": 840
},
{
"epoch": 0.676,
"grad_norm": 8.105618476867676,
"learning_rate": 1.4355517710873184e-06,
"logits/chosen": -2.524392604827881,
"logits/rejected": -2.4944310188293457,
"logps/chosen": -380.297119140625,
"logps/rejected": -396.42138671875,
"loss": 0.5097,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9465241432189941,
"rewards/margins": 0.6743323802947998,
"rewards/rejected": -1.6208562850952148,
"step": 845
},
{
"epoch": 0.68,
"grad_norm": 14.872075080871582,
"learning_rate": 1.4040721330273063e-06,
"logits/chosen": -2.496351957321167,
"logits/rejected": -2.5108532905578613,
"logps/chosen": -367.57183837890625,
"logps/rejected": -420.6947326660156,
"loss": 0.6583,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.039666771888733,
"rewards/margins": 0.4930298328399658,
"rewards/rejected": -1.5326964855194092,
"step": 850
},
{
"epoch": 0.684,
"grad_norm": 9.286355018615723,
"learning_rate": 1.3728061482764238e-06,
"logits/chosen": -2.626911163330078,
"logits/rejected": -2.6237576007843018,
"logps/chosen": -398.8912048339844,
"logps/rejected": -464.3138732910156,
"loss": 0.6356,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.851772129535675,
"rewards/margins": 0.5085344910621643,
"rewards/rejected": -1.3603065013885498,
"step": 855
},
{
"epoch": 0.688,
"grad_norm": 9.629425048828125,
"learning_rate": 1.3417599122003464e-06,
"logits/chosen": -2.6108672618865967,
"logits/rejected": -2.600440502166748,
"logps/chosen": -341.123046875,
"logps/rejected": -377.686767578125,
"loss": 0.6128,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8103886842727661,
"rewards/margins": 0.4344421327114105,
"rewards/rejected": -1.244830846786499,
"step": 860
},
{
"epoch": 0.692,
"grad_norm": 10.558273315429688,
"learning_rate": 1.3109394773243117e-06,
"logits/chosen": -2.5375044345855713,
"logits/rejected": -2.5400888919830322,
"logps/chosen": -382.33154296875,
"logps/rejected": -431.41455078125,
"loss": 0.5468,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8447578549385071,
"rewards/margins": 0.7139667272567749,
"rewards/rejected": -1.5587245225906372,
"step": 865
},
{
"epoch": 0.696,
"grad_norm": 11.613428115844727,
"learning_rate": 1.280350852153168e-06,
"logits/chosen": -2.610506296157837,
"logits/rejected": -2.5400068759918213,
"logps/chosen": -361.366943359375,
"logps/rejected": -373.01904296875,
"loss": 0.5561,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.8189024925231934,
"rewards/margins": 0.5556577444076538,
"rewards/rejected": -1.3745602369308472,
"step": 870
},
{
"epoch": 0.7,
"grad_norm": 11.380918502807617,
"learning_rate": 1.2500000000000007e-06,
"logits/chosen": -2.5413687229156494,
"logits/rejected": -2.5202109813690186,
"logps/chosen": -360.80889892578125,
"logps/rejected": -413.19012451171875,
"loss": 0.5164,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7272705435752869,
"rewards/margins": 0.6798168420791626,
"rewards/rejected": -1.4070874452590942,
"step": 875
},
{
"epoch": 0.704,
"grad_norm": 11.80184555053711,
"learning_rate": 1.2198928378235717e-06,
"logits/chosen": -2.5873050689697266,
"logits/rejected": -2.576911211013794,
"logps/chosen": -299.10498046875,
"logps/rejected": -388.73211669921875,
"loss": 0.5155,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6009725332260132,
"rewards/margins": 0.6888505220413208,
"rewards/rejected": -1.2898229360580444,
"step": 880
},
{
"epoch": 0.708,
"grad_norm": 5.85650634765625,
"learning_rate": 1.1900352350748026e-06,
"logits/chosen": -2.560586929321289,
"logits/rejected": -2.5254247188568115,
"logps/chosen": -374.28692626953125,
"logps/rejected": -407.04827880859375,
"loss": 0.5088,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7127686142921448,
"rewards/margins": 0.7965149879455566,
"rewards/rejected": -1.509283423423767,
"step": 885
},
{
"epoch": 0.712,
"grad_norm": 7.8027448654174805,
"learning_rate": 1.160433012552508e-06,
"logits/chosen": -2.5033535957336426,
"logits/rejected": -2.5112838745117188,
"logps/chosen": -330.83880615234375,
"logps/rejected": -380.98297119140625,
"loss": 0.5379,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7824558019638062,
"rewards/margins": 0.5539994239807129,
"rewards/rejected": -1.3364553451538086,
"step": 890
},
{
"epoch": 0.716,
"grad_norm": 8.75863265991211,
"learning_rate": 1.1310919412686248e-06,
"logits/chosen": -2.5839171409606934,
"logits/rejected": -2.5830111503601074,
"logps/chosen": -370.30780029296875,
"logps/rejected": -396.5429992675781,
"loss": 0.5589,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7587816119194031,
"rewards/margins": 0.5284246802330017,
"rewards/rejected": -1.2872062921524048,
"step": 895
},
{
"epoch": 0.72,
"grad_norm": 8.253792762756348,
"learning_rate": 1.1020177413231334e-06,
"logits/chosen": -2.5888657569885254,
"logits/rejected": -2.5641961097717285,
"logps/chosen": -352.4532775878906,
"logps/rejected": -376.51751708984375,
"loss": 0.5334,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.746880054473877,
"rewards/margins": 0.5956496000289917,
"rewards/rejected": -1.342529535293579,
"step": 900
},
{
"epoch": 0.72,
"eval_logits/chosen": -2.5998997688293457,
"eval_logits/rejected": -2.5573904514312744,
"eval_logps/chosen": -353.8529357910156,
"eval_logps/rejected": -380.3204345703125,
"eval_loss": 0.536827027797699,
"eval_rewards/accuracies": 0.716269850730896,
"eval_rewards/chosen": -0.7061484456062317,
"eval_rewards/margins": 0.616759717464447,
"eval_rewards/rejected": -1.3229081630706787,
"eval_runtime": 165.7628,
"eval_samples_per_second": 3.016,
"eval_steps_per_second": 0.38,
"step": 900
},
{
"epoch": 0.724,
"grad_norm": 7.283039093017578,
"learning_rate": 1.073216080788921e-06,
"logits/chosen": -2.6033217906951904,
"logits/rejected": -2.5845823287963867,
"logps/chosen": -361.44329833984375,
"logps/rejected": -374.8320617675781,
"loss": 0.6075,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.7317181825637817,
"rewards/margins": 0.37635332345962524,
"rewards/rejected": -1.1080714464187622,
"step": 905
},
{
"epoch": 0.728,
"grad_norm": 9.450459480285645,
"learning_rate": 1.0446925746067768e-06,
"logits/chosen": -2.5516154766082764,
"logits/rejected": -2.495788097381592,
"logps/chosen": -316.8692321777344,
"logps/rejected": -324.5566101074219,
"loss": 0.5047,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7124193906784058,
"rewards/margins": 0.6433127522468567,
"rewards/rejected": -1.3557320833206177,
"step": 910
},
{
"epoch": 0.732,
"grad_norm": 11.512716293334961,
"learning_rate": 1.0164527834907468e-06,
"logits/chosen": -2.4677295684814453,
"logits/rejected": -2.4644956588745117,
"logps/chosen": -342.6759948730469,
"logps/rejected": -419.4685974121094,
"loss": 0.4815,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7622194886207581,
"rewards/margins": 0.7687476277351379,
"rewards/rejected": -1.5309669971466064,
"step": 915
},
{
"epoch": 0.736,
"grad_norm": 27.01951789855957,
"learning_rate": 9.88502212844063e-07,
"logits/chosen": -2.5636465549468994,
"logits/rejected": -2.5651931762695312,
"logps/chosen": -345.3995666503906,
"logps/rejected": -412.34979248046875,
"loss": 0.622,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.8238021731376648,
"rewards/margins": 0.42956480383872986,
"rewards/rejected": -1.2533669471740723,
"step": 920
},
{
"epoch": 0.74,
"grad_norm": 13.12364387512207,
"learning_rate": 9.608463116858544e-07,
"logits/chosen": -2.5695652961730957,
"logits/rejected": -2.5348198413848877,
"logps/chosen": -351.7240905761719,
"logps/rejected": -388.83319091796875,
"loss": 0.5433,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.76836758852005,
"rewards/margins": 0.6143354773521423,
"rewards/rejected": -1.3827030658721924,
"step": 925
},
{
"epoch": 0.744,
"grad_norm": 10.578348159790039,
"learning_rate": 9.334904715888496e-07,
"logits/chosen": -2.4992146492004395,
"logits/rejected": -2.501399517059326,
"logps/chosen": -339.5255432128906,
"logps/rejected": -395.8268737792969,
"loss": 0.5339,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.775671660900116,
"rewards/margins": 0.6518876552581787,
"rewards/rejected": -1.4275591373443604,
"step": 930
},
{
"epoch": 0.748,
"grad_norm": 7.748569011688232,
"learning_rate": 9.064400256282757e-07,
"logits/chosen": -2.57441782951355,
"logits/rejected": -2.546863079071045,
"logps/chosen": -355.07818603515625,
"logps/rejected": -380.3440246582031,
"loss": 0.559,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.6962515115737915,
"rewards/margins": 0.5717657208442688,
"rewards/rejected": -1.268017292022705,
"step": 935
},
{
"epoch": 0.752,
"grad_norm": 7.653827667236328,
"learning_rate": 8.797002473421729e-07,
"logits/chosen": -2.544231653213501,
"logits/rejected": -2.553048610687256,
"logps/chosen": -380.5497131347656,
"logps/rejected": -403.52191162109375,
"loss": 0.5081,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5373459458351135,
"rewards/margins": 0.6796460151672363,
"rewards/rejected": -1.216991901397705,
"step": 940
},
{
"epoch": 0.756,
"grad_norm": 14.531281471252441,
"learning_rate": 8.532763497032987e-07,
"logits/chosen": -2.4647645950317383,
"logits/rejected": -2.452423572540283,
"logps/chosen": -368.66497802734375,
"logps/rejected": -440.90313720703125,
"loss": 0.5264,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7574427127838135,
"rewards/margins": 0.6616984605789185,
"rewards/rejected": -1.4191412925720215,
"step": 945
},
{
"epoch": 0.76,
"grad_norm": 6.607179164886475,
"learning_rate": 8.271734841028553e-07,
"logits/chosen": -2.6168630123138428,
"logits/rejected": -2.6241250038146973,
"logps/chosen": -340.37542724609375,
"logps/rejected": -366.9325866699219,
"loss": 0.5419,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7232319712638855,
"rewards/margins": 0.5706599950790405,
"rewards/rejected": -1.2938919067382812,
"step": 950
},
{
"epoch": 0.764,
"grad_norm": 7.8033528327941895,
"learning_rate": 8.013967393462094e-07,
"logits/chosen": -2.4783270359039307,
"logits/rejected": -2.501206874847412,
"logps/chosen": -348.3237609863281,
"logps/rejected": -384.16656494140625,
"loss": 0.5859,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7813480496406555,
"rewards/margins": 0.5686533451080322,
"rewards/rejected": -1.3500014543533325,
"step": 955
},
{
"epoch": 0.768,
"grad_norm": 6.114492893218994,
"learning_rate": 7.759511406608255e-07,
"logits/chosen": -2.5830774307250977,
"logits/rejected": -2.516847848892212,
"logps/chosen": -397.07305908203125,
"logps/rejected": -403.8395080566406,
"loss": 0.4834,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.8136352300643921,
"rewards/margins": 0.8824082612991333,
"rewards/rejected": -1.6960432529449463,
"step": 960
},
{
"epoch": 0.772,
"grad_norm": 12.286111831665039,
"learning_rate": 7.508416487165862e-07,
"logits/chosen": -2.4968883991241455,
"logits/rejected": -2.5091567039489746,
"logps/chosen": -366.52630615234375,
"logps/rejected": -400.1545715332031,
"loss": 0.5807,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7833856344223022,
"rewards/margins": 0.5320286154747009,
"rewards/rejected": -1.3154141902923584,
"step": 965
},
{
"epoch": 0.776,
"grad_norm": 12.27044677734375,
"learning_rate": 7.260731586586983e-07,
"logits/chosen": -2.4706804752349854,
"logits/rejected": -2.4732460975646973,
"logps/chosen": -339.1402587890625,
"logps/rejected": -404.2414245605469,
"loss": 0.6221,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9275779724121094,
"rewards/margins": 0.46196335554122925,
"rewards/rejected": -1.3895412683486938,
"step": 970
},
{
"epoch": 0.78,
"grad_norm": 7.917988300323486,
"learning_rate": 7.016504991533727e-07,
"logits/chosen": -2.593116283416748,
"logits/rejected": -2.565453290939331,
"logps/chosen": -383.8894348144531,
"logps/rejected": -424.5870666503906,
"loss": 0.4774,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6088994145393372,
"rewards/margins": 0.7229386568069458,
"rewards/rejected": -1.3318378925323486,
"step": 975
},
{
"epoch": 0.784,
"grad_norm": 5.051321983337402,
"learning_rate": 6.775784314464717e-07,
"logits/chosen": -2.4984991550445557,
"logits/rejected": -2.5199942588806152,
"logps/chosen": -342.84515380859375,
"logps/rejected": -421.0189514160156,
"loss": 0.4971,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7955012917518616,
"rewards/margins": 0.7193040251731873,
"rewards/rejected": -1.5148054361343384,
"step": 980
},
{
"epoch": 0.788,
"grad_norm": 8.092668533325195,
"learning_rate": 6.538616484352902e-07,
"logits/chosen": -2.5383505821228027,
"logits/rejected": -2.526851177215576,
"logps/chosen": -345.52655029296875,
"logps/rejected": -379.8380432128906,
"loss": 0.5156,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7932868003845215,
"rewards/margins": 0.6958837509155273,
"rewards/rejected": -1.4891705513000488,
"step": 985
},
{
"epoch": 0.792,
"grad_norm": 9.803926467895508,
"learning_rate": 6.305047737536707e-07,
"logits/chosen": -2.509049654006958,
"logits/rejected": -2.463141679763794,
"logps/chosen": -351.3589172363281,
"logps/rejected": -371.07281494140625,
"loss": 0.5485,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8914083242416382,
"rewards/margins": 0.6496592164039612,
"rewards/rejected": -1.5410678386688232,
"step": 990
},
{
"epoch": 0.796,
"grad_norm": 15.167935371398926,
"learning_rate": 6.075123608706093e-07,
"logits/chosen": -2.5473320484161377,
"logits/rejected": -2.5690910816192627,
"logps/chosen": -365.46673583984375,
"logps/rejected": -389.1126403808594,
"loss": 0.5431,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7901977300643921,
"rewards/margins": 0.5910181999206543,
"rewards/rejected": -1.381216049194336,
"step": 995
},
{
"epoch": 0.8,
"grad_norm": 7.769952774047852,
"learning_rate": 5.848888922025553e-07,
"logits/chosen": -2.461652994155884,
"logits/rejected": -2.4495410919189453,
"logps/chosen": -327.51654052734375,
"logps/rejected": -418.86737060546875,
"loss": 0.5837,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9019506573677063,
"rewards/margins": 0.5747453570365906,
"rewards/rejected": -1.4766958951950073,
"step": 1000
},
{
"epoch": 0.8,
"eval_logits/chosen": -2.5706355571746826,
"eval_logits/rejected": -2.527315855026245,
"eval_logps/chosen": -362.7657165527344,
"eval_logps/rejected": -395.8990783691406,
"eval_loss": 0.5301549434661865,
"eval_rewards/accuracies": 0.716269850730896,
"eval_rewards/chosen": -0.795275866985321,
"eval_rewards/margins": 0.6834191083908081,
"eval_rewards/rejected": -1.4786947965621948,
"eval_runtime": 165.7401,
"eval_samples_per_second": 3.017,
"eval_steps_per_second": 0.38,
"step": 1000
},
{
"epoch": 0.804,
"grad_norm": 9.499650001525879,
"learning_rate": 5.626387782395512e-07,
"logits/chosen": -2.570199489593506,
"logits/rejected": -2.5388243198394775,
"logps/chosen": -386.8207702636719,
"logps/rejected": -439.36053466796875,
"loss": 0.5546,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9186019897460938,
"rewards/margins": 0.6662044525146484,
"rewards/rejected": -1.5848064422607422,
"step": 1005
},
{
"epoch": 0.808,
"grad_norm": 8.864973068237305,
"learning_rate": 5.407663566854008e-07,
"logits/chosen": -2.514481544494629,
"logits/rejected": -2.469686269760132,
"logps/chosen": -375.16436767578125,
"logps/rejected": -431.52813720703125,
"loss": 0.5046,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7475723028182983,
"rewards/margins": 0.7609124779701233,
"rewards/rejected": -1.5084848403930664,
"step": 1010
},
{
"epoch": 0.812,
"grad_norm": 17.737668991088867,
"learning_rate": 5.192758916120236e-07,
"logits/chosen": -2.5291812419891357,
"logits/rejected": -2.501344680786133,
"logps/chosen": -376.4272766113281,
"logps/rejected": -419.0375061035156,
"loss": 0.5571,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8887074589729309,
"rewards/margins": 0.6588888168334961,
"rewards/rejected": -1.5475962162017822,
"step": 1015
},
{
"epoch": 0.816,
"grad_norm": 9.168149948120117,
"learning_rate": 4.981715726281666e-07,
"logits/chosen": -2.5210018157958984,
"logits/rejected": -2.518200635910034,
"logps/chosen": -374.60687255859375,
"logps/rejected": -385.31317138671875,
"loss": 0.6639,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9296348690986633,
"rewards/margins": 0.35056614875793457,
"rewards/rejected": -1.2802008390426636,
"step": 1020
},
{
"epoch": 0.82,
"grad_norm": 6.545177936553955,
"learning_rate": 4.774575140626317e-07,
"logits/chosen": -2.553743839263916,
"logits/rejected": -2.564492702484131,
"logps/chosen": -374.25372314453125,
"logps/rejected": -418.3985290527344,
"loss": 0.5131,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7494795918464661,
"rewards/margins": 0.7920123338699341,
"rewards/rejected": -1.5414918661117554,
"step": 1025
},
{
"epoch": 0.824,
"grad_norm": 10.368010520935059,
"learning_rate": 4.5713775416217884e-07,
"logits/chosen": -2.5401394367218018,
"logits/rejected": -2.5111076831817627,
"logps/chosen": -364.0164794921875,
"logps/rejected": -398.11749267578125,
"loss": 0.493,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7943639159202576,
"rewards/margins": 0.7978767156600952,
"rewards/rejected": -1.5922406911849976,
"step": 1030
},
{
"epoch": 0.828,
"grad_norm": 12.131779670715332,
"learning_rate": 4.372162543042624e-07,
"logits/chosen": -2.579563856124878,
"logits/rejected": -2.539201259613037,
"logps/chosen": -327.2681579589844,
"logps/rejected": -347.89068603515625,
"loss": 0.6285,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9573305249214172,
"rewards/margins": 0.44094863533973694,
"rewards/rejected": -1.3982793092727661,
"step": 1035
},
{
"epoch": 0.832,
"grad_norm": 7.402243137359619,
"learning_rate": 4.1769689822475147e-07,
"logits/chosen": -2.533160924911499,
"logits/rejected": -2.514822244644165,
"logps/chosen": -332.4081726074219,
"logps/rejected": -374.7890930175781,
"loss": 0.5243,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7698723673820496,
"rewards/margins": 0.6416595578193665,
"rewards/rejected": -1.4115320444107056,
"step": 1040
},
{
"epoch": 0.836,
"grad_norm": 11.70563793182373,
"learning_rate": 3.9858349126078945e-07,
"logits/chosen": -2.4150428771972656,
"logits/rejected": -2.439276933670044,
"logps/chosen": -360.05657958984375,
"logps/rejected": -423.20184326171875,
"loss": 0.5974,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8564049005508423,
"rewards/margins": 0.5383543372154236,
"rewards/rejected": -1.394759178161621,
"step": 1045
},
{
"epoch": 0.84,
"grad_norm": 13.589889526367188,
"learning_rate": 3.798797596089351e-07,
"logits/chosen": -2.5914146900177,
"logits/rejected": -2.56174898147583,
"logps/chosen": -381.68048095703125,
"logps/rejected": -398.01007080078125,
"loss": 0.5775,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8752404451370239,
"rewards/margins": 0.5522125959396362,
"rewards/rejected": -1.4274529218673706,
"step": 1050
},
{
"epoch": 0.844,
"grad_norm": 8.736641883850098,
"learning_rate": 3.615893495987335e-07,
"logits/chosen": -2.4973983764648438,
"logits/rejected": -2.51640248298645,
"logps/chosen": -355.0185852050781,
"logps/rejected": -448.5818786621094,
"loss": 0.5172,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.703173816204071,
"rewards/margins": 0.7288501858711243,
"rewards/rejected": -1.4320241212844849,
"step": 1055
},
{
"epoch": 0.848,
"grad_norm": 7.283242702484131,
"learning_rate": 3.4371582698185636e-07,
"logits/chosen": -2.510960578918457,
"logits/rejected": -2.5245649814605713,
"logps/chosen": -381.8621520996094,
"logps/rejected": -428.3202209472656,
"loss": 0.4451,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8054243326187134,
"rewards/margins": 0.8550852537155151,
"rewards/rejected": -1.660509467124939,
"step": 1060
},
{
"epoch": 0.852,
"grad_norm": 11.76474380493164,
"learning_rate": 3.262626762369525e-07,
"logits/chosen": -2.5506832599639893,
"logits/rejected": -2.4736270904541016,
"logps/chosen": -330.5300598144531,
"logps/rejected": -350.5076599121094,
"loss": 0.5339,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7755425572395325,
"rewards/margins": 0.6821144819259644,
"rewards/rejected": -1.4576570987701416,
"step": 1065
},
{
"epoch": 0.856,
"grad_norm": 9.793760299682617,
"learning_rate": 3.092332998903416e-07,
"logits/chosen": -2.554365634918213,
"logits/rejected": -2.5592918395996094,
"logps/chosen": -383.59014892578125,
"logps/rejected": -433.6626892089844,
"loss": 0.5678,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.708263635635376,
"rewards/margins": 0.541972279548645,
"rewards/rejected": -1.250235915184021,
"step": 1070
},
{
"epoch": 0.86,
"grad_norm": 8.709606170654297,
"learning_rate": 2.9263101785268253e-07,
"logits/chosen": -2.5445055961608887,
"logits/rejected": -2.5182182788848877,
"logps/chosen": -370.8434753417969,
"logps/rejected": -384.3281555175781,
"loss": 0.6357,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.878252387046814,
"rewards/margins": 0.4590927064418793,
"rewards/rejected": -1.3373451232910156,
"step": 1075
},
{
"epoch": 0.864,
"grad_norm": 7.788934230804443,
"learning_rate": 2.764590667717562e-07,
"logits/chosen": -2.5197861194610596,
"logits/rejected": -2.498582363128662,
"logps/chosen": -348.8467712402344,
"logps/rejected": -429.28936767578125,
"loss": 0.4726,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7739642858505249,
"rewards/margins": 0.8543514013290405,
"rewards/rejected": -1.6283156871795654,
"step": 1080
},
{
"epoch": 0.868,
"grad_norm": 9.934327125549316,
"learning_rate": 2.6072059940146775e-07,
"logits/chosen": -2.4858384132385254,
"logits/rejected": -2.4607391357421875,
"logps/chosen": -357.95025634765625,
"logps/rejected": -370.97479248046875,
"loss": 0.639,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.9774085283279419,
"rewards/margins": 0.36331382393836975,
"rewards/rejected": -1.3407223224639893,
"step": 1085
},
{
"epoch": 0.872,
"grad_norm": 12.652565956115723,
"learning_rate": 2.454186839872158e-07,
"logits/chosen": -2.4667727947235107,
"logits/rejected": -2.428893566131592,
"logps/chosen": -368.6217346191406,
"logps/rejected": -427.3558654785156,
"loss": 0.5759,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8502403497695923,
"rewards/margins": 0.5524962544441223,
"rewards/rejected": -1.4027366638183594,
"step": 1090
},
{
"epoch": 0.876,
"grad_norm": 7.642593860626221,
"learning_rate": 2.3055630366772857e-07,
"logits/chosen": -2.5572714805603027,
"logits/rejected": -2.5431621074676514,
"logps/chosen": -356.75775146484375,
"logps/rejected": -395.92498779296875,
"loss": 0.5148,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7548877000808716,
"rewards/margins": 0.7364410161972046,
"rewards/rejected": -1.4913287162780762,
"step": 1095
},
{
"epoch": 0.88,
"grad_norm": 10.851374626159668,
"learning_rate": 2.1613635589349756e-07,
"logits/chosen": -2.549379825592041,
"logits/rejected": -2.547346830368042,
"logps/chosen": -347.64202880859375,
"logps/rejected": -392.60089111328125,
"loss": 0.5144,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.831588625907898,
"rewards/margins": 0.7135321497917175,
"rewards/rejected": -1.5451208353042603,
"step": 1100
},
{
"epoch": 0.88,
"eval_logits/chosen": -2.55863094329834,
"eval_logits/rejected": -2.516242742538452,
"eval_logps/chosen": -357.33807373046875,
"eval_logps/rejected": -388.2352600097656,
"eval_loss": 0.5326837301254272,
"eval_rewards/accuracies": 0.7123016119003296,
"eval_rewards/chosen": -0.7409996390342712,
"eval_rewards/margins": 0.6610568761825562,
"eval_rewards/rejected": -1.4020566940307617,
"eval_runtime": 166.2233,
"eval_samples_per_second": 3.008,
"eval_steps_per_second": 0.379,
"step": 1100
},
{
"epoch": 0.884,
"grad_norm": 11.941755294799805,
"learning_rate": 2.0216165186191406e-07,
"logits/chosen": -2.5250916481018066,
"logits/rejected": -2.5078232288360596,
"logps/chosen": -360.7425842285156,
"logps/rejected": -419.687744140625,
"loss": 0.5508,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7622045874595642,
"rewards/margins": 0.6720725297927856,
"rewards/rejected": -1.434277057647705,
"step": 1105
},
{
"epoch": 0.888,
"grad_norm": 11.753776550292969,
"learning_rate": 1.8863491596921745e-07,
"logits/chosen": -2.5271763801574707,
"logits/rejected": -2.495025396347046,
"logps/chosen": -394.4120788574219,
"logps/rejected": -420.91552734375,
"loss": 0.6139,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.926856517791748,
"rewards/margins": 0.48281335830688477,
"rewards/rejected": -1.4096698760986328,
"step": 1110
},
{
"epoch": 0.892,
"grad_norm": 12.333569526672363,
"learning_rate": 1.7555878527937164e-07,
"logits/chosen": -2.6087048053741455,
"logits/rejected": -2.5676796436309814,
"logps/chosen": -378.8360595703125,
"logps/rejected": -399.5335693359375,
"loss": 0.4934,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8667081594467163,
"rewards/margins": 0.7776769995689392,
"rewards/rejected": -1.6443853378295898,
"step": 1115
},
{
"epoch": 0.896,
"grad_norm": 9.7278413772583,
"learning_rate": 1.629358090099639e-07,
"logits/chosen": -2.495575428009033,
"logits/rejected": -2.489112615585327,
"logps/chosen": -391.45159912109375,
"logps/rejected": -426.06085205078125,
"loss": 0.5054,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9123435020446777,
"rewards/margins": 0.6857225298881531,
"rewards/rejected": -1.5980660915374756,
"step": 1120
},
{
"epoch": 0.9,
"grad_norm": 9.703481674194336,
"learning_rate": 1.507684480352292e-07,
"logits/chosen": -2.5202584266662598,
"logits/rejected": -2.527817964553833,
"logps/chosen": -364.148681640625,
"logps/rejected": -412.2860412597656,
"loss": 0.529,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9178013801574707,
"rewards/margins": 0.6695634126663208,
"rewards/rejected": -1.5873647928237915,
"step": 1125
},
{
"epoch": 0.904,
"grad_norm": 6.87813138961792,
"learning_rate": 1.3905907440629752e-07,
"logits/chosen": -2.5462465286254883,
"logits/rejected": -2.529540777206421,
"logps/chosen": -367.54986572265625,
"logps/rejected": -395.8585510253906,
"loss": 0.5463,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9065104722976685,
"rewards/margins": 0.6257832050323486,
"rewards/rejected": -1.532293677330017,
"step": 1130
},
{
"epoch": 0.908,
"grad_norm": 9.895462989807129,
"learning_rate": 1.278099708887587e-07,
"logits/chosen": -2.552335262298584,
"logits/rejected": -2.5324137210845947,
"logps/chosen": -345.7575988769531,
"logps/rejected": -455.0641174316406,
"loss": 0.5316,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7876826524734497,
"rewards/margins": 0.7177135348320007,
"rewards/rejected": -1.5053961277008057,
"step": 1135
},
{
"epoch": 0.912,
"grad_norm": 8.755758285522461,
"learning_rate": 1.1702333051763271e-07,
"logits/chosen": -2.5616421699523926,
"logits/rejected": -2.554831027984619,
"logps/chosen": -397.1969909667969,
"logps/rejected": -403.56439208984375,
"loss": 0.5163,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8538335561752319,
"rewards/margins": 0.7461098432540894,
"rewards/rejected": -1.5999435186386108,
"step": 1140
},
{
"epoch": 0.916,
"grad_norm": 12.037848472595215,
"learning_rate": 1.067012561698319e-07,
"logits/chosen": -2.5323455333709717,
"logits/rejected": -2.519660472869873,
"logps/chosen": -379.17340087890625,
"logps/rejected": -407.46014404296875,
"loss": 0.6399,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.895904541015625,
"rewards/margins": 0.44356465339660645,
"rewards/rejected": -1.3394691944122314,
"step": 1145
},
{
"epoch": 0.92,
"grad_norm": 14.188241958618164,
"learning_rate": 9.684576015420277e-08,
"logits/chosen": -2.4839751720428467,
"logits/rejected": -2.4552297592163086,
"logps/chosen": -331.6858215332031,
"logps/rejected": -358.20819091796875,
"loss": 0.5293,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7821733355522156,
"rewards/margins": 0.6462420225143433,
"rewards/rejected": -1.4284155368804932,
"step": 1150
},
{
"epoch": 0.924,
"grad_norm": 21.802221298217773,
"learning_rate": 8.745876381922147e-08,
"logits/chosen": -2.485172748565674,
"logits/rejected": -2.5178401470184326,
"logps/chosen": -343.31103515625,
"logps/rejected": -370.01336669921875,
"loss": 0.577,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8277499079704285,
"rewards/margins": 0.6033510565757751,
"rewards/rejected": -1.4311010837554932,
"step": 1155
},
{
"epoch": 0.928,
"grad_norm": 11.248420715332031,
"learning_rate": 7.854209717842231e-08,
"logits/chosen": -2.5530881881713867,
"logits/rejected": -2.5298221111297607,
"logps/chosen": -387.3213806152344,
"logps/rejected": -385.5107421875,
"loss": 0.6488,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9766770601272583,
"rewards/margins": 0.35460105538368225,
"rewards/rejected": -1.3312779664993286,
"step": 1160
},
{
"epoch": 0.932,
"grad_norm": 6.085402011871338,
"learning_rate": 7.009749855363457e-08,
"logits/chosen": -2.5276684761047363,
"logits/rejected": -2.508495330810547,
"logps/chosen": -339.74969482421875,
"logps/rejected": -404.6656799316406,
"loss": 0.519,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6988147497177124,
"rewards/margins": 0.6438443660736084,
"rewards/rejected": -1.3426591157913208,
"step": 1165
},
{
"epoch": 0.936,
"grad_norm": 15.023430824279785,
"learning_rate": 6.212661423609184e-08,
"logits/chosen": -2.5954625606536865,
"logits/rejected": -2.5354666709899902,
"logps/chosen": -389.9742736816406,
"logps/rejected": -427.60284423828125,
"loss": 0.5631,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9025875926017761,
"rewards/margins": 0.6419355869293213,
"rewards/rejected": -1.5445232391357422,
"step": 1170
},
{
"epoch": 0.94,
"grad_norm": 12.646740913391113,
"learning_rate": 5.463099816548578e-08,
"logits/chosen": -2.5129947662353516,
"logits/rejected": -2.5076282024383545,
"logps/chosen": -355.4842224121094,
"logps/rejected": -443.46014404296875,
"loss": 0.4861,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8773505091667175,
"rewards/margins": 0.769055187702179,
"rewards/rejected": -1.646405816078186,
"step": 1175
},
{
"epoch": 0.944,
"grad_norm": 8.745574951171875,
"learning_rate": 4.761211162702117e-08,
"logits/chosen": -2.5645899772644043,
"logits/rejected": -2.502182722091675,
"logps/chosen": -396.885498046875,
"logps/rejected": -444.1766662597656,
"loss": 0.5327,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.760084331035614,
"rewards/margins": 0.5914161801338196,
"rewards/rejected": -1.3515005111694336,
"step": 1180
},
{
"epoch": 0.948,
"grad_norm": 10.453509330749512,
"learning_rate": 4.1071322966535487e-08,
"logits/chosen": -2.577366590499878,
"logits/rejected": -2.5066463947296143,
"logps/chosen": -418.02801513671875,
"logps/rejected": -403.1604309082031,
"loss": 0.4854,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7425155639648438,
"rewards/margins": 0.8614899516105652,
"rewards/rejected": -1.6040055751800537,
"step": 1185
},
{
"epoch": 0.952,
"grad_norm": 6.866016864776611,
"learning_rate": 3.5009907323737826e-08,
"logits/chosen": -2.504338026046753,
"logits/rejected": -2.57658052444458,
"logps/chosen": -371.8552551269531,
"logps/rejected": -480.74066162109375,
"loss": 0.4368,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6980635523796082,
"rewards/margins": 0.9740827679634094,
"rewards/rejected": -1.6721464395523071,
"step": 1190
},
{
"epoch": 0.956,
"grad_norm": 8.07772159576416,
"learning_rate": 2.9429046383618042e-08,
"logits/chosen": -2.459728717803955,
"logits/rejected": -2.4553236961364746,
"logps/chosen": -368.6015930175781,
"logps/rejected": -395.6241149902344,
"loss": 0.4823,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.7073078751564026,
"rewards/margins": 0.7206257581710815,
"rewards/rejected": -1.4279335737228394,
"step": 1195
},
{
"epoch": 0.96,
"grad_norm": 13.021551132202148,
"learning_rate": 2.4329828146074096e-08,
"logits/chosen": -2.524336099624634,
"logits/rejected": -2.4975974559783936,
"logps/chosen": -377.58343505859375,
"logps/rejected": -374.9549255371094,
"loss": 0.5196,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8911786079406738,
"rewards/margins": 0.713758647441864,
"rewards/rejected": -1.6049373149871826,
"step": 1200
},
{
"epoch": 0.96,
"eval_logits/chosen": -2.5477142333984375,
"eval_logits/rejected": -2.504517078399658,
"eval_logps/chosen": -361.9387512207031,
"eval_logps/rejected": -394.47796630859375,
"eval_loss": 0.5300799608230591,
"eval_rewards/accuracies": 0.7202380895614624,
"eval_rewards/chosen": -0.7870069146156311,
"eval_rewards/margins": 0.6774766445159912,
"eval_rewards/rejected": -1.4644837379455566,
"eval_runtime": 166.2408,
"eval_samples_per_second": 3.008,
"eval_steps_per_second": 0.379,
"step": 1200
},
{
"epoch": 0.964,
"grad_norm": 8.705704689025879,
"learning_rate": 1.9713246713805588e-08,
"logits/chosen": -2.4079999923706055,
"logits/rejected": -2.3863213062286377,
"logps/chosen": -336.49639892578125,
"logps/rejected": -405.0527648925781,
"loss": 0.4696,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6577492952346802,
"rewards/margins": 0.8586214780807495,
"rewards/rejected": -1.5163707733154297,
"step": 1205
},
{
"epoch": 0.968,
"grad_norm": 9.633703231811523,
"learning_rate": 1.5580202098509078e-08,
"logits/chosen": -2.488119602203369,
"logits/rejected": -2.446547746658325,
"logps/chosen": -409.77557373046875,
"logps/rejected": -457.3531188964844,
"loss": 0.5975,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9190298318862915,
"rewards/margins": 0.5190376043319702,
"rewards/rejected": -1.4380674362182617,
"step": 1210
},
{
"epoch": 0.972,
"grad_norm": 9.208328247070312,
"learning_rate": 1.193150004542204e-08,
"logits/chosen": -2.523573160171509,
"logits/rejected": -2.5186927318573,
"logps/chosen": -355.54656982421875,
"logps/rejected": -407.33172607421875,
"loss": 0.5734,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.683289647102356,
"rewards/margins": 0.6039477586746216,
"rewards/rejected": -1.2872374057769775,
"step": 1215
},
{
"epoch": 0.976,
"grad_norm": 7.021068096160889,
"learning_rate": 8.767851876239075e-09,
"logits/chosen": -2.505402088165283,
"logits/rejected": -2.454876661300659,
"logps/chosen": -327.73358154296875,
"logps/rejected": -372.61370849609375,
"loss": 0.5824,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.8027257919311523,
"rewards/margins": 0.5884792804718018,
"rewards/rejected": -1.391205072402954,
"step": 1220
},
{
"epoch": 0.98,
"grad_norm": 8.4197416305542,
"learning_rate": 6.089874350439507e-09,
"logits/chosen": -2.5013089179992676,
"logits/rejected": -2.485605239868164,
"logps/chosen": -435.61669921875,
"logps/rejected": -448.99688720703125,
"loss": 0.5037,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.8056790232658386,
"rewards/margins": 0.687267005443573,
"rewards/rejected": -1.492945909500122,
"step": 1225
},
{
"epoch": 0.984,
"grad_norm": 9.84626293182373,
"learning_rate": 3.8980895450474455e-09,
"logits/chosen": -2.469447612762451,
"logits/rejected": -2.4653396606445312,
"logps/chosen": -375.6591796875,
"logps/rejected": -485.65179443359375,
"loss": 0.4352,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7044429779052734,
"rewards/margins": 0.9541055560112,
"rewards/rejected": -1.658548355102539,
"step": 1230
},
{
"epoch": 0.988,
"grad_norm": 10.856142044067383,
"learning_rate": 2.192924752854042e-09,
"logits/chosen": -2.5709242820739746,
"logits/rejected": -2.552412986755371,
"logps/chosen": -359.99749755859375,
"logps/rejected": -404.10693359375,
"loss": 0.5811,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8072735071182251,
"rewards/margins": 0.5389326810836792,
"rewards/rejected": -1.3462061882019043,
"step": 1235
},
{
"epoch": 0.992,
"grad_norm": 8.36683464050293,
"learning_rate": 9.747123991141193e-10,
"logits/chosen": -2.4341177940368652,
"logits/rejected": -2.4185235500335693,
"logps/chosen": -372.7251892089844,
"logps/rejected": -395.2005310058594,
"loss": 0.5735,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8750492334365845,
"rewards/margins": 0.6034801006317139,
"rewards/rejected": -1.4785292148590088,
"step": 1240
},
{
"epoch": 0.996,
"grad_norm": 9.960768699645996,
"learning_rate": 2.43689976739403e-10,
"logits/chosen": -2.397348642349243,
"logits/rejected": -2.444608688354492,
"logps/chosen": -407.68475341796875,
"logps/rejected": -409.0362243652344,
"loss": 0.5478,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.8263516426086426,
"rewards/margins": 0.5675632357597351,
"rewards/rejected": -1.3939149379730225,
"step": 1245
},
{
"epoch": 1.0,
"grad_norm": 14.954544067382812,
"learning_rate": 0.0,
"logits/chosen": -2.471954822540283,
"logits/rejected": -2.448702335357666,
"logps/chosen": -397.40447998046875,
"logps/rejected": -444.6131896972656,
"loss": 0.5219,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.9162886738777161,
"rewards/margins": 0.6220329999923706,
"rewards/rejected": -1.5383217334747314,
"step": 1250
},
{
"epoch": 1.0,
"step": 1250,
"total_flos": 0.0,
"train_loss": 0.5873338260650635,
"train_runtime": 15803.1996,
"train_samples_per_second": 1.266,
"train_steps_per_second": 0.079
}
],
"logging_steps": 5,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}