martimfasantos's picture
Model save
78dc60f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 2776,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007204610951008645,
"grad_norm": 21.825801849365234,
"learning_rate": 1.7985611510791367e-10,
"logits/chosen": -1.539827823638916,
"logits/rejected": -1.5469944477081299,
"logps/chosen": -40.41275405883789,
"logps/rejected": -44.19762420654297,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.007204610951008645,
"grad_norm": 23.4193058013916,
"learning_rate": 1.7985611510791365e-09,
"logits/chosen": -1.6871981620788574,
"logits/rejected": -1.6693940162658691,
"logps/chosen": -48.1815299987793,
"logps/rejected": -51.31031799316406,
"loss": 0.6939,
"rewards/accuracies": 0.3819444477558136,
"rewards/chosen": -0.0012067599454894662,
"rewards/margins": -0.001382419839501381,
"rewards/rejected": 0.00017565980670042336,
"step": 10
},
{
"epoch": 0.01440922190201729,
"grad_norm": 26.470735549926758,
"learning_rate": 3.597122302158273e-09,
"logits/chosen": -1.7020677328109741,
"logits/rejected": -1.6882435083389282,
"logps/chosen": -50.412376403808594,
"logps/rejected": -53.254310607910156,
"loss": 0.6941,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.00035694619873538613,
"rewards/margins": -0.0017174886306747794,
"rewards/rejected": 0.00207443512044847,
"step": 20
},
{
"epoch": 0.021613832853025938,
"grad_norm": 31.81087303161621,
"learning_rate": 5.3956834532374095e-09,
"logits/chosen": -1.6516170501708984,
"logits/rejected": -1.6449276208877563,
"logps/chosen": -54.51926803588867,
"logps/rejected": -57.81389617919922,
"loss": 0.694,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.0005903076380491257,
"rewards/margins": -0.0014708719681948423,
"rewards/rejected": 0.0008805643883533776,
"step": 30
},
{
"epoch": 0.02881844380403458,
"grad_norm": 24.485918045043945,
"learning_rate": 7.194244604316546e-09,
"logits/chosen": -1.7025425434112549,
"logits/rejected": -1.6985228061676025,
"logps/chosen": -49.09718704223633,
"logps/rejected": -52.62793731689453,
"loss": 0.6947,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": -0.0021219542250037193,
"rewards/margins": -0.002908582566305995,
"rewards/rejected": 0.0007866283995099366,
"step": 40
},
{
"epoch": 0.03602305475504323,
"grad_norm": 25.75558090209961,
"learning_rate": 8.992805755395683e-09,
"logits/chosen": -1.6198612451553345,
"logits/rejected": -1.6186037063598633,
"logps/chosen": -49.2431526184082,
"logps/rejected": -51.3972282409668,
"loss": 0.6932,
"rewards/accuracies": 0.46875,
"rewards/chosen": 8.340943895746022e-05,
"rewards/margins": 9.575006697559729e-05,
"rewards/rejected": -1.234045521414373e-05,
"step": 50
},
{
"epoch": 0.043227665706051875,
"grad_norm": 35.717689514160156,
"learning_rate": 1.0791366906474819e-08,
"logits/chosen": -1.702419638633728,
"logits/rejected": -1.6958341598510742,
"logps/chosen": -57.57291793823242,
"logps/rejected": -59.58625030517578,
"loss": 0.6925,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.0025610830634832382,
"rewards/margins": 0.0014890721067786217,
"rewards/rejected": 0.0010720104910433292,
"step": 60
},
{
"epoch": 0.05043227665706052,
"grad_norm": 20.242334365844727,
"learning_rate": 1.2589928057553956e-08,
"logits/chosen": -1.6722795963287354,
"logits/rejected": -1.6661018133163452,
"logps/chosen": -57.32216262817383,
"logps/rejected": -60.748252868652344,
"loss": 0.6935,
"rewards/accuracies": 0.41874998807907104,
"rewards/chosen": 0.0012209347914904356,
"rewards/margins": -0.0005330587737262249,
"rewards/rejected": 0.0017539940308779478,
"step": 70
},
{
"epoch": 0.05763688760806916,
"grad_norm": 29.4974308013916,
"learning_rate": 1.4388489208633092e-08,
"logits/chosen": -1.7139599323272705,
"logits/rejected": -1.7067283391952515,
"logps/chosen": -58.90734100341797,
"logps/rejected": -61.24401092529297,
"loss": 0.6927,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.00023442548990715295,
"rewards/margins": 0.001039124559611082,
"rewards/rejected": -0.0008046992006711662,
"step": 80
},
{
"epoch": 0.06484149855907781,
"grad_norm": 30.03173065185547,
"learning_rate": 1.618705035971223e-08,
"logits/chosen": -1.6469614505767822,
"logits/rejected": -1.643303632736206,
"logps/chosen": -56.198219299316406,
"logps/rejected": -58.478515625,
"loss": 0.6925,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0021789357997477055,
"rewards/margins": 0.0014319642214104533,
"rewards/rejected": 0.0007469715783372521,
"step": 90
},
{
"epoch": 0.07204610951008646,
"grad_norm": 28.857566833496094,
"learning_rate": 1.7985611510791365e-08,
"logits/chosen": -1.7217485904693604,
"logits/rejected": -1.7120834589004517,
"logps/chosen": -51.54811477661133,
"logps/rejected": -55.7042236328125,
"loss": 0.6918,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -4.318228457123041e-05,
"rewards/margins": 0.002846790011972189,
"rewards/rejected": -0.002889972412958741,
"step": 100
},
{
"epoch": 0.0792507204610951,
"grad_norm": 21.317718505859375,
"learning_rate": 1.9784172661870502e-08,
"logits/chosen": -1.5993636846542358,
"logits/rejected": -1.5787856578826904,
"logps/chosen": -55.75529098510742,
"logps/rejected": -58.517738342285156,
"loss": 0.6931,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": 0.0009529569069854915,
"rewards/margins": 0.00015153829008340836,
"rewards/rejected": 0.0008014187915250659,
"step": 110
},
{
"epoch": 0.08645533141210375,
"grad_norm": 22.020675659179688,
"learning_rate": 2.1582733812949638e-08,
"logits/chosen": -1.6349830627441406,
"logits/rejected": -1.6326115131378174,
"logps/chosen": -50.696022033691406,
"logps/rejected": -53.475746154785156,
"loss": 0.6944,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -6.050623414921574e-05,
"rewards/margins": -0.002361572813242674,
"rewards/rejected": 0.0023010666482150555,
"step": 120
},
{
"epoch": 0.0936599423631124,
"grad_norm": 27.489294052124023,
"learning_rate": 2.3381294964028775e-08,
"logits/chosen": -1.7193313837051392,
"logits/rejected": -1.7148230075836182,
"logps/chosen": -56.9023323059082,
"logps/rejected": -59.1072998046875,
"loss": 0.6941,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.0012045868206769228,
"rewards/margins": -0.0017159022390842438,
"rewards/rejected": 0.0005113151855766773,
"step": 130
},
{
"epoch": 0.10086455331412104,
"grad_norm": 26.305728912353516,
"learning_rate": 2.517985611510791e-08,
"logits/chosen": -1.606856346130371,
"logits/rejected": -1.595958948135376,
"logps/chosen": -52.90543746948242,
"logps/rejected": -57.972129821777344,
"loss": 0.6934,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.002076763194054365,
"rewards/margins": -0.00030962502933107316,
"rewards/rejected": 0.0023863886017352343,
"step": 140
},
{
"epoch": 0.10806916426512968,
"grad_norm": 31.3708553314209,
"learning_rate": 2.6978417266187048e-08,
"logits/chosen": -1.6171735525131226,
"logits/rejected": -1.6119439601898193,
"logps/chosen": -54.98027801513672,
"logps/rejected": -59.0755615234375,
"loss": 0.6926,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0010308735072612762,
"rewards/margins": 0.0012253187596797943,
"rewards/rejected": -0.00019444509234745055,
"step": 150
},
{
"epoch": 0.11527377521613832,
"grad_norm": 24.7769832611084,
"learning_rate": 2.8776978417266184e-08,
"logits/chosen": -1.6504337787628174,
"logits/rejected": -1.631773591041565,
"logps/chosen": -47.71480941772461,
"logps/rejected": -51.67340850830078,
"loss": 0.6935,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.00014348707918543369,
"rewards/margins": -0.0006808551261201501,
"rewards/rejected": 0.0008243421907536685,
"step": 160
},
{
"epoch": 0.12247838616714697,
"grad_norm": 28.69447898864746,
"learning_rate": 3.057553956834532e-08,
"logits/chosen": -1.6642261743545532,
"logits/rejected": -1.6457111835479736,
"logps/chosen": -51.293724060058594,
"logps/rejected": -54.08746337890625,
"loss": 0.692,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0019911406561732292,
"rewards/margins": 0.0023246805649250746,
"rewards/rejected": -0.0003335399378556758,
"step": 170
},
{
"epoch": 0.12968299711815562,
"grad_norm": 22.81742286682129,
"learning_rate": 3.237410071942446e-08,
"logits/chosen": -1.723880410194397,
"logits/rejected": -1.7132070064544678,
"logps/chosen": -51.556312561035156,
"logps/rejected": -54.2805290222168,
"loss": 0.6919,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0016957769403234124,
"rewards/margins": 0.00254463916644454,
"rewards/rejected": -0.0008488625171594322,
"step": 180
},
{
"epoch": 0.13688760806916425,
"grad_norm": 24.014127731323242,
"learning_rate": 3.4172661870503594e-08,
"logits/chosen": -1.7289400100708008,
"logits/rejected": -1.7246164083480835,
"logps/chosen": -48.183475494384766,
"logps/rejected": -52.82011032104492,
"loss": 0.6917,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0009596001473255455,
"rewards/margins": 0.0030194323044270277,
"rewards/rejected": -0.002059832215309143,
"step": 190
},
{
"epoch": 0.1440922190201729,
"grad_norm": 33.81916046142578,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -1.68062424659729,
"logits/rejected": -1.6727863550186157,
"logps/chosen": -49.81970977783203,
"logps/rejected": -54.09613800048828,
"loss": 0.6925,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 2.1038798877270892e-05,
"rewards/margins": 0.0013697268441319466,
"rewards/rejected": -0.001348688150756061,
"step": 200
},
{
"epoch": 0.15129682997118155,
"grad_norm": 27.09559440612793,
"learning_rate": 3.776978417266187e-08,
"logits/chosen": -1.7069154977798462,
"logits/rejected": -1.6970031261444092,
"logps/chosen": -49.16070556640625,
"logps/rejected": -50.410491943359375,
"loss": 0.6918,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.002084630075842142,
"rewards/margins": 0.0027677714824676514,
"rewards/rejected": -0.0006831414066255093,
"step": 210
},
{
"epoch": 0.1585014409221902,
"grad_norm": 26.769506454467773,
"learning_rate": 3.9568345323741003e-08,
"logits/chosen": -1.62959885597229,
"logits/rejected": -1.6223684549331665,
"logps/chosen": -53.7839241027832,
"logps/rejected": -60.50789260864258,
"loss": 0.6915,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.004310721065849066,
"rewards/margins": 0.003480097744613886,
"rewards/rejected": 0.0008306234958581626,
"step": 220
},
{
"epoch": 0.16570605187319884,
"grad_norm": 24.570886611938477,
"learning_rate": 4.136690647482014e-08,
"logits/chosen": -1.6772973537445068,
"logits/rejected": -1.675528883934021,
"logps/chosen": -51.2908935546875,
"logps/rejected": -55.5107536315918,
"loss": 0.693,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.0018524869810789824,
"rewards/margins": 0.0003210431314073503,
"rewards/rejected": 0.0015314440242946148,
"step": 230
},
{
"epoch": 0.1729106628242075,
"grad_norm": 26.605432510375977,
"learning_rate": 4.3165467625899276e-08,
"logits/chosen": -1.7227964401245117,
"logits/rejected": -1.7107378244400024,
"logps/chosen": -55.66338348388672,
"logps/rejected": -58.241539001464844,
"loss": 0.6933,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.002910902723670006,
"rewards/margins": -0.0001255483366549015,
"rewards/rejected": 0.003036451293155551,
"step": 240
},
{
"epoch": 0.18011527377521613,
"grad_norm": 35.790584564208984,
"learning_rate": 4.496402877697841e-08,
"logits/chosen": -1.6011412143707275,
"logits/rejected": -1.5909541845321655,
"logps/chosen": -57.142967224121094,
"logps/rejected": -59.007530212402344,
"loss": 0.6922,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0032608923502266407,
"rewards/margins": 0.0020222733728587627,
"rewards/rejected": 0.0012386186281219125,
"step": 250
},
{
"epoch": 0.1873198847262248,
"grad_norm": 29.72098731994629,
"learning_rate": 4.676258992805755e-08,
"logits/chosen": -1.6467279195785522,
"logits/rejected": -1.6427417993545532,
"logps/chosen": -56.70636749267578,
"logps/rejected": -59.045448303222656,
"loss": 0.6941,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.0010839566821232438,
"rewards/margins": -0.0017195299733430147,
"rewards/rejected": 0.0028034865390509367,
"step": 260
},
{
"epoch": 0.19452449567723343,
"grad_norm": 30.71733856201172,
"learning_rate": 4.8561151079136686e-08,
"logits/chosen": -1.7268844842910767,
"logits/rejected": -1.713024377822876,
"logps/chosen": -53.37934494018555,
"logps/rejected": -56.01610565185547,
"loss": 0.6931,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.002369955414906144,
"rewards/margins": 0.00020498293451964855,
"rewards/rejected": 0.0021649724803864956,
"step": 270
},
{
"epoch": 0.2017291066282421,
"grad_norm": 27.323898315429688,
"learning_rate": 4.999992091672379e-08,
"logits/chosen": -1.6942745447158813,
"logits/rejected": -1.679835557937622,
"logps/chosen": -48.37071228027344,
"logps/rejected": -50.23174285888672,
"loss": 0.6921,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.002187022939324379,
"rewards/margins": 0.0021849684417247772,
"rewards/rejected": 2.054649030469591e-06,
"step": 280
},
{
"epoch": 0.20893371757925072,
"grad_norm": 27.54994773864746,
"learning_rate": 4.999715305459108e-08,
"logits/chosen": -1.7232547998428345,
"logits/rejected": -1.7125844955444336,
"logps/chosen": -51.24810028076172,
"logps/rejected": -52.99908447265625,
"loss": 0.6932,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.0002954238443635404,
"rewards/margins": -6.616571045015007e-05,
"rewards/rejected": 0.00036158948205411434,
"step": 290
},
{
"epoch": 0.21613832853025935,
"grad_norm": 28.071247100830078,
"learning_rate": 4.9990431528966836e-08,
"logits/chosen": -1.7029025554656982,
"logits/rejected": -1.6867132186889648,
"logps/chosen": -55.0485954284668,
"logps/rejected": -58.52321243286133,
"loss": 0.6922,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.0034954734146595,
"rewards/margins": 0.0020077379886060953,
"rewards/rejected": 0.0014877354260534048,
"step": 300
},
{
"epoch": 0.22334293948126802,
"grad_norm": 30.036535263061523,
"learning_rate": 4.997975740295813e-08,
"logits/chosen": -1.5882554054260254,
"logits/rejected": -1.5789012908935547,
"logps/chosen": -54.333534240722656,
"logps/rejected": -57.2091064453125,
"loss": 0.6904,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.004001539200544357,
"rewards/margins": 0.0055999672040343285,
"rewards/rejected": -0.0015984283527359366,
"step": 310
},
{
"epoch": 0.23054755043227665,
"grad_norm": 23.738216400146484,
"learning_rate": 4.996513236483331e-08,
"logits/chosen": -1.6974598169326782,
"logits/rejected": -1.6820309162139893,
"logps/chosen": -54.326332092285156,
"logps/rejected": -59.19081497192383,
"loss": 0.6929,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.002228650962933898,
"rewards/margins": 0.0006980501930229366,
"rewards/rejected": 0.0015306004788726568,
"step": 320
},
{
"epoch": 0.2377521613832853,
"grad_norm": 21.979890823364258,
"learning_rate": 4.9946558727754974e-08,
"logits/chosen": -1.6695890426635742,
"logits/rejected": -1.6626873016357422,
"logps/chosen": -57.21343231201172,
"logps/rejected": -56.08220672607422,
"loss": 0.6934,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.0033706065732985735,
"rewards/margins": -0.0003914666303899139,
"rewards/rejected": 0.003762073116376996,
"step": 330
},
{
"epoch": 0.24495677233429394,
"grad_norm": 25.381389617919922,
"learning_rate": 4.9924039429414086e-08,
"logits/chosen": -1.723496437072754,
"logits/rejected": -1.7127540111541748,
"logps/chosen": -58.3027458190918,
"logps/rejected": -60.19641876220703,
"loss": 0.6911,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.0054273479618132114,
"rewards/margins": 0.004153185058385134,
"rewards/rejected": 0.0012741630198433995,
"step": 340
},
{
"epoch": 0.2521613832853026,
"grad_norm": 35.49002456665039,
"learning_rate": 4.989757803156537e-08,
"logits/chosen": -1.655975580215454,
"logits/rejected": -1.6502435207366943,
"logps/chosen": -52.090789794921875,
"logps/rejected": -55.48548126220703,
"loss": 0.69,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.006078408099710941,
"rewards/margins": 0.0064167543314397335,
"rewards/rejected": -0.00033834631904028356,
"step": 350
},
{
"epoch": 0.25936599423631124,
"grad_norm": 30.3967227935791,
"learning_rate": 4.986717871946393e-08,
"logits/chosen": -1.6415973901748657,
"logits/rejected": -1.6323236227035522,
"logps/chosen": -62.180633544921875,
"logps/rejected": -65.96495819091797,
"loss": 0.6911,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.005377051420509815,
"rewards/margins": 0.004161593038588762,
"rewards/rejected": 0.0012154586147516966,
"step": 360
},
{
"epoch": 0.2665706051873199,
"grad_norm": 35.132511138916016,
"learning_rate": 4.983284630120331e-08,
"logits/chosen": -1.6300022602081299,
"logits/rejected": -1.6275346279144287,
"logps/chosen": -55.072723388671875,
"logps/rejected": -60.9841423034668,
"loss": 0.6928,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0034929229877889156,
"rewards/margins": 0.0007132775499485433,
"rewards/rejected": 0.002779645612463355,
"step": 370
},
{
"epoch": 0.2737752161383285,
"grad_norm": 24.042118072509766,
"learning_rate": 4.979458620695505e-08,
"logits/chosen": -1.7153857946395874,
"logits/rejected": -1.7088816165924072,
"logps/chosen": -53.499717712402344,
"logps/rejected": -55.354148864746094,
"loss": 0.6917,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.005800359416753054,
"rewards/margins": 0.002978697419166565,
"rewards/rejected": 0.002821662463247776,
"step": 380
},
{
"epoch": 0.28097982708933716,
"grad_norm": 33.635623931884766,
"learning_rate": 4.975240448810977e-08,
"logits/chosen": -1.6882060766220093,
"logits/rejected": -1.678504228591919,
"logps/chosen": -55.21924591064453,
"logps/rejected": -59.813438415527344,
"loss": 0.6913,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.0060536982491612434,
"rewards/margins": 0.003763732733204961,
"rewards/rejected": 0.0022899655159562826,
"step": 390
},
{
"epoch": 0.2881844380403458,
"grad_norm": 30.458513259887695,
"learning_rate": 4.970630781632009e-08,
"logits/chosen": -1.70065176486969,
"logits/rejected": -1.6941430568695068,
"logps/chosen": -51.30016326904297,
"logps/rejected": -53.36370849609375,
"loss": 0.6919,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.008028768934309483,
"rewards/margins": 0.0026502818800508976,
"rewards/rejected": 0.005378487519919872,
"step": 400
},
{
"epoch": 0.2953890489913545,
"grad_norm": 33.6281852722168,
"learning_rate": 4.965630348244542e-08,
"logits/chosen": -1.6540043354034424,
"logits/rejected": -1.6505072116851807,
"logps/chosen": -54.006988525390625,
"logps/rejected": -56.83721923828125,
"loss": 0.6929,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.00576495798304677,
"rewards/margins": 0.0007124156691133976,
"rewards/rejected": 0.0050525423139333725,
"step": 410
},
{
"epoch": 0.3025936599423631,
"grad_norm": 23.50993537902832,
"learning_rate": 4.9602399395398786e-08,
"logits/chosen": -1.619267463684082,
"logits/rejected": -1.627747893333435,
"logps/chosen": -52.449256896972656,
"logps/rejected": -56.338417053222656,
"loss": 0.6933,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.003929648548364639,
"rewards/margins": -0.00018039104179479182,
"rewards/rejected": 0.0041100396774709225,
"step": 420
},
{
"epoch": 0.30979827089337175,
"grad_norm": 21.24608612060547,
"learning_rate": 4.95446040808959e-08,
"logits/chosen": -1.645350694656372,
"logits/rejected": -1.6449620723724365,
"logps/chosen": -52.73632049560547,
"logps/rejected": -53.45061111450195,
"loss": 0.6908,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.006634493824094534,
"rewards/margins": 0.004952923860400915,
"rewards/rejected": 0.0016815703129395843,
"step": 430
},
{
"epoch": 0.3170028818443804,
"grad_norm": 22.69368553161621,
"learning_rate": 4.948292668010676e-08,
"logits/chosen": -1.6465650796890259,
"logits/rejected": -1.637599229812622,
"logps/chosen": -52.00286102294922,
"logps/rejected": -56.63525390625,
"loss": 0.6913,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0058380914852023125,
"rewards/margins": 0.0039370255544781685,
"rewards/rejected": 0.0019010662799701095,
"step": 440
},
{
"epoch": 0.3242074927953891,
"grad_norm": 28.970970153808594,
"learning_rate": 4.941737694820975e-08,
"logits/chosen": -1.6554502248764038,
"logits/rejected": -1.63752019405365,
"logps/chosen": -60.775146484375,
"logps/rejected": -59.3543701171875,
"loss": 0.6913,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.008422751910984516,
"rewards/margins": 0.003841835306957364,
"rewards/rejected": 0.004580915905535221,
"step": 450
},
{
"epoch": 0.3314121037463977,
"grad_norm": 37.377037048339844,
"learning_rate": 4.93479652528488e-08,
"logits/chosen": -1.6547534465789795,
"logits/rejected": -1.6382789611816406,
"logps/chosen": -60.82624053955078,
"logps/rejected": -63.30852127075195,
"loss": 0.6906,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.008778962306678295,
"rewards/margins": 0.005369237158447504,
"rewards/rejected": 0.003409724682569504,
"step": 460
},
{
"epoch": 0.33861671469740634,
"grad_norm": 32.41299819946289,
"learning_rate": 4.9274702572493555e-08,
"logits/chosen": -1.7175623178482056,
"logits/rejected": -1.6901031732559204,
"logps/chosen": -60.22943878173828,
"logps/rejected": -61.229896545410156,
"loss": 0.6922,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.004740011878311634,
"rewards/margins": 0.002111945766955614,
"rewards/rejected": 0.002628065412864089,
"step": 470
},
{
"epoch": 0.345821325648415,
"grad_norm": 31.445178985595703,
"learning_rate": 4.9197600494702955e-08,
"logits/chosen": -1.784799337387085,
"logits/rejected": -1.7709108591079712,
"logps/chosen": -48.04889678955078,
"logps/rejected": -51.65523147583008,
"loss": 0.6899,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.009241985157132149,
"rewards/margins": 0.006657962687313557,
"rewards/rejected": 0.0025840220041573048,
"step": 480
},
{
"epoch": 0.3530259365994236,
"grad_norm": 37.82373809814453,
"learning_rate": 4.9116671214292526e-08,
"logits/chosen": -1.6648657321929932,
"logits/rejected": -1.654442548751831,
"logps/chosen": -53.06227493286133,
"logps/rejected": -56.86822509765625,
"loss": 0.6899,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.010359242558479309,
"rewards/margins": 0.0067110308445990086,
"rewards/rejected": 0.003648211481049657,
"step": 490
},
{
"epoch": 0.36023054755043227,
"grad_norm": 26.230350494384766,
"learning_rate": 4.903192753140557e-08,
"logits/chosen": -1.6695973873138428,
"logits/rejected": -1.6475780010223389,
"logps/chosen": -48.755794525146484,
"logps/rejected": -53.21979904174805,
"loss": 0.6892,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.010777877643704414,
"rewards/margins": 0.00803940650075674,
"rewards/rejected": 0.002738471608608961,
"step": 500
},
{
"epoch": 0.36743515850144093,
"grad_norm": 25.887540817260742,
"learning_rate": 4.894338284948866e-08,
"logits/chosen": -1.775498628616333,
"logits/rejected": -1.7581126689910889,
"logps/chosen": -52.77173614501953,
"logps/rejected": -55.65742111206055,
"loss": 0.6892,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.011902733705937862,
"rewards/margins": 0.008151276968419552,
"rewards/rejected": 0.0037514567375183105,
"step": 510
},
{
"epoch": 0.3746397694524496,
"grad_norm": 32.86073684692383,
"learning_rate": 4.8851051173171656e-08,
"logits/chosen": -1.670640230178833,
"logits/rejected": -1.666121482849121,
"logps/chosen": -61.16508865356445,
"logps/rejected": -64.01704406738281,
"loss": 0.6911,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.008481292985379696,
"rewards/margins": 0.004413911607116461,
"rewards/rejected": 0.00406738230958581,
"step": 520
},
{
"epoch": 0.3818443804034582,
"grad_norm": 30.485824584960938,
"learning_rate": 4.8754947106052696e-08,
"logits/chosen": -1.6040318012237549,
"logits/rejected": -1.5821549892425537,
"logps/chosen": -53.919700622558594,
"logps/rejected": -55.5211067199707,
"loss": 0.6894,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.01268856506794691,
"rewards/margins": 0.007814417593181133,
"rewards/rejected": 0.004874147940427065,
"step": 530
},
{
"epoch": 0.38904899135446686,
"grad_norm": 29.459304809570312,
"learning_rate": 4.865508584838841e-08,
"logits/chosen": -1.6308727264404297,
"logits/rejected": -1.6111412048339844,
"logps/chosen": -52.30418014526367,
"logps/rejected": -55.19745635986328,
"loss": 0.6886,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.014297975227236748,
"rewards/margins": 0.00937692541629076,
"rewards/rejected": 0.004921050742268562,
"step": 540
},
{
"epoch": 0.3962536023054755,
"grad_norm": 27.596515655517578,
"learning_rate": 4.855148319468979e-08,
"logits/chosen": -1.5790401697158813,
"logits/rejected": -1.5705499649047852,
"logps/chosen": -54.7948112487793,
"logps/rejected": -55.06526565551758,
"loss": 0.6915,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.011280642822384834,
"rewards/margins": 0.0034813634119927883,
"rewards/rejected": 0.007799278944730759,
"step": 550
},
{
"epoch": 0.4034582132564842,
"grad_norm": 28.585477828979492,
"learning_rate": 4.8444155531224065e-08,
"logits/chosen": -1.7170881032943726,
"logits/rejected": -1.7071220874786377,
"logps/chosen": -54.61487579345703,
"logps/rejected": -56.94572067260742,
"loss": 0.6905,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.011090746149420738,
"rewards/margins": 0.005567173473536968,
"rewards/rejected": 0.005523574538528919,
"step": 560
},
{
"epoch": 0.4106628242074928,
"grad_norm": 28.21603775024414,
"learning_rate": 4.833311983342292e-08,
"logits/chosen": -1.6930019855499268,
"logits/rejected": -1.6664783954620361,
"logps/chosen": -60.13446807861328,
"logps/rejected": -62.42277145385742,
"loss": 0.6891,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.013689065352082253,
"rewards/margins": 0.008529866114258766,
"rewards/rejected": 0.005159198306500912,
"step": 570
},
{
"epoch": 0.41786743515850144,
"grad_norm": 30.411657333374023,
"learning_rate": 4.821839366319768e-08,
"logits/chosen": -1.718141794204712,
"logits/rejected": -1.7054805755615234,
"logps/chosen": -49.07467269897461,
"logps/rejected": -51.04106521606445,
"loss": 0.6881,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.014034624211490154,
"rewards/margins": 0.010366623289883137,
"rewards/rejected": 0.0036680016200989485,
"step": 580
},
{
"epoch": 0.4250720461095101,
"grad_norm": 24.9964599609375,
"learning_rate": 4.8099995166161536e-08,
"logits/chosen": -1.6703847646713257,
"logits/rejected": -1.6679855585098267,
"logps/chosen": -54.932334899902344,
"logps/rejected": -61.33048629760742,
"loss": 0.6893,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.016779553145170212,
"rewards/margins": 0.007991639897227287,
"rewards/rejected": 0.00878791231662035,
"step": 590
},
{
"epoch": 0.4322766570605187,
"grad_norm": 28.37261390686035,
"learning_rate": 4.797794306875963e-08,
"logits/chosen": -1.7611091136932373,
"logits/rejected": -1.751899003982544,
"logps/chosen": -52.07770919799805,
"logps/rejected": -57.04071044921875,
"loss": 0.6886,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.013305628672242165,
"rewards/margins": 0.009486498311161995,
"rewards/rejected": 0.0038191310595721006,
"step": 600
},
{
"epoch": 0.43948126801152737,
"grad_norm": 33.9644660949707,
"learning_rate": 4.785225667530716e-08,
"logits/chosen": -1.6755338907241821,
"logits/rejected": -1.6584323644638062,
"logps/chosen": -57.341094970703125,
"logps/rejected": -58.7607307434082,
"loss": 0.69,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.012329719960689545,
"rewards/margins": 0.006740064360201359,
"rewards/rejected": 0.0055896565318107605,
"step": 610
},
{
"epoch": 0.44668587896253603,
"grad_norm": 24.328838348388672,
"learning_rate": 4.772295586493613e-08,
"logits/chosen": -1.7163664102554321,
"logits/rejected": -1.7021703720092773,
"logps/chosen": -49.007179260253906,
"logps/rejected": -51.45275115966797,
"loss": 0.6898,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.013197936117649078,
"rewards/margins": 0.007094507105648518,
"rewards/rejected": 0.0061034285463392735,
"step": 620
},
{
"epoch": 0.4538904899135447,
"grad_norm": 21.8724308013916,
"learning_rate": 4.759006108845116e-08,
"logits/chosen": -1.7142736911773682,
"logits/rejected": -1.7078378200531006,
"logps/chosen": -49.206016540527344,
"logps/rejected": -54.04901123046875,
"loss": 0.6856,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.017311111092567444,
"rewards/margins": 0.015471754595637321,
"rewards/rejected": 0.0018393562640994787,
"step": 630
},
{
"epoch": 0.4610951008645533,
"grad_norm": 27.477933883666992,
"learning_rate": 4.7453593365094926e-08,
"logits/chosen": -1.5769332647323608,
"logits/rejected": -1.571119785308838,
"logps/chosen": -51.74431610107422,
"logps/rejected": -55.1117057800293,
"loss": 0.6874,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.01672833226621151,
"rewards/margins": 0.01204992551356554,
"rewards/rejected": 0.004678409546613693,
"step": 640
},
{
"epoch": 0.46829971181556196,
"grad_norm": 30.72771644592285,
"learning_rate": 4.731357427922361e-08,
"logits/chosen": -1.7531722784042358,
"logits/rejected": -1.7257356643676758,
"logps/chosen": -51.89509201049805,
"logps/rejected": -52.538978576660156,
"loss": 0.6872,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.019248802214860916,
"rewards/margins": 0.012345701456069946,
"rewards/rejected": 0.00690310075879097,
"step": 650
},
{
"epoch": 0.4755043227665706,
"grad_norm": 34.05529022216797,
"learning_rate": 4.71700259768931e-08,
"logits/chosen": -1.6869693994522095,
"logits/rejected": -1.685805082321167,
"logps/chosen": -53.994873046875,
"logps/rejected": -57.93560028076172,
"loss": 0.6852,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.018321344628930092,
"rewards/margins": 0.016431275755167007,
"rewards/rejected": 0.0018900686409324408,
"step": 660
},
{
"epoch": 0.4827089337175792,
"grad_norm": 31.389686584472656,
"learning_rate": 4.7022971162356176e-08,
"logits/chosen": -1.6463180780410767,
"logits/rejected": -1.6191755533218384,
"logps/chosen": -56.87351608276367,
"logps/rejected": -57.94671630859375,
"loss": 0.6863,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.01981157623231411,
"rewards/margins": 0.014039942994713783,
"rewards/rejected": 0.005771632306277752,
"step": 670
},
{
"epoch": 0.4899135446685879,
"grad_norm": 36.83378601074219,
"learning_rate": 4.6872433094471577e-08,
"logits/chosen": -1.5911400318145752,
"logits/rejected": -1.579105257987976,
"logps/chosen": -56.64697265625,
"logps/rejected": -57.130035400390625,
"loss": 0.6874,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.019423145800828934,
"rewards/margins": 0.012030874378979206,
"rewards/rejected": 0.007392272353172302,
"step": 680
},
{
"epoch": 0.49711815561959655,
"grad_norm": 32.14356994628906,
"learning_rate": 4.671843558302522e-08,
"logits/chosen": -1.6711755990982056,
"logits/rejected": -1.6611177921295166,
"logps/chosen": -54.51006317138672,
"logps/rejected": -58.520973205566406,
"loss": 0.6889,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.01895291917026043,
"rewards/margins": 0.00888957642018795,
"rewards/rejected": 0.01006334088742733,
"step": 690
},
{
"epoch": 0.5043227665706052,
"grad_norm": 27.02754020690918,
"learning_rate": 4.656100298496439e-08,
"logits/chosen": -1.6875331401824951,
"logits/rejected": -1.6737067699432373,
"logps/chosen": -53.4271240234375,
"logps/rejected": -57.33203887939453,
"loss": 0.6863,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.020056720823049545,
"rewards/margins": 0.014298888854682446,
"rewards/rejected": 0.005757831037044525,
"step": 700
},
{
"epoch": 0.5115273775216138,
"grad_norm": 26.375259399414062,
"learning_rate": 4.640016020054527e-08,
"logits/chosen": -1.671545386314392,
"logits/rejected": -1.657135009765625,
"logps/chosen": -47.39130783081055,
"logps/rejected": -50.1197395324707,
"loss": 0.6862,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.020558740943670273,
"rewards/margins": 0.014566670171916485,
"rewards/rejected": 0.005992068909108639,
"step": 710
},
{
"epoch": 0.5187319884726225,
"grad_norm": 33.271087646484375,
"learning_rate": 4.6235932669394676e-08,
"logits/chosen": -1.6350809335708618,
"logits/rejected": -1.620134711265564,
"logps/chosen": -56.80983352661133,
"logps/rejected": -60.5606803894043,
"loss": 0.687,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.020696226507425308,
"rewards/margins": 0.012868310324847698,
"rewards/rejected": 0.00782791618257761,
"step": 720
},
{
"epoch": 0.5259365994236311,
"grad_norm": 25.984638214111328,
"learning_rate": 4.6068346366486325e-08,
"logits/chosen": -1.6507568359375,
"logits/rejected": -1.6315109729766846,
"logps/chosen": -55.800132751464844,
"logps/rejected": -58.299346923828125,
"loss": 0.6857,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.020775150507688522,
"rewards/margins": 0.015492427162826061,
"rewards/rejected": 0.005282718688249588,
"step": 730
},
{
"epoch": 0.5331412103746398,
"grad_norm": 27.39038848876953,
"learning_rate": 4.589742779803259e-08,
"logits/chosen": -1.6461410522460938,
"logits/rejected": -1.6439073085784912,
"logps/chosen": -53.59571075439453,
"logps/rejected": -52.81595993041992,
"loss": 0.6906,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.01788356341421604,
"rewards/margins": 0.005540410988032818,
"rewards/rejected": 0.0123431496322155,
"step": 740
},
{
"epoch": 0.5403458213256485,
"grad_norm": 26.366432189941406,
"learning_rate": 4.5723203997292146e-08,
"logits/chosen": -1.6698424816131592,
"logits/rejected": -1.6543042659759521,
"logps/chosen": -55.21739959716797,
"logps/rejected": -57.220542907714844,
"loss": 0.6836,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.02430053800344467,
"rewards/margins": 0.0198974572122097,
"rewards/rejected": 0.004403082188218832,
"step": 750
},
{
"epoch": 0.547550432276657,
"grad_norm": 33.0073356628418,
"learning_rate": 4.554570252029421e-08,
"logits/chosen": -1.6031173467636108,
"logits/rejected": -1.6015863418579102,
"logps/chosen": -52.24799728393555,
"logps/rejected": -55.23774337768555,
"loss": 0.6879,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.019535819068551064,
"rewards/margins": 0.01108732633292675,
"rewards/rejected": 0.008448492735624313,
"step": 760
},
{
"epoch": 0.5547550432276657,
"grad_norm": 30.090187072753906,
"learning_rate": 4.536495144148021e-08,
"logits/chosen": -1.6216052770614624,
"logits/rejected": -1.6167309284210205,
"logps/chosen": -49.6798095703125,
"logps/rejected": -53.358123779296875,
"loss": 0.6834,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.02593979239463806,
"rewards/margins": 0.02041347324848175,
"rewards/rejected": 0.005526319611817598,
"step": 770
},
{
"epoch": 0.5619596541786743,
"grad_norm": 23.466976165771484,
"learning_rate": 4.518097934926339e-08,
"logits/chosen": -1.6407028436660767,
"logits/rejected": -1.6307079792022705,
"logps/chosen": -55.18608856201172,
"logps/rejected": -57.64618682861328,
"loss": 0.6848,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.02296794392168522,
"rewards/margins": 0.017504002898931503,
"rewards/rejected": 0.0054639410227537155,
"step": 780
},
{
"epoch": 0.569164265129683,
"grad_norm": 27.666854858398438,
"learning_rate": 4.499381534150714e-08,
"logits/chosen": -1.685529112815857,
"logits/rejected": -1.6761360168457031,
"logps/chosen": -56.853492736816406,
"logps/rejected": -61.52440643310547,
"loss": 0.6854,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.022552262991666794,
"rewards/margins": 0.016366995871067047,
"rewards/rejected": 0.006185270380228758,
"step": 790
},
{
"epoch": 0.5763688760806917,
"grad_norm": 30.78841781616211,
"learning_rate": 4.48034890209227e-08,
"logits/chosen": -1.670078992843628,
"logits/rejected": -1.6596672534942627,
"logps/chosen": -53.268516540527344,
"logps/rejected": -55.402740478515625,
"loss": 0.6854,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.024996964260935783,
"rewards/margins": 0.016293780878186226,
"rewards/rejected": 0.008703185245394707,
"step": 800
},
{
"epoch": 0.5835734870317003,
"grad_norm": 23.246274948120117,
"learning_rate": 4.4610030490387154e-08,
"logits/chosen": -1.6598783731460571,
"logits/rejected": -1.6620323657989502,
"logps/chosen": -51.27886199951172,
"logps/rejected": -55.02397537231445,
"loss": 0.6864,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.020924841985106468,
"rewards/margins": 0.014275836758315563,
"rewards/rejected": 0.006649008486419916,
"step": 810
},
{
"epoch": 0.590778097982709,
"grad_norm": 25.784561157226562,
"learning_rate": 4.4413470348182124e-08,
"logits/chosen": -1.693752646446228,
"logits/rejected": -1.6728289127349854,
"logps/chosen": -53.787078857421875,
"logps/rejected": -54.61577224731445,
"loss": 0.6851,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.02386101894080639,
"rewards/margins": 0.01699705794453621,
"rewards/rejected": 0.006863957736641169,
"step": 820
},
{
"epoch": 0.5979827089337176,
"grad_norm": 26.125385284423828,
"learning_rate": 4.421383968315427e-08,
"logits/chosen": -1.6544780731201172,
"logits/rejected": -1.6471678018569946,
"logps/chosen": -48.14168930053711,
"logps/rejected": -52.247154235839844,
"loss": 0.6797,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.03294381499290466,
"rewards/margins": 0.02802552655339241,
"rewards/rejected": 0.00491828890517354,
"step": 830
},
{
"epoch": 0.6051873198847262,
"grad_norm": 21.14900779724121,
"learning_rate": 4.4011170069798126e-08,
"logits/chosen": -1.6627849340438843,
"logits/rejected": -1.6545110940933228,
"logps/chosen": -53.29193115234375,
"logps/rejected": -54.93217849731445,
"loss": 0.6832,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0249174851924181,
"rewards/margins": 0.02101990208029747,
"rewards/rejected": 0.003897582646459341,
"step": 840
},
{
"epoch": 0.6123919308357348,
"grad_norm": 38.04325866699219,
"learning_rate": 4.380549356326208e-08,
"logits/chosen": -1.6758836507797241,
"logits/rejected": -1.663313627243042,
"logps/chosen": -54.3109016418457,
"logps/rejected": -58.55974197387695,
"loss": 0.6822,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03060699999332428,
"rewards/margins": 0.023099634796380997,
"rewards/rejected": 0.007507366128265858,
"step": 850
},
{
"epoch": 0.6195965417867435,
"grad_norm": 25.8773193359375,
"learning_rate": 4.359684269427848e-08,
"logits/chosen": -1.6748501062393188,
"logits/rejected": -1.6620460748672485,
"logps/chosen": -65.36933898925781,
"logps/rejected": -65.69126892089844,
"loss": 0.6842,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.025393366813659668,
"rewards/margins": 0.019018063321709633,
"rewards/rejected": 0.006375306751579046,
"step": 860
},
{
"epoch": 0.6268011527377522,
"grad_norm": 28.597139358520508,
"learning_rate": 4.3385250464018355e-08,
"logits/chosen": -1.7144954204559326,
"logits/rejected": -1.7035188674926758,
"logps/chosen": -54.55393600463867,
"logps/rejected": -58.47465896606445,
"loss": 0.6822,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.02745116874575615,
"rewards/margins": 0.02292410284280777,
"rewards/rejected": 0.004527065437287092,
"step": 870
},
{
"epoch": 0.6340057636887608,
"grad_norm": 28.048931121826172,
"learning_rate": 4.3170750338871806e-08,
"logits/chosen": -1.7362436056137085,
"logits/rejected": -1.7299144268035889,
"logps/chosen": -53.04833221435547,
"logps/rejected": -54.4532585144043,
"loss": 0.6873,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.020626841112971306,
"rewards/margins": 0.012741965241730213,
"rewards/rejected": 0.007884878665208817,
"step": 880
},
{
"epoch": 0.6412103746397695,
"grad_norm": 26.831119537353516,
"learning_rate": 4.295337624515485e-08,
"logits/chosen": -1.7312161922454834,
"logits/rejected": -1.7202436923980713,
"logps/chosen": -50.97220993041992,
"logps/rejected": -52.995948791503906,
"loss": 0.6824,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.02905122935771942,
"rewards/margins": 0.022564779967069626,
"rewards/rejected": 0.0064864493906497955,
"step": 890
},
{
"epoch": 0.6484149855907781,
"grad_norm": 31.435150146484375,
"learning_rate": 4.273316256374342e-08,
"logits/chosen": -1.5930171012878418,
"logits/rejected": -1.5941686630249023,
"logps/chosen": -60.389488220214844,
"logps/rejected": -64.2243881225586,
"loss": 0.6879,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.020148711279034615,
"rewards/margins": 0.011882050894200802,
"rewards/rejected": 0.008266657590866089,
"step": 900
},
{
"epoch": 0.6556195965417867,
"grad_norm": 27.812658309936523,
"learning_rate": 4.2510144124635605e-08,
"logits/chosen": -1.639870285987854,
"logits/rejected": -1.6335127353668213,
"logps/chosen": -58.445831298828125,
"logps/rejected": -59.66516876220703,
"loss": 0.6866,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.024780476465821266,
"rewards/margins": 0.01432741153985262,
"rewards/rejected": 0.010453062132000923,
"step": 910
},
{
"epoch": 0.6628242074927954,
"grad_norm": 23.902587890625,
"learning_rate": 4.22843562014427e-08,
"logits/chosen": -1.6627442836761475,
"logits/rejected": -1.651299238204956,
"logps/chosen": -47.656944274902344,
"logps/rejected": -50.61472702026367,
"loss": 0.6789,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.03436293825507164,
"rewards/margins": 0.029948776587843895,
"rewards/rejected": 0.004414163064211607,
"step": 920
},
{
"epoch": 0.670028818443804,
"grad_norm": 29.023405075073242,
"learning_rate": 4.205583450581023e-08,
"logits/chosen": -1.727513074874878,
"logits/rejected": -1.7153295278549194,
"logps/chosen": -53.51002883911133,
"logps/rejected": -56.443153381347656,
"loss": 0.682,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.032039422541856766,
"rewards/margins": 0.02314738929271698,
"rewards/rejected": 0.00889203418046236,
"step": 930
},
{
"epoch": 0.6772334293948127,
"grad_norm": 22.48113250732422,
"learning_rate": 4.1824615181769577e-08,
"logits/chosen": -1.6178547143936157,
"logits/rejected": -1.6143802404403687,
"logps/chosen": -62.34340286254883,
"logps/rejected": -63.569923400878906,
"loss": 0.6839,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.027915989980101585,
"rewards/margins": 0.01995277777314186,
"rewards/rejected": 0.007963214069604874,
"step": 940
},
{
"epoch": 0.6844380403458213,
"grad_norm": 27.37798309326172,
"learning_rate": 4.1590734800021354e-08,
"logits/chosen": -1.5735194683074951,
"logits/rejected": -1.5851722955703735,
"logps/chosen": -52.67449188232422,
"logps/rejected": -58.57447052001953,
"loss": 0.6832,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.033303067088127136,
"rewards/margins": 0.0210419949144125,
"rewards/rejected": 0.01226106844842434,
"step": 950
},
{
"epoch": 0.69164265129683,
"grad_norm": 27.289091110229492,
"learning_rate": 4.1354230352151143e-08,
"logits/chosen": -1.708268165588379,
"logits/rejected": -1.700402855873108,
"logps/chosen": -55.896453857421875,
"logps/rejected": -58.6062126159668,
"loss": 0.6844,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.02778809331357479,
"rewards/margins": 0.01915113627910614,
"rewards/rejected": 0.00863695703446865,
"step": 960
},
{
"epoch": 0.6988472622478387,
"grad_norm": 34.176612854003906,
"learning_rate": 4.111513924477878e-08,
"logits/chosen": -1.7223440408706665,
"logits/rejected": -1.7140798568725586,
"logps/chosen": -51.11802291870117,
"logps/rejected": -55.39356231689453,
"loss": 0.6805,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.038919635117053986,
"rewards/margins": 0.02688221074640751,
"rewards/rejected": 0.012037424370646477,
"step": 970
},
{
"epoch": 0.7060518731988472,
"grad_norm": 24.33797264099121,
"learning_rate": 4.087349929364192e-08,
"logits/chosen": -1.5715091228485107,
"logits/rejected": -1.5727804899215698,
"logps/chosen": -58.17668914794922,
"logps/rejected": -63.71800994873047,
"loss": 0.6856,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.02855418249964714,
"rewards/margins": 0.016662323847413063,
"rewards/rejected": 0.011891861446201801,
"step": 980
},
{
"epoch": 0.7132564841498559,
"grad_norm": 30.351694107055664,
"learning_rate": 4.062934871761497e-08,
"logits/chosen": -1.6828606128692627,
"logits/rejected": -1.6766620874404907,
"logps/chosen": -57.43531036376953,
"logps/rejected": -59.62762451171875,
"loss": 0.6837,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.030923152342438698,
"rewards/margins": 0.020262621343135834,
"rewards/rejected": 0.010660530999302864,
"step": 990
},
{
"epoch": 0.7204610951008645,
"grad_norm": 27.251075744628906,
"learning_rate": 4.038272613266419e-08,
"logits/chosen": -1.6652915477752686,
"logits/rejected": -1.6540310382843018,
"logps/chosen": -53.43489456176758,
"logps/rejected": -56.68244171142578,
"loss": 0.6821,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03513840585947037,
"rewards/margins": 0.024065181612968445,
"rewards/rejected": 0.011073225177824497,
"step": 1000
},
{
"epoch": 0.7276657060518732,
"grad_norm": 32.4438362121582,
"learning_rate": 4.0133670545740014e-08,
"logits/chosen": -1.671142578125,
"logits/rejected": -1.6590646505355835,
"logps/chosen": -48.778076171875,
"logps/rejected": -51.36602020263672,
"loss": 0.6815,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.03226137161254883,
"rewards/margins": 0.0248870812356472,
"rewards/rejected": 0.007374290376901627,
"step": 1010
},
{
"epoch": 0.7348703170028819,
"grad_norm": 26.294254302978516,
"learning_rate": 3.988222134860755e-08,
"logits/chosen": -1.6925846338272095,
"logits/rejected": -1.6816070079803467,
"logps/chosen": -52.785797119140625,
"logps/rejected": -55.186126708984375,
"loss": 0.6878,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.02925296127796173,
"rewards/margins": 0.011979686096310616,
"rewards/rejected": 0.017273275181651115,
"step": 1020
},
{
"epoch": 0.7420749279538905,
"grad_norm": 24.478652954101562,
"learning_rate": 3.962841831161617e-08,
"logits/chosen": -1.619998574256897,
"logits/rejected": -1.6116468906402588,
"logps/chosen": -49.49003982543945,
"logps/rejected": -54.25371170043945,
"loss": 0.678,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.04156359285116196,
"rewards/margins": 0.032577551901340485,
"rewards/rejected": 0.008986040018498898,
"step": 1030
},
{
"epoch": 0.7492795389048992,
"grad_norm": 23.191585540771484,
"learning_rate": 3.937230157740931e-08,
"logits/chosen": -1.6642029285430908,
"logits/rejected": -1.6495530605316162,
"logps/chosen": -52.95183563232422,
"logps/rejected": -55.2051887512207,
"loss": 0.6818,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.033531554043293,
"rewards/margins": 0.024701178073883057,
"rewards/rejected": 0.008830374106764793,
"step": 1040
},
{
"epoch": 0.7564841498559077,
"grad_norm": 22.424583435058594,
"learning_rate": 3.9113911654575246e-08,
"logits/chosen": -1.5803316831588745,
"logits/rejected": -1.5679031610488892,
"logps/chosen": -47.259952545166016,
"logps/rejected": -51.2758903503418,
"loss": 0.678,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.03728173300623894,
"rewards/margins": 0.03241972625255585,
"rewards/rejected": 0.004862007685005665,
"step": 1050
},
{
"epoch": 0.7636887608069164,
"grad_norm": 27.110639572143555,
"learning_rate": 3.885328941124014e-08,
"logits/chosen": -1.6533464193344116,
"logits/rejected": -1.6282085180282593,
"logps/chosen": -54.238807678222656,
"logps/rejected": -55.80359649658203,
"loss": 0.6777,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.037798795849084854,
"rewards/margins": 0.03302844241261482,
"rewards/rejected": 0.00477034505456686,
"step": 1060
},
{
"epoch": 0.770893371757925,
"grad_norm": 34.7132453918457,
"learning_rate": 3.8590476068604106e-08,
"logits/chosen": -1.6385762691497803,
"logits/rejected": -1.6310360431671143,
"logps/chosen": -61.6613883972168,
"logps/rejected": -66.19737243652344,
"loss": 0.6774,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.04412104934453964,
"rewards/margins": 0.03382264822721481,
"rewards/rejected": 0.010298402979969978,
"step": 1070
},
{
"epoch": 0.7780979827089337,
"grad_norm": 27.947389602661133,
"learning_rate": 3.832551319442151e-08,
"logits/chosen": -1.6364872455596924,
"logits/rejected": -1.6301469802856445,
"logps/chosen": -55.05015182495117,
"logps/rejected": -59.003196716308594,
"loss": 0.6751,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04528949409723282,
"rewards/margins": 0.038304783403873444,
"rewards/rejected": 0.006984707899391651,
"step": 1080
},
{
"epoch": 0.7853025936599424,
"grad_norm": 32.10716247558594,
"learning_rate": 3.8058442696426404e-08,
"logits/chosen": -1.6696398258209229,
"logits/rejected": -1.6630815267562866,
"logps/chosen": -60.15520095825195,
"logps/rejected": -64.48773193359375,
"loss": 0.6787,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02938222512602806,
"rewards/margins": 0.031164387241005898,
"rewards/rejected": -0.001782161882147193,
"step": 1090
},
{
"epoch": 0.792507204610951,
"grad_norm": 30.23760986328125,
"learning_rate": 3.7789306815704216e-08,
"logits/chosen": -1.6817725896835327,
"logits/rejected": -1.6605682373046875,
"logps/chosen": -53.92103958129883,
"logps/rejected": -56.1260986328125,
"loss": 0.6779,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.041012417525053024,
"rewards/margins": 0.03267299756407738,
"rewards/rejected": 0.00833942275494337,
"step": 1100
},
{
"epoch": 0.7997118155619597,
"grad_norm": 20.537240982055664,
"learning_rate": 3.7518148120010705e-08,
"logits/chosen": -1.6919755935668945,
"logits/rejected": -1.6787545680999756,
"logps/chosen": -53.570655822753906,
"logps/rejected": -56.57023239135742,
"loss": 0.6792,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.03128691390156746,
"rewards/margins": 0.03024480864405632,
"rewards/rejected": 0.0010421050246804953,
"step": 1110
},
{
"epoch": 0.8069164265129684,
"grad_norm": 23.85075569152832,
"learning_rate": 3.7245009497039244e-08,
"logits/chosen": -1.6372750997543335,
"logits/rejected": -1.6247594356536865,
"logps/chosen": -52.35149383544922,
"logps/rejected": -53.97734832763672,
"loss": 0.6766,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.04169910401105881,
"rewards/margins": 0.036101967096328735,
"rewards/rejected": 0.005597130861133337,
"step": 1120
},
{
"epoch": 0.8141210374639769,
"grad_norm": 27.246475219726562,
"learning_rate": 3.696993414763753e-08,
"logits/chosen": -1.6642570495605469,
"logits/rejected": -1.658142328262329,
"logps/chosen": -49.33148193359375,
"logps/rejected": -51.482025146484375,
"loss": 0.6839,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.03087095357477665,
"rewards/margins": 0.020789533853530884,
"rewards/rejected": 0.010081417858600616,
"step": 1130
},
{
"epoch": 0.8213256484149856,
"grad_norm": 26.61668586730957,
"learning_rate": 3.66929655789747e-08,
"logits/chosen": -1.706260323524475,
"logits/rejected": -1.6993458271026611,
"logps/chosen": -53.623802185058594,
"logps/rejected": -56.636138916015625,
"loss": 0.6747,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.04618995636701584,
"rewards/margins": 0.03998780995607376,
"rewards/rejected": 0.006202142685651779,
"step": 1140
},
{
"epoch": 0.8285302593659942,
"grad_norm": 25.16993522644043,
"learning_rate": 3.64141475976601e-08,
"logits/chosen": -1.7065346240997314,
"logits/rejected": -1.6957979202270508,
"logps/chosen": -55.83368682861328,
"logps/rejected": -58.4466667175293,
"loss": 0.68,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.034841228276491165,
"rewards/margins": 0.028890833258628845,
"rewards/rejected": 0.005950393620878458,
"step": 1150
},
{
"epoch": 0.8357348703170029,
"grad_norm": 28.79724884033203,
"learning_rate": 3.61335243028146e-08,
"logits/chosen": -1.6835393905639648,
"logits/rejected": -1.6707643270492554,
"logps/chosen": -58.164833068847656,
"logps/rejected": -58.84022903442383,
"loss": 0.6803,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.03287480026483536,
"rewards/margins": 0.028582941740751266,
"rewards/rejected": 0.004291852004826069,
"step": 1160
},
{
"epoch": 0.8429394812680115,
"grad_norm": 31.446813583374023,
"learning_rate": 3.585114007909562e-08,
"logits/chosen": -1.6065078973770142,
"logits/rejected": -1.5808089971542358,
"logps/chosen": -52.96314239501953,
"logps/rejected": -53.44304275512695,
"loss": 0.681,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.03624821454286575,
"rewards/margins": 0.0269475020468235,
"rewards/rejected": 0.009300706908106804,
"step": 1170
},
{
"epoch": 0.8501440922190202,
"grad_norm": 40.8033332824707,
"learning_rate": 3.556703958967716e-08,
"logits/chosen": -1.5730987787246704,
"logits/rejected": -1.560154676437378,
"logps/chosen": -55.611358642578125,
"logps/rejected": -57.49493408203125,
"loss": 0.6802,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.03398990258574486,
"rewards/margins": 0.02881578728556633,
"rewards/rejected": 0.005174115765839815,
"step": 1180
},
{
"epoch": 0.8573487031700289,
"grad_norm": 26.1713809967041,
"learning_rate": 3.528126776918559e-08,
"logits/chosen": -1.732617974281311,
"logits/rejected": -1.7108558416366577,
"logps/chosen": -56.743370056152344,
"logps/rejected": -58.24882888793945,
"loss": 0.6791,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.032749153673648834,
"rewards/margins": 0.03046230599284172,
"rewards/rejected": 0.0022868472151458263,
"step": 1190
},
{
"epoch": 0.8645533141210374,
"grad_norm": 30.190982818603516,
"learning_rate": 3.499386981659262e-08,
"logits/chosen": -1.6135402917861938,
"logits/rejected": -1.5936330556869507,
"logps/chosen": -59.51149368286133,
"logps/rejected": -61.74272918701172,
"loss": 0.678,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03140435367822647,
"rewards/margins": 0.03346829488873482,
"rewards/rejected": -0.002063935622572899,
"step": 1200
},
{
"epoch": 0.8717579250720461,
"grad_norm": 24.446578979492188,
"learning_rate": 3.47048911880664e-08,
"logits/chosen": -1.594386339187622,
"logits/rejected": -1.592362642288208,
"logps/chosen": -49.790889739990234,
"logps/rejected": -52.867347717285156,
"loss": 0.6794,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.03862444683909416,
"rewards/margins": 0.030268633738160133,
"rewards/rejected": 0.008355814963579178,
"step": 1210
},
{
"epoch": 0.8789625360230547,
"grad_norm": 29.226476669311523,
"learning_rate": 3.4414377589782e-08,
"logits/chosen": -1.642496109008789,
"logits/rejected": -1.6357898712158203,
"logps/chosen": -53.193016052246094,
"logps/rejected": -56.894073486328125,
"loss": 0.6791,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.03899763897061348,
"rewards/margins": 0.03164661303162575,
"rewards/rejected": 0.007351027335971594,
"step": 1220
},
{
"epoch": 0.8861671469740634,
"grad_norm": 26.702550888061523,
"learning_rate": 3.412237497069226e-08,
"logits/chosen": -1.5900542736053467,
"logits/rejected": -1.5684837102890015,
"logps/chosen": -55.75627517700195,
"logps/rejected": -58.2167854309082,
"loss": 0.6743,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.044974491000175476,
"rewards/margins": 0.0419379398226738,
"rewards/rejected": 0.003036542795598507,
"step": 1230
},
{
"epoch": 0.8933717579250721,
"grad_norm": 33.24451446533203,
"learning_rate": 3.382892951526036e-08,
"logits/chosen": -1.647112250328064,
"logits/rejected": -1.6350224018096924,
"logps/chosen": -48.98326873779297,
"logps/rejected": -52.603431701660156,
"loss": 0.6709,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.04122794419527054,
"rewards/margins": 0.04834694787859917,
"rewards/rejected": -0.007119007408618927,
"step": 1240
},
{
"epoch": 0.9005763688760807,
"grad_norm": 24.48476219177246,
"learning_rate": 3.353408763615502e-08,
"logits/chosen": -1.694043755531311,
"logits/rejected": -1.6888548135757446,
"logps/chosen": -58.39896774291992,
"logps/rejected": -62.35167694091797,
"loss": 0.6814,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.035146381705999374,
"rewards/margins": 0.0263100303709507,
"rewards/rejected": 0.008836353197693825,
"step": 1250
},
{
"epoch": 0.9077809798270894,
"grad_norm": 28.14899444580078,
"learning_rate": 3.323789596690971e-08,
"logits/chosen": -1.6525447368621826,
"logits/rejected": -1.665614128112793,
"logps/chosen": -53.552452087402344,
"logps/rejected": -61.17711639404297,
"loss": 0.6799,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.029197026044130325,
"rewards/margins": 0.029619354754686356,
"rewards/rejected": -0.0004223290889058262,
"step": 1260
},
{
"epoch": 0.9149855907780979,
"grad_norm": 28.011920928955078,
"learning_rate": 3.294040135454681e-08,
"logits/chosen": -1.5988832712173462,
"logits/rejected": -1.5843169689178467,
"logps/chosen": -52.511505126953125,
"logps/rejected": -55.977783203125,
"loss": 0.6722,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.04705361649394035,
"rewards/margins": 0.04608291760087013,
"rewards/rejected": 0.0009706999990157783,
"step": 1270
},
{
"epoch": 0.9221902017291066,
"grad_norm": 33.35093307495117,
"learning_rate": 3.264165085216817e-08,
"logits/chosen": -1.7459869384765625,
"logits/rejected": -1.7356443405151367,
"logps/chosen": -53.0772590637207,
"logps/rejected": -57.63134002685547,
"loss": 0.6681,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.05287221819162369,
"rewards/margins": 0.05477180331945419,
"rewards/rejected": -0.0018995910650119185,
"step": 1280
},
{
"epoch": 0.9293948126801153,
"grad_norm": 24.03230857849121,
"learning_rate": 3.2341691711512854e-08,
"logits/chosen": -1.712523102760315,
"logits/rejected": -1.708817720413208,
"logps/chosen": -52.34474563598633,
"logps/rejected": -56.7424430847168,
"loss": 0.6729,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.046087004244327545,
"rewards/margins": 0.04450781270861626,
"rewards/rejected": 0.0015791950281709433,
"step": 1290
},
{
"epoch": 0.9365994236311239,
"grad_norm": 27.065187454223633,
"learning_rate": 3.204057137548371e-08,
"logits/chosen": -1.7500317096710205,
"logits/rejected": -1.7493479251861572,
"logps/chosen": -54.37158203125,
"logps/rejected": -56.19264602661133,
"loss": 0.6884,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0221403818577528,
"rewards/margins": 0.011624941602349281,
"rewards/rejected": 0.01051543839275837,
"step": 1300
},
{
"epoch": 0.9438040345821326,
"grad_norm": 26.071910858154297,
"learning_rate": 3.173833747064351e-08,
"logits/chosen": -1.709240198135376,
"logits/rejected": -1.7107799053192139,
"logps/chosen": -47.57482147216797,
"logps/rejected": -51.987648010253906,
"loss": 0.6751,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.043011344969272614,
"rewards/margins": 0.03987384960055351,
"rewards/rejected": 0.00313749467022717,
"step": 1310
},
{
"epoch": 0.9510086455331412,
"grad_norm": 28.993371963500977,
"learning_rate": 3.143503779968213e-08,
"logits/chosen": -1.6517963409423828,
"logits/rejected": -1.636230230331421,
"logps/chosen": -53.55908966064453,
"logps/rejected": -56.618064880371094,
"loss": 0.6714,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.047532517462968826,
"rewards/margins": 0.04704531282186508,
"rewards/rejected": 0.0004871990531682968,
"step": 1320
},
{
"epoch": 0.9582132564841499,
"grad_norm": 27.038301467895508,
"learning_rate": 3.113072033385589e-08,
"logits/chosen": -1.6915805339813232,
"logits/rejected": -1.6699497699737549,
"logps/chosen": -58.74944305419922,
"logps/rejected": -60.51899337768555,
"loss": 0.6719,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04666688293218613,
"rewards/margins": 0.04686695709824562,
"rewards/rejected": -0.00020007490820717067,
"step": 1330
},
{
"epoch": 0.9654178674351584,
"grad_norm": 28.07818031311035,
"learning_rate": 3.082543320540015e-08,
"logits/chosen": -1.6783336400985718,
"logits/rejected": -1.675157904624939,
"logps/chosen": -52.443756103515625,
"logps/rejected": -56.30717849731445,
"loss": 0.6767,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.04153715819120407,
"rewards/margins": 0.03781905025243759,
"rewards/rejected": 0.0037181121297180653,
"step": 1340
},
{
"epoch": 0.9726224783861671,
"grad_norm": 25.59157943725586,
"learning_rate": 3.051922469991655e-08,
"logits/chosen": -1.5502612590789795,
"logits/rejected": -1.5441707372665405,
"logps/chosen": -59.5379524230957,
"logps/rejected": -61.09001541137695,
"loss": 0.6723,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.05017656087875366,
"rewards/margins": 0.04551985114812851,
"rewards/rejected": 0.004656708799302578,
"step": 1350
},
{
"epoch": 0.9798270893371758,
"grad_norm": 32.95548629760742,
"learning_rate": 3.0212143248735886e-08,
"logits/chosen": -1.6339962482452393,
"logits/rejected": -1.6290324926376343,
"logps/chosen": -50.37676239013672,
"logps/rejected": -53.350425720214844,
"loss": 0.6692,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.052200425416231155,
"rewards/margins": 0.051705501973629,
"rewards/rejected": 0.0004949237918481231,
"step": 1360
},
{
"epoch": 0.9870317002881844,
"grad_norm": 23.29275131225586,
"learning_rate": 2.9904237421258046e-08,
"logits/chosen": -1.6470777988433838,
"logits/rejected": -1.6439279317855835,
"logps/chosen": -50.2126579284668,
"logps/rejected": -54.6882438659668,
"loss": 0.6747,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.042076610028743744,
"rewards/margins": 0.04092058166861534,
"rewards/rejected": 0.0011560240527614951,
"step": 1370
},
{
"epoch": 0.9942363112391931,
"grad_norm": 29.630380630493164,
"learning_rate": 2.9595555917269997e-08,
"logits/chosen": -1.5971949100494385,
"logits/rejected": -1.5871821641921997,
"logps/chosen": -53.988670349121094,
"logps/rejected": -55.97953414916992,
"loss": 0.6795,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.04265677556395531,
"rewards/margins": 0.030483752489089966,
"rewards/rejected": 0.012173019349575043,
"step": 1380
},
{
"epoch": 1.0014409221902016,
"grad_norm": 29.612489700317383,
"learning_rate": 2.928614755924327e-08,
"logits/chosen": -1.7032960653305054,
"logits/rejected": -1.698068618774414,
"logps/chosen": -50.45367431640625,
"logps/rejected": -53.488685607910156,
"loss": 0.6765,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0573929063975811,
"rewards/margins": 0.039073482155799866,
"rewards/rejected": 0.018319427967071533,
"step": 1390
},
{
"epoch": 1.0086455331412103,
"grad_norm": 20.9265193939209,
"learning_rate": 2.8976061284611908e-08,
"logits/chosen": -1.5756412744522095,
"logits/rejected": -1.564943552017212,
"logps/chosen": -54.020263671875,
"logps/rejected": -56.86461639404297,
"loss": 0.6636,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05064217373728752,
"rewards/margins": 0.06473135948181152,
"rewards/rejected": -0.01408919133245945,
"step": 1400
},
{
"epoch": 1.015850144092219,
"grad_norm": 24.08535385131836,
"learning_rate": 2.8665346138032327e-08,
"logits/chosen": -1.6361067295074463,
"logits/rejected": -1.6422010660171509,
"logps/chosen": -49.97898483276367,
"logps/rejected": -54.96118927001953,
"loss": 0.6666,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.05061709135770798,
"rewards/margins": 0.05784117057919502,
"rewards/rejected": -0.007224074099212885,
"step": 1410
},
{
"epoch": 1.0230547550432276,
"grad_norm": 28.920146942138672,
"learning_rate": 2.8354051263626227e-08,
"logits/chosen": -1.6691162586212158,
"logits/rejected": -1.6595113277435303,
"logps/chosen": -56.175636291503906,
"logps/rejected": -59.953834533691406,
"loss": 0.6682,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.04565143585205078,
"rewards/margins": 0.056333862245082855,
"rewards/rejected": -0.010682420805096626,
"step": 1420
},
{
"epoch": 1.0302593659942363,
"grad_norm": 28.18346405029297,
"learning_rate": 2.8042225897207648e-08,
"logits/chosen": -1.71315598487854,
"logits/rejected": -1.7045456171035767,
"logps/chosen": -44.08661651611328,
"logps/rejected": -46.05047607421875,
"loss": 0.6732,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.044674523174762726,
"rewards/margins": 0.045459240674972534,
"rewards/rejected": -0.0007847134256735444,
"step": 1430
},
{
"epoch": 1.037463976945245,
"grad_norm": 29.775287628173828,
"learning_rate": 2.7729919358495728e-08,
"logits/chosen": -1.654279351234436,
"logits/rejected": -1.640808343887329,
"logps/chosen": -64.65403747558594,
"logps/rejected": -64.36153411865234,
"loss": 0.6752,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.039084166288375854,
"rewards/margins": 0.040439385920763016,
"rewards/rejected": -0.0013552254531532526,
"step": 1440
},
{
"epoch": 1.0446685878962536,
"grad_norm": 23.282686233520508,
"learning_rate": 2.741718104331393e-08,
"logits/chosen": -1.750061273574829,
"logits/rejected": -1.7619024515151978,
"logps/chosen": -48.79555130004883,
"logps/rejected": -56.19548416137695,
"loss": 0.6678,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.03792846202850342,
"rewards/margins": 0.057776786386966705,
"rewards/rejected": -0.01984831690788269,
"step": 1450
},
{
"epoch": 1.0518731988472623,
"grad_norm": 28.902856826782227,
"learning_rate": 2.710406041577751e-08,
"logits/chosen": -1.6216195821762085,
"logits/rejected": -1.6019713878631592,
"logps/chosen": -53.21555709838867,
"logps/rejected": -58.60173797607422,
"loss": 0.6597,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.057058185338974,
"rewards/margins": 0.07414309680461884,
"rewards/rejected": -0.017084909602999687,
"step": 1460
},
{
"epoch": 1.059077809798271,
"grad_norm": 27.661142349243164,
"learning_rate": 2.679060700046994e-08,
"logits/chosen": -1.7108032703399658,
"logits/rejected": -1.69040846824646,
"logps/chosen": -48.76203536987305,
"logps/rejected": -52.77034378051758,
"loss": 0.6657,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.05403587222099304,
"rewards/margins": 0.06079152971506119,
"rewards/rejected": -0.006755647249519825,
"step": 1470
},
{
"epoch": 1.0662824207492796,
"grad_norm": 24.555545806884766,
"learning_rate": 2.647687037460996e-08,
"logits/chosen": -1.650090217590332,
"logits/rejected": -1.6332178115844727,
"logps/chosen": -53.99782180786133,
"logps/rejected": -56.94426727294922,
"loss": 0.6725,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05287463590502739,
"rewards/margins": 0.04700274020433426,
"rewards/rejected": 0.005871894769370556,
"step": 1480
},
{
"epoch": 1.0734870317002883,
"grad_norm": 35.28300857543945,
"learning_rate": 2.616290016021016e-08,
"logits/chosen": -1.62253737449646,
"logits/rejected": -1.610269546508789,
"logps/chosen": -61.49285888671875,
"logps/rejected": -63.3193244934082,
"loss": 0.6735,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.045845337212085724,
"rewards/margins": 0.04515889286994934,
"rewards/rejected": 0.0006864480674266815,
"step": 1490
},
{
"epoch": 1.080691642651297,
"grad_norm": 26.37901496887207,
"learning_rate": 2.584874601622854e-08,
"logits/chosen": -1.6861085891723633,
"logits/rejected": -1.6631263494491577,
"logps/chosen": -51.32836151123047,
"logps/rejected": -54.0167236328125,
"loss": 0.6717,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05208291485905647,
"rewards/margins": 0.04669463634490967,
"rewards/rejected": 0.0053882719948887825,
"step": 1500
},
{
"epoch": 1.0878962536023056,
"grad_norm": 29.798765182495117,
"learning_rate": 2.5534457630714267e-08,
"logits/chosen": -1.6969906091690063,
"logits/rejected": -1.6932684183120728,
"logps/chosen": -49.14203643798828,
"logps/rejected": -53.022621154785156,
"loss": 0.6649,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05330803245306015,
"rewards/margins": 0.06402654945850372,
"rewards/rejected": -0.010718528181314468,
"step": 1510
},
{
"epoch": 1.0951008645533142,
"grad_norm": 29.122047424316406,
"learning_rate": 2.5220084712948764e-08,
"logits/chosen": -1.6610676050186157,
"logits/rejected": -1.6523653268814087,
"logps/chosen": -60.46284103393555,
"logps/rejected": -59.5699577331543,
"loss": 0.6877,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.031362272799015045,
"rewards/margins": 0.015023702755570412,
"rewards/rejected": 0.016338571906089783,
"step": 1520
},
{
"epoch": 1.1023054755043227,
"grad_norm": 29.082090377807617,
"learning_rate": 2.490567698558343e-08,
"logits/chosen": -1.706171989440918,
"logits/rejected": -1.6938079595565796,
"logps/chosen": -54.25682830810547,
"logps/rejected": -58.964813232421875,
"loss": 0.6691,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.05163877084851265,
"rewards/margins": 0.053050000220537186,
"rewards/rejected": -0.0014112277422100306,
"step": 1530
},
{
"epoch": 1.1095100864553313,
"grad_norm": 28.59368324279785,
"learning_rate": 2.4591284176775326e-08,
"logits/chosen": -1.7147783041000366,
"logits/rejected": -1.702530860900879,
"logps/chosen": -52.58336639404297,
"logps/rejected": -56.28779983520508,
"loss": 0.6688,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.05651012808084488,
"rewards/margins": 0.053883958607912064,
"rewards/rejected": 0.002626165049150586,
"step": 1540
},
{
"epoch": 1.11671469740634,
"grad_norm": 23.492542266845703,
"learning_rate": 2.4276956012321926e-08,
"logits/chosen": -1.6896263360977173,
"logits/rejected": -1.6793534755706787,
"logps/chosen": -53.504417419433594,
"logps/rejected": -55.97063446044922,
"loss": 0.6709,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04645276069641113,
"rewards/margins": 0.049985408782958984,
"rewards/rejected": -0.0035326494835317135,
"step": 1550
},
{
"epoch": 1.1239193083573487,
"grad_norm": 32.83745574951172,
"learning_rate": 2.3962742207796268e-08,
"logits/chosen": -1.727237343788147,
"logits/rejected": -1.7086503505706787,
"logps/chosen": -54.65483856201172,
"logps/rejected": -58.435203552246094,
"loss": 0.6647,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.05584675073623657,
"rewards/margins": 0.06230972334742546,
"rewards/rejected": -0.0064629726111888885,
"step": 1560
},
{
"epoch": 1.1311239193083573,
"grad_norm": 21.926939010620117,
"learning_rate": 2.364869246068368e-08,
"logits/chosen": -1.7023918628692627,
"logits/rejected": -1.6883018016815186,
"logps/chosen": -56.33196258544922,
"logps/rejected": -59.65509796142578,
"loss": 0.6738,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.03886229544878006,
"rewards/margins": 0.04508962482213974,
"rewards/rejected": -0.006227326579391956,
"step": 1570
},
{
"epoch": 1.138328530259366,
"grad_norm": 33.17292022705078,
"learning_rate": 2.3334856442521435e-08,
"logits/chosen": -1.6609952449798584,
"logits/rejected": -1.6502721309661865,
"logps/chosen": -56.61347198486328,
"logps/rejected": -60.70231246948242,
"loss": 0.6603,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.05509304255247116,
"rewards/margins": 0.07247869670391083,
"rewards/rejected": -0.017385641112923622,
"step": 1580
},
{
"epoch": 1.1455331412103746,
"grad_norm": 34.729270935058594,
"learning_rate": 2.3021283791042474e-08,
"logits/chosen": -1.6414772272109985,
"logits/rejected": -1.629290223121643,
"logps/chosen": -52.38581085205078,
"logps/rejected": -57.90489959716797,
"loss": 0.6635,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": 0.04822907969355583,
"rewards/margins": 0.06632965803146362,
"rewards/rejected": -0.018100585788488388,
"step": 1590
},
{
"epoch": 1.1527377521613833,
"grad_norm": 29.043781280517578,
"learning_rate": 2.2708024102324454e-08,
"logits/chosen": -1.689874291419983,
"logits/rejected": -1.6721159219741821,
"logps/chosen": -56.054420471191406,
"logps/rejected": -58.819847106933594,
"loss": 0.6717,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.05031623691320419,
"rewards/margins": 0.04818098247051239,
"rewards/rejected": 0.002135257935151458,
"step": 1600
},
{
"epoch": 1.159942363112392,
"grad_norm": 29.580005645751953,
"learning_rate": 2.23951269229454e-08,
"logits/chosen": -1.613242745399475,
"logits/rejected": -1.6088173389434814,
"logps/chosen": -53.486328125,
"logps/rejected": -56.55217742919922,
"loss": 0.6679,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.04551651328802109,
"rewards/margins": 0.05793655663728714,
"rewards/rejected": -0.012420037761330605,
"step": 1610
},
{
"epoch": 1.1671469740634006,
"grad_norm": 31.668487548828125,
"learning_rate": 2.2082641742147238e-08,
"logits/chosen": -1.7278293371200562,
"logits/rejected": -1.7244508266448975,
"logps/chosen": -56.515350341796875,
"logps/rejected": -61.60710906982422,
"loss": 0.6682,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.0525764599442482,
"rewards/margins": 0.054812002927064896,
"rewards/rejected": -0.002235544379800558,
"step": 1620
},
{
"epoch": 1.1743515850144093,
"grad_norm": 39.19826889038086,
"learning_rate": 2.177061798400832e-08,
"logits/chosen": -1.5428308248519897,
"logits/rejected": -1.5289558172225952,
"logps/chosen": -58.390045166015625,
"logps/rejected": -58.40192794799805,
"loss": 0.6704,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.04328244924545288,
"rewards/margins": 0.050250399857759476,
"rewards/rejected": -0.006967948284000158,
"step": 1630
},
{
"epoch": 1.181556195965418,
"grad_norm": 26.75935935974121,
"learning_rate": 2.145910499962628e-08,
"logits/chosen": -1.6505706310272217,
"logits/rejected": -1.6422561407089233,
"logps/chosen": -52.12128448486328,
"logps/rejected": -53.86430740356445,
"loss": 0.666,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.057096611708402634,
"rewards/margins": 0.06089169532060623,
"rewards/rejected": -0.0037950840778648853,
"step": 1640
},
{
"epoch": 1.1887608069164266,
"grad_norm": 26.792505264282227,
"learning_rate": 2.1148152059312437e-08,
"logits/chosen": -1.6734596490859985,
"logits/rejected": -1.663163185119629,
"logps/chosen": -46.515968322753906,
"logps/rejected": -47.968421936035156,
"loss": 0.6765,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.039860546588897705,
"rewards/margins": 0.039603374898433685,
"rewards/rejected": 0.000257173553109169,
"step": 1650
},
{
"epoch": 1.195965417867435,
"grad_norm": 25.975000381469727,
"learning_rate": 2.0837808344799028e-08,
"logits/chosen": -1.5417954921722412,
"logits/rejected": -1.5429438352584839,
"logps/chosen": -52.71075439453125,
"logps/rejected": -55.24102783203125,
"loss": 0.668,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04647735133767128,
"rewards/margins": 0.057897698134183884,
"rewards/rejected": -0.01142034586519003,
"step": 1660
},
{
"epoch": 1.2031700288184437,
"grad_norm": 31.299114227294922,
"learning_rate": 2.052812294146033e-08,
"logits/chosen": -1.680318832397461,
"logits/rejected": -1.673017144203186,
"logps/chosen": -52.15236282348633,
"logps/rejected": -56.82316970825195,
"loss": 0.6659,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05961848050355911,
"rewards/margins": 0.062046296894550323,
"rewards/rejected": -0.002427825704216957,
"step": 1670
},
{
"epoch": 1.2103746397694524,
"grad_norm": 31.15201759338379,
"learning_rate": 2.0219144830549163e-08,
"logits/chosen": -1.5866892337799072,
"logits/rejected": -1.5717523097991943,
"logps/chosen": -52.0449333190918,
"logps/rejected": -57.047821044921875,
"loss": 0.6621,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.05372787266969681,
"rewards/margins": 0.06935948133468628,
"rewards/rejected": -0.015631603077054024,
"step": 1680
},
{
"epoch": 1.217579250720461,
"grad_norm": 25.879758834838867,
"learning_rate": 1.9910922881449716e-08,
"logits/chosen": -1.65450918674469,
"logits/rejected": -1.646868109703064,
"logps/chosen": -57.28217697143555,
"logps/rejected": -59.71793746948242,
"loss": 0.6631,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06054146960377693,
"rewards/margins": 0.06607099622488022,
"rewards/rejected": -0.005529527552425861,
"step": 1690
},
{
"epoch": 1.2247838616714697,
"grad_norm": 27.07415771484375,
"learning_rate": 1.9603505843948214e-08,
"logits/chosen": -1.6178529262542725,
"logits/rejected": -1.6128696203231812,
"logps/chosen": -56.807891845703125,
"logps/rejected": -60.698814392089844,
"loss": 0.6603,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.06525342166423798,
"rewards/margins": 0.0739850401878357,
"rewards/rejected": -0.008731614798307419,
"step": 1700
},
{
"epoch": 1.2319884726224783,
"grad_norm": 31.243019104003906,
"learning_rate": 1.929694234052239e-08,
"logits/chosen": -1.7153377532958984,
"logits/rejected": -1.6968237161636353,
"logps/chosen": -48.6357421875,
"logps/rejected": -53.58130645751953,
"loss": 0.6613,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.059562791138887405,
"rewards/margins": 0.0704553872346878,
"rewards/rejected": -0.010892586782574654,
"step": 1710
},
{
"epoch": 1.239193083573487,
"grad_norm": 21.567594528198242,
"learning_rate": 1.8991280858651157e-08,
"logits/chosen": -1.6218206882476807,
"logits/rejected": -1.6051326990127563,
"logps/chosen": -50.012847900390625,
"logps/rejected": -52.8299674987793,
"loss": 0.6674,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.04892825335264206,
"rewards/margins": 0.06088585779070854,
"rewards/rejected": -0.011957600712776184,
"step": 1720
},
{
"epoch": 1.2463976945244957,
"grad_norm": 32.33564758300781,
"learning_rate": 1.868656974314557e-08,
"logits/chosen": -1.6614080667495728,
"logits/rejected": -1.6554081439971924,
"logps/chosen": -52.624114990234375,
"logps/rejected": -55.892662048339844,
"loss": 0.6637,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.04592607915401459,
"rewards/margins": 0.06650461256504059,
"rewards/rejected": -0.020578527823090553,
"step": 1730
},
{
"epoch": 1.2536023054755043,
"grad_norm": 23.595190048217773,
"learning_rate": 1.8382857188502422e-08,
"logits/chosen": -1.6442153453826904,
"logits/rejected": -1.6373510360717773,
"logps/chosen": -56.04620361328125,
"logps/rejected": -59.06785202026367,
"loss": 0.6666,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.04996069148182869,
"rewards/margins": 0.06172338128089905,
"rewards/rejected": -0.011762691661715508,
"step": 1740
},
{
"epoch": 1.260806916426513,
"grad_norm": 30.709379196166992,
"learning_rate": 1.8080191231281594e-08,
"logits/chosen": -1.5838955640792847,
"logits/rejected": -1.5548055171966553,
"logps/chosen": -57.59186935424805,
"logps/rejected": -56.7793083190918,
"loss": 0.6561,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06202070042490959,
"rewards/margins": 0.08498839288949966,
"rewards/rejected": -0.022967690601944923,
"step": 1750
},
{
"epoch": 1.2680115273775217,
"grad_norm": 27.285667419433594,
"learning_rate": 1.7778619742508345e-08,
"logits/chosen": -1.6928040981292725,
"logits/rejected": -1.6915152072906494,
"logps/chosen": -58.45740509033203,
"logps/rejected": -61.478302001953125,
"loss": 0.6655,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.03818890452384949,
"rewards/margins": 0.0651729553937912,
"rewards/rejected": -0.026984045282006264,
"step": 1760
},
{
"epoch": 1.2752161383285303,
"grad_norm": 23.404830932617188,
"learning_rate": 1.7478190420101796e-08,
"logits/chosen": -1.7029117345809937,
"logits/rejected": -1.6898205280303955,
"logps/chosen": -50.935115814208984,
"logps/rejected": -55.13124465942383,
"loss": 0.6696,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.0503745898604393,
"rewards/margins": 0.054055869579315186,
"rewards/rejected": -0.0036812766920775175,
"step": 1770
},
{
"epoch": 1.282420749279539,
"grad_norm": 35.39152145385742,
"learning_rate": 1.717895078133088e-08,
"logits/chosen": -1.6763395071029663,
"logits/rejected": -1.6591565608978271,
"logps/chosen": -61.310096740722656,
"logps/rejected": -61.61467742919922,
"loss": 0.6758,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.044988058507442474,
"rewards/margins": 0.03980833292007446,
"rewards/rejected": 0.005179721862077713,
"step": 1780
},
{
"epoch": 1.2896253602305476,
"grad_norm": 24.02739715576172,
"learning_rate": 1.688094815529873e-08,
"logits/chosen": -1.6351617574691772,
"logits/rejected": -1.6387808322906494,
"logps/chosen": -53.970558166503906,
"logps/rejected": -56.1927375793457,
"loss": 0.6825,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.047341324388980865,
"rewards/margins": 0.029565196484327316,
"rewards/rejected": 0.017776133492588997,
"step": 1790
},
{
"epoch": 1.2968299711815563,
"grad_norm": 32.1756706237793,
"learning_rate": 1.658422967545693e-08,
"logits/chosen": -1.718798041343689,
"logits/rejected": -1.7084662914276123,
"logps/chosen": -52.566993713378906,
"logps/rejected": -59.37955856323242,
"loss": 0.6642,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.05607623979449272,
"rewards/margins": 0.06564650684595108,
"rewards/rejected": -0.009570261463522911,
"step": 1800
},
{
"epoch": 1.304034582132565,
"grad_norm": 20.695886611938477,
"learning_rate": 1.6288842272150614e-08,
"logits/chosen": -1.611883521080017,
"logits/rejected": -1.61124587059021,
"logps/chosen": -52.52956008911133,
"logps/rejected": -56.787391662597656,
"loss": 0.6662,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.0512949600815773,
"rewards/margins": 0.05934779718518257,
"rewards/rejected": -0.008052836172282696,
"step": 1810
},
{
"epoch": 1.3112391930835736,
"grad_norm": 31.054288864135742,
"learning_rate": 1.5994832665195853e-08,
"logits/chosen": -1.7016429901123047,
"logits/rejected": -1.691511869430542,
"logps/chosen": -46.15959930419922,
"logps/rejected": -51.521705627441406,
"loss": 0.6549,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06147737428545952,
"rewards/margins": 0.08640258014202118,
"rewards/rejected": -0.024925213307142258,
"step": 1820
},
{
"epoch": 1.318443804034582,
"grad_norm": 22.64872932434082,
"learning_rate": 1.5702247356490134e-08,
"logits/chosen": -1.6389744281768799,
"logits/rejected": -1.639120101928711,
"logps/chosen": -50.16230773925781,
"logps/rejected": -53.1915283203125,
"loss": 0.6641,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.04536745697259903,
"rewards/margins": 0.06614157557487488,
"rewards/rejected": -0.02077411487698555,
"step": 1830
},
{
"epoch": 1.3256484149855907,
"grad_norm": 20.916744232177734,
"learning_rate": 1.541113262265748e-08,
"logits/chosen": -1.6537967920303345,
"logits/rejected": -1.6307786703109741,
"logps/chosen": -54.82042694091797,
"logps/rejected": -56.67564010620117,
"loss": 0.6661,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.04908507689833641,
"rewards/margins": 0.0638277605175972,
"rewards/rejected": -0.014742677100002766,
"step": 1840
},
{
"epoch": 1.3328530259365994,
"grad_norm": 28.75771141052246,
"learning_rate": 1.5121534507729073e-08,
"logits/chosen": -1.7000787258148193,
"logits/rejected": -1.6867382526397705,
"logps/chosen": -50.214149475097656,
"logps/rejected": -54.311065673828125,
"loss": 0.6588,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.05089396983385086,
"rewards/margins": 0.07929139584302902,
"rewards/rejected": -0.02839742600917816,
"step": 1850
},
{
"epoch": 1.340057636887608,
"grad_norm": 29.51345443725586,
"learning_rate": 1.4833498815860756e-08,
"logits/chosen": -1.6562092304229736,
"logits/rejected": -1.6471668481826782,
"logps/chosen": -56.003273010253906,
"logps/rejected": -61.8942985534668,
"loss": 0.6588,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05405973643064499,
"rewards/margins": 0.0779876634478569,
"rewards/rejected": -0.023927928879857063,
"step": 1860
},
{
"epoch": 1.3472622478386167,
"grad_norm": 24.563007354736328,
"learning_rate": 1.4547071104088443e-08,
"logits/chosen": -1.6281534433364868,
"logits/rejected": -1.60134756565094,
"logps/chosen": -48.80409622192383,
"logps/rejected": -52.90191650390625,
"loss": 0.6614,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.05820552632212639,
"rewards/margins": 0.07069076597690582,
"rewards/rejected": -0.01248523872345686,
"step": 1870
},
{
"epoch": 1.3544668587896254,
"grad_norm": 28.101259231567383,
"learning_rate": 1.4262296675122592e-08,
"logits/chosen": -1.6599979400634766,
"logits/rejected": -1.6438566446304321,
"logps/chosen": -55.794822692871094,
"logps/rejected": -57.25176239013672,
"loss": 0.6673,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0343967080116272,
"rewards/margins": 0.060233019292354584,
"rewards/rejected": -0.025836322456598282,
"step": 1880
},
{
"epoch": 1.361671469740634,
"grad_norm": 28.072351455688477,
"learning_rate": 1.3979220570182902e-08,
"logits/chosen": -1.59923255443573,
"logits/rejected": -1.5962598323822021,
"logps/chosen": -52.601112365722656,
"logps/rejected": -57.248451232910156,
"loss": 0.6646,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.045020636171102524,
"rewards/margins": 0.06425632536411285,
"rewards/rejected": -0.019235694780945778,
"step": 1890
},
{
"epoch": 1.3688760806916427,
"grad_norm": 20.26775360107422,
"learning_rate": 1.369788756187445e-08,
"logits/chosen": -1.6709423065185547,
"logits/rejected": -1.6719890832901,
"logps/chosen": -51.86114501953125,
"logps/rejected": -55.500762939453125,
"loss": 0.6668,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05930880457162857,
"rewards/margins": 0.060519617050886154,
"rewards/rejected": -0.0012108208611607552,
"step": 1900
},
{
"epoch": 1.3760806916426513,
"grad_norm": 26.241369247436523,
"learning_rate": 1.3418342147106212e-08,
"logits/chosen": -1.7067817449569702,
"logits/rejected": -1.706256628036499,
"logps/chosen": -52.70702362060547,
"logps/rejected": -58.025123596191406,
"loss": 0.6565,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.05462120845913887,
"rewards/margins": 0.08217908442020416,
"rewards/rejected": -0.027557870373129845,
"step": 1910
},
{
"epoch": 1.38328530259366,
"grad_norm": 25.819034576416016,
"learning_rate": 1.3140628540053218e-08,
"logits/chosen": -1.726575255393982,
"logits/rejected": -1.7202606201171875,
"logps/chosen": -43.758033752441406,
"logps/rejected": -49.98960494995117,
"loss": 0.6633,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.05085862800478935,
"rewards/margins": 0.06844624131917953,
"rewards/rejected": -0.01758761703968048,
"step": 1920
},
{
"epoch": 1.3904899135446687,
"grad_norm": 36.01348114013672,
"learning_rate": 1.286479066516345e-08,
"logits/chosen": -1.5930241346359253,
"logits/rejected": -1.5837476253509521,
"logps/chosen": -59.70249557495117,
"logps/rejected": -61.39208984375,
"loss": 0.6697,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.059021733701229095,
"rewards/margins": 0.05476250499486923,
"rewards/rejected": 0.004259222187101841,
"step": 1930
},
{
"epoch": 1.397694524495677,
"grad_norm": 24.216604232788086,
"learning_rate": 1.2590872150210574e-08,
"logits/chosen": -1.7261098623275757,
"logits/rejected": -1.724927544593811,
"logps/chosen": -49.624183654785156,
"logps/rejected": -52.33625030517578,
"loss": 0.6671,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.05638096481561661,
"rewards/margins": 0.06055796891450882,
"rewards/rejected": -0.004177004564553499,
"step": 1940
},
{
"epoch": 1.4048991354466858,
"grad_norm": 28.147171020507812,
"learning_rate": 1.2318916319393555e-08,
"logits/chosen": -1.6773601770401,
"logits/rejected": -1.669012427330017,
"logps/chosen": -50.91980743408203,
"logps/rejected": -54.6119499206543,
"loss": 0.6592,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.056795258074998856,
"rewards/margins": 0.07905852049589157,
"rewards/rejected": -0.022263258695602417,
"step": 1950
},
{
"epoch": 1.4121037463976944,
"grad_norm": 30.398366928100586,
"learning_rate": 1.2048966186484282e-08,
"logits/chosen": -1.6361877918243408,
"logits/rejected": -1.6289517879486084,
"logps/chosen": -52.20341873168945,
"logps/rejected": -57.09885787963867,
"loss": 0.6561,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.060558564960956573,
"rewards/margins": 0.08404217660427094,
"rewards/rejected": -0.023483622819185257,
"step": 1960
},
{
"epoch": 1.419308357348703,
"grad_norm": 32.82428741455078,
"learning_rate": 1.1781064448024333e-08,
"logits/chosen": -1.6930503845214844,
"logits/rejected": -1.6847530603408813,
"logps/chosen": -46.96501159667969,
"logps/rejected": -51.72229766845703,
"loss": 0.6515,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.060513317584991455,
"rewards/margins": 0.09518440812826157,
"rewards/rejected": -0.03467109426856041,
"step": 1970
},
{
"epoch": 1.4265129682997117,
"grad_norm": 28.581178665161133,
"learning_rate": 1.1515253476571923e-08,
"logits/chosen": -1.656628966331482,
"logits/rejected": -1.655122995376587,
"logps/chosen": -51.16161346435547,
"logps/rejected": -56.41352081298828,
"loss": 0.6693,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.04029519110918045,
"rewards/margins": 0.05936024710536003,
"rewards/rejected": -0.019065069034695625,
"step": 1980
},
{
"epoch": 1.4337175792507204,
"grad_norm": 32.20552062988281,
"learning_rate": 1.1251575314000034e-08,
"logits/chosen": -1.6694313287734985,
"logits/rejected": -1.6587368249893188,
"logps/chosen": -49.67891311645508,
"logps/rejected": -52.66162872314453,
"loss": 0.6647,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.060387950390577316,
"rewards/margins": 0.06579816341400146,
"rewards/rejected": -0.00541021628305316,
"step": 1990
},
{
"epoch": 1.440922190201729,
"grad_norm": 19.01755142211914,
"learning_rate": 1.0990071664846861e-08,
"logits/chosen": -1.7050960063934326,
"logits/rejected": -1.6924508810043335,
"logps/chosen": -48.6504020690918,
"logps/rejected": -55.10358810424805,
"loss": 0.6532,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.05204867571592331,
"rewards/margins": 0.09019048511981964,
"rewards/rejected": -0.03814180940389633,
"step": 2000
},
{
"epoch": 1.4481268011527377,
"grad_norm": 29.312150955200195,
"learning_rate": 1.0730783889719711e-08,
"logits/chosen": -1.6206512451171875,
"logits/rejected": -1.6064426898956299,
"logps/chosen": -50.724178314208984,
"logps/rejected": -54.427764892578125,
"loss": 0.6648,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04072165489196777,
"rewards/margins": 0.06882860511541367,
"rewards/rejected": -0.02810695208609104,
"step": 2010
},
{
"epoch": 1.4553314121037464,
"grad_norm": 36.57780456542969,
"learning_rate": 1.0473752998753114e-08,
"logits/chosen": -1.7069648504257202,
"logits/rejected": -1.7023489475250244,
"logps/chosen": -52.97856903076172,
"logps/rejected": -55.6297607421875,
"loss": 0.6662,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.0466485396027565,
"rewards/margins": 0.06170258671045303,
"rewards/rejected": -0.015054039657115936,
"step": 2020
},
{
"epoch": 1.462536023054755,
"grad_norm": 31.10540008544922,
"learning_rate": 1.0219019645122575e-08,
"logits/chosen": -1.72427237033844,
"logits/rejected": -1.7157777547836304,
"logps/chosen": -52.942588806152344,
"logps/rejected": -57.13978958129883,
"loss": 0.6698,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.052157748490571976,
"rewards/margins": 0.05456269904971123,
"rewards/rejected": -0.0024049447383731604,
"step": 2030
},
{
"epoch": 1.4697406340057637,
"grad_norm": 35.095802307128906,
"learning_rate": 9.966624118614611e-09,
"logits/chosen": -1.6719284057617188,
"logits/rejected": -1.6706676483154297,
"logps/chosen": -57.96167755126953,
"logps/rejected": -62.491973876953125,
"loss": 0.6654,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.051690973341464996,
"rewards/margins": 0.0684889405965805,
"rewards/rejected": -0.016797970980405807,
"step": 2040
},
{
"epoch": 1.4769452449567724,
"grad_norm": 28.057044982910156,
"learning_rate": 9.71660633925438e-09,
"logits/chosen": -1.6711444854736328,
"logits/rejected": -1.6477901935577393,
"logps/chosen": -57.71684646606445,
"logps/rejected": -60.13401412963867,
"loss": 0.6615,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.04812953621149063,
"rewards/margins": 0.07305117696523666,
"rewards/rejected": -0.024921633303165436,
"step": 2050
},
{
"epoch": 1.484149855907781,
"grad_norm": 25.635986328125,
"learning_rate": 9.469005850991705e-09,
"logits/chosen": -1.6958105564117432,
"logits/rejected": -1.673832654953003,
"logps/chosen": -50.827880859375,
"logps/rejected": -51.07634735107422,
"loss": 0.666,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.034776292741298676,
"rewards/margins": 0.06257718801498413,
"rewards/rejected": -0.027800898998975754,
"step": 2060
},
{
"epoch": 1.4913544668587897,
"grad_norm": 36.9734992980957,
"learning_rate": 9.223861815446682e-09,
"logits/chosen": -1.7042324542999268,
"logits/rejected": -1.6777251958847046,
"logps/chosen": -58.7913703918457,
"logps/rejected": -60.8325080871582,
"loss": 0.6662,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.034731216728687286,
"rewards/margins": 0.062271714210510254,
"rewards/rejected": -0.027540501207113266,
"step": 2070
},
{
"epoch": 1.4985590778097984,
"grad_norm": 26.555492401123047,
"learning_rate": 8.981213005715627e-09,
"logits/chosen": -1.5820460319519043,
"logits/rejected": -1.5792698860168457,
"logps/chosen": -55.81233596801758,
"logps/rejected": -60.40593338012695,
"loss": 0.6603,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.05028228089213371,
"rewards/margins": 0.07836754620075226,
"rewards/rejected": -0.02808527648448944,
"step": 2080
},
{
"epoch": 1.505763688760807,
"grad_norm": 24.672306060791016,
"learning_rate": 8.741097800238617e-09,
"logits/chosen": -1.6848942041397095,
"logits/rejected": -1.6710937023162842,
"logps/chosen": -48.45435333251953,
"logps/rejected": -52.852561950683594,
"loss": 0.6598,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04019053652882576,
"rewards/margins": 0.07907537370920181,
"rewards/rejected": -0.03888483718037605,
"step": 2090
},
{
"epoch": 1.5129682997118157,
"grad_norm": 31.16644859313965,
"learning_rate": 8.503554176729341e-09,
"logits/chosen": -1.6204407215118408,
"logits/rejected": -1.6291402578353882,
"logps/chosen": -47.65703582763672,
"logps/rejected": -51.62767791748047,
"loss": 0.6664,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.05907278507947922,
"rewards/margins": 0.06321103870868683,
"rewards/rejected": -0.00413826247677207,
"step": 2100
},
{
"epoch": 1.5201729106628243,
"grad_norm": 26.00917625427246,
"learning_rate": 8.268619706168376e-09,
"logits/chosen": -1.6322330236434937,
"logits/rejected": -1.6150935888290405,
"logps/chosen": -50.94877624511719,
"logps/rejected": -54.5855598449707,
"loss": 0.6603,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.05506844073534012,
"rewards/margins": 0.07977042347192764,
"rewards/rejected": -0.024701988324522972,
"step": 2110
},
{
"epoch": 1.527377521613833,
"grad_norm": 30.194684982299805,
"learning_rate": 8.036331546860777e-09,
"logits/chosen": -1.4862804412841797,
"logits/rejected": -1.4699585437774658,
"logps/chosen": -58.9954948425293,
"logps/rejected": -61.62345504760742,
"loss": 0.6674,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.05533316731452942,
"rewards/margins": 0.06020069867372513,
"rewards/rejected": -0.004867529030889273,
"step": 2120
},
{
"epoch": 1.5345821325648417,
"grad_norm": 29.14430809020996,
"learning_rate": 7.806726438559003e-09,
"logits/chosen": -1.6214964389801025,
"logits/rejected": -1.6216814517974854,
"logps/chosen": -57.3265495300293,
"logps/rejected": -60.05931854248047,
"loss": 0.6725,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04320087283849716,
"rewards/margins": 0.0479980893433094,
"rewards/rejected": -0.004797212313860655,
"step": 2130
},
{
"epoch": 1.54178674351585,
"grad_norm": 43.7021484375,
"learning_rate": 7.579840696651938e-09,
"logits/chosen": -1.6111637353897095,
"logits/rejected": -1.598354697227478,
"logps/chosen": -59.49543380737305,
"logps/rejected": -62.404541015625,
"loss": 0.6614,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.04770822077989578,
"rewards/margins": 0.07293849438428879,
"rewards/rejected": -0.025230273604393005,
"step": 2140
},
{
"epoch": 1.5489913544668588,
"grad_norm": 31.50617218017578,
"learning_rate": 7.355710206421098e-09,
"logits/chosen": -1.5231688022613525,
"logits/rejected": -1.517321228981018,
"logps/chosen": -58.44475555419922,
"logps/rejected": -63.36370849609375,
"loss": 0.6602,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.05046730488538742,
"rewards/margins": 0.07906536757946014,
"rewards/rejected": -0.028598055243492126,
"step": 2150
},
{
"epoch": 1.5561959654178674,
"grad_norm": 27.282373428344727,
"learning_rate": 7.134370417364849e-09,
"logits/chosen": -1.6577104330062866,
"logits/rejected": -1.648329496383667,
"logps/chosen": -59.193382263183594,
"logps/rejected": -60.406333923339844,
"loss": 0.6538,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.05481324344873428,
"rewards/margins": 0.08997657150030136,
"rewards/rejected": -0.035163331776857376,
"step": 2160
},
{
"epoch": 1.563400576368876,
"grad_norm": 35.18039321899414,
"learning_rate": 6.915856337591572e-09,
"logits/chosen": -1.6204932928085327,
"logits/rejected": -1.6136722564697266,
"logps/chosen": -50.05255889892578,
"logps/rejected": -54.053077697753906,
"loss": 0.6627,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.07332804799079895,
"rewards/margins": 0.07051368057727814,
"rewards/rejected": 0.002814366715028882,
"step": 2170
},
{
"epoch": 1.5706051873198847,
"grad_norm": 32.98238754272461,
"learning_rate": 6.700202528282603e-09,
"logits/chosen": -1.6378253698349,
"logits/rejected": -1.6315876245498657,
"logps/chosen": -60.23234176635742,
"logps/rejected": -64.01595306396484,
"loss": 0.6756,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.02953862026333809,
"rewards/margins": 0.04212776944041252,
"rewards/rejected": -0.012589153833687305,
"step": 2180
},
{
"epoch": 1.5778097982708934,
"grad_norm": 35.59558868408203,
"learning_rate": 6.487443098225892e-09,
"logits/chosen": -1.699599027633667,
"logits/rejected": -1.697575569152832,
"logps/chosen": -55.896812438964844,
"logps/rejected": -62.44630813598633,
"loss": 0.6658,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.05821943283081055,
"rewards/margins": 0.06381964683532715,
"rewards/rejected": -0.005600206553936005,
"step": 2190
},
{
"epoch": 1.585014409221902,
"grad_norm": 31.080228805541992,
"learning_rate": 6.277611698421179e-09,
"logits/chosen": -1.6873823404312134,
"logits/rejected": -1.6729217767715454,
"logps/chosen": -55.1784782409668,
"logps/rejected": -58.860107421875,
"loss": 0.6497,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.08172028511762619,
"rewards/margins": 0.09651371091604233,
"rewards/rejected": -0.014793431386351585,
"step": 2200
},
{
"epoch": 1.5922190201729105,
"grad_norm": 28.780492782592773,
"learning_rate": 6.070741516757608e-09,
"logits/chosen": -1.6578247547149658,
"logits/rejected": -1.6496769189834595,
"logps/chosen": -55.101890563964844,
"logps/rejected": -56.972412109375,
"loss": 0.6647,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05884435027837753,
"rewards/margins": 0.06767454743385315,
"rewards/rejected": -0.008830199018120766,
"step": 2210
},
{
"epoch": 1.5994236311239192,
"grad_norm": 31.7725887298584,
"learning_rate": 5.866865272764607e-09,
"logits/chosen": -1.639620065689087,
"logits/rejected": -1.632886290550232,
"logps/chosen": -60.59607696533203,
"logps/rejected": -65.77485656738281,
"loss": 0.6552,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.06030546501278877,
"rewards/margins": 0.088438019156456,
"rewards/rejected": -0.028132546693086624,
"step": 2220
},
{
"epoch": 1.6066282420749278,
"grad_norm": 26.901762008666992,
"learning_rate": 5.666015212436795e-09,
"logits/chosen": -1.6595134735107422,
"logits/rejected": -1.6518280506134033,
"logps/chosen": -55.67797088623047,
"logps/rejected": -59.78864288330078,
"loss": 0.6686,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.04280759021639824,
"rewards/margins": 0.05854750797152519,
"rewards/rejected": -0.015739915892481804,
"step": 2230
},
{
"epoch": 1.6138328530259365,
"grad_norm": 28.554101943969727,
"learning_rate": 5.46822310313379e-09,
"logits/chosen": -1.6626936197280884,
"logits/rejected": -1.6616346836090088,
"logps/chosen": -46.335777282714844,
"logps/rejected": -52.65189743041992,
"loss": 0.6545,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.06703133881092072,
"rewards/margins": 0.09231172502040863,
"rewards/rejected": -0.025280386209487915,
"step": 2240
},
{
"epoch": 1.6210374639769451,
"grad_norm": 29.99852752685547,
"learning_rate": 5.273520228555767e-09,
"logits/chosen": -1.712920904159546,
"logits/rejected": -1.695810317993164,
"logps/chosen": -56.5516242980957,
"logps/rejected": -60.3693962097168,
"loss": 0.6731,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.04221775382757187,
"rewards/margins": 0.051514916121959686,
"rewards/rejected": -0.009297164157032967,
"step": 2250
},
{
"epoch": 1.6282420749279538,
"grad_norm": 27.436594009399414,
"learning_rate": 5.081937383795484e-09,
"logits/chosen": -1.606261968612671,
"logits/rejected": -1.5864953994750977,
"logps/chosen": -50.1980094909668,
"logps/rejected": -53.731651306152344,
"loss": 0.6628,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.061611782759428024,
"rewards/margins": 0.06988058984279633,
"rewards/rejected": -0.008268805220723152,
"step": 2260
},
{
"epoch": 1.6354466858789625,
"grad_norm": 33.33866882324219,
"learning_rate": 4.893504870467588e-09,
"logits/chosen": -1.676597237586975,
"logits/rejected": -1.6628268957138062,
"logps/chosen": -52.58159637451172,
"logps/rejected": -55.40885543823242,
"loss": 0.6674,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.04663368687033653,
"rewards/margins": 0.06050438806414604,
"rewards/rejected": -0.013870703987777233,
"step": 2270
},
{
"epoch": 1.6426512968299711,
"grad_norm": 28.910106658935547,
"learning_rate": 4.708252491915951e-09,
"logits/chosen": -1.6143203973770142,
"logits/rejected": -1.6018993854522705,
"logps/chosen": -59.179710388183594,
"logps/rejected": -62.8307991027832,
"loss": 0.6755,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.043866075575351715,
"rewards/margins": 0.04365686699748039,
"rewards/rejected": 0.00020921006216667593,
"step": 2280
},
{
"epoch": 1.6498559077809798,
"grad_norm": 36.226402282714844,
"learning_rate": 4.526209548499877e-09,
"logits/chosen": -1.6284698247909546,
"logits/rejected": -1.6161683797836304,
"logps/chosen": -56.088829040527344,
"logps/rejected": -55.797828674316406,
"loss": 0.6607,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.04964672029018402,
"rewards/margins": 0.07595410943031311,
"rewards/rejected": -0.026307392865419388,
"step": 2290
},
{
"epoch": 1.6570605187319885,
"grad_norm": 27.529035568237305,
"learning_rate": 4.347404832959775e-09,
"logits/chosen": -1.6585400104522705,
"logits/rejected": -1.6393734216690063,
"logps/chosen": -49.68330764770508,
"logps/rejected": -51.410423278808594,
"loss": 0.6552,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.062341026961803436,
"rewards/margins": 0.09110082685947418,
"rewards/rejected": -0.02875981293618679,
"step": 2300
},
{
"epoch": 1.6642651296829971,
"grad_norm": 36.036582946777344,
"learning_rate": 4.171866625863229e-09,
"logits/chosen": -1.6054232120513916,
"logits/rejected": -1.591740608215332,
"logps/chosen": -62.186973571777344,
"logps/rejected": -63.91868209838867,
"loss": 0.6753,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.06225902959704399,
"rewards/margins": 0.044443391263484955,
"rewards/rejected": 0.01781563274562359,
"step": 2310
},
{
"epoch": 1.6714697406340058,
"grad_norm": 24.637149810791016,
"learning_rate": 3.9996226911319546e-09,
"logits/chosen": -1.6677039861679077,
"logits/rejected": -1.6617395877838135,
"logps/chosen": -50.32910919189453,
"logps/rejected": -52.85070037841797,
"loss": 0.6627,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.048066943883895874,
"rewards/margins": 0.07331161946058273,
"rewards/rejected": -0.02524467371404171,
"step": 2320
},
{
"epoch": 1.6786743515850144,
"grad_norm": 28.134784698486328,
"learning_rate": 3.830700271650567e-09,
"logits/chosen": -1.7151075601577759,
"logits/rejected": -1.70746648311615,
"logps/chosen": -49.42131042480469,
"logps/rejected": -54.725257873535156,
"loss": 0.6551,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04775792732834816,
"rewards/margins": 0.09152387082576752,
"rewards/rejected": -0.04376594349741936,
"step": 2330
},
{
"epoch": 1.685878962536023,
"grad_norm": 31.970741271972656,
"learning_rate": 3.665126084957723e-09,
"logits/chosen": -1.6030555963516235,
"logits/rejected": -1.5942625999450684,
"logps/chosen": -47.702667236328125,
"logps/rejected": -51.36510467529297,
"loss": 0.6517,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.06474478542804718,
"rewards/margins": 0.09533517807722092,
"rewards/rejected": -0.030590396374464035,
"step": 2340
},
{
"epoch": 1.6930835734870318,
"grad_norm": 30.708681106567383,
"learning_rate": 3.502926319020327e-09,
"logits/chosen": -1.5764755010604858,
"logits/rejected": -1.5539076328277588,
"logps/chosen": -60.64631271362305,
"logps/rejected": -61.80460739135742,
"loss": 0.6634,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.047983959317207336,
"rewards/margins": 0.07177029550075531,
"rewards/rejected": -0.023786336183547974,
"step": 2350
},
{
"epoch": 1.7002881844380404,
"grad_norm": 29.933921813964844,
"learning_rate": 3.3441266280915427e-09,
"logits/chosen": -1.6186996698379517,
"logits/rejected": -1.5993722677230835,
"logps/chosen": -49.01648712158203,
"logps/rejected": -54.099952697753906,
"loss": 0.6526,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.055793993175029755,
"rewards/margins": 0.09487829357385635,
"rewards/rejected": -0.0390842966735363,
"step": 2360
},
{
"epoch": 1.707492795389049,
"grad_norm": 32.35788345336914,
"learning_rate": 3.1887521286532023e-09,
"logits/chosen": -1.7109838724136353,
"logits/rejected": -1.6939365863800049,
"logps/chosen": -57.83002853393555,
"logps/rejected": -58.35017013549805,
"loss": 0.6684,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.055140089243650436,
"rewards/margins": 0.055782973766326904,
"rewards/rejected": -0.0006428823107853532,
"step": 2370
},
{
"epoch": 1.7146974063400577,
"grad_norm": 35.61783981323242,
"learning_rate": 3.0368273954432698e-09,
"logits/chosen": -1.7026067972183228,
"logits/rejected": -1.6995939016342163,
"logps/chosen": -57.07318878173828,
"logps/rejected": -60.9394416809082,
"loss": 0.6619,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.05400969833135605,
"rewards/margins": 0.07402704656124115,
"rewards/rejected": -0.0200173519551754,
"step": 2380
},
{
"epoch": 1.7219020172910664,
"grad_norm": 25.527734756469727,
"learning_rate": 2.888376457568964e-09,
"logits/chosen": -1.7665998935699463,
"logits/rejected": -1.752995491027832,
"logps/chosen": -52.87370681762695,
"logps/rejected": -56.10619354248047,
"loss": 0.6711,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.0432610847055912,
"rewards/margins": 0.053575366735458374,
"rewards/rejected": -0.01031428575515747,
"step": 2390
},
{
"epoch": 1.729106628242075,
"grad_norm": 25.41728973388672,
"learning_rate": 2.7434227947062324e-09,
"logits/chosen": -1.6827430725097656,
"logits/rejected": -1.6787803173065186,
"logps/chosen": -53.131996154785156,
"logps/rejected": -58.491615295410156,
"loss": 0.6501,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.06697873771190643,
"rewards/margins": 0.0972275361418724,
"rewards/rejected": -0.030248800292611122,
"step": 2400
},
{
"epoch": 1.7363112391930837,
"grad_norm": 31.781221389770508,
"learning_rate": 2.6019893333860954e-09,
"logits/chosen": -1.6549618244171143,
"logits/rejected": -1.6534698009490967,
"logps/chosen": -55.04658126831055,
"logps/rejected": -59.05900192260742,
"loss": 0.6644,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.043295614421367645,
"rewards/margins": 0.06643761694431305,
"rewards/rejected": -0.02314199134707451,
"step": 2410
},
{
"epoch": 1.7435158501440924,
"grad_norm": 25.171480178833008,
"learning_rate": 2.4640984433684758e-09,
"logits/chosen": -1.639723539352417,
"logits/rejected": -1.6242148876190186,
"logps/chosen": -50.12556457519531,
"logps/rejected": -52.221473693847656,
"loss": 0.6612,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.041132062673568726,
"rewards/margins": 0.07752031087875366,
"rewards/rejected": -0.03638824075460434,
"step": 2420
},
{
"epoch": 1.7507204610951008,
"grad_norm": 29.74759292602539,
"learning_rate": 2.3297719341040856e-09,
"logits/chosen": -1.6303246021270752,
"logits/rejected": -1.6217195987701416,
"logps/chosen": -52.71974563598633,
"logps/rejected": -58.83891677856445,
"loss": 0.6638,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.039412498474121094,
"rewards/margins": 0.07280907779932022,
"rewards/rejected": -0.033396583050489426,
"step": 2430
},
{
"epoch": 1.7579250720461095,
"grad_norm": 28.20793342590332,
"learning_rate": 2.199031051284972e-09,
"logits/chosen": -1.699127435684204,
"logits/rejected": -1.6883512735366821,
"logps/chosen": -51.349586486816406,
"logps/rejected": -54.69416427612305,
"loss": 0.6645,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.052664875984191895,
"rewards/margins": 0.0667290985584259,
"rewards/rejected": -0.014064219780266285,
"step": 2440
},
{
"epoch": 1.7651296829971181,
"grad_norm": 23.49250602722168,
"learning_rate": 2.0718964734841667e-09,
"logits/chosen": -1.6610476970672607,
"logits/rejected": -1.6484695672988892,
"logps/chosen": -56.7994270324707,
"logps/rejected": -58.2902717590332,
"loss": 0.6643,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.050318509340286255,
"rewards/margins": 0.06775031238794327,
"rewards/rejected": -0.01743180863559246,
"step": 2450
},
{
"epoch": 1.7723342939481268,
"grad_norm": 23.373260498046875,
"learning_rate": 1.948388308885102e-09,
"logits/chosen": -1.7280973196029663,
"logits/rejected": -1.7087351083755493,
"logps/chosen": -50.309112548828125,
"logps/rejected": -52.12499237060547,
"loss": 0.6544,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.05861986428499222,
"rewards/margins": 0.09302407503128052,
"rewards/rejected": -0.034404207020998,
"step": 2460
},
{
"epoch": 1.7795389048991355,
"grad_norm": 37.758689880371094,
"learning_rate": 1.8285260921011846e-09,
"logits/chosen": -1.7070372104644775,
"logits/rejected": -1.6979036331176758,
"logps/chosen": -60.36076736450195,
"logps/rejected": -62.269325256347656,
"loss": 0.6707,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.048447128385305405,
"rewards/margins": 0.05468007177114487,
"rewards/rejected": -0.006232939660549164,
"step": 2470
},
{
"epoch": 1.7867435158501441,
"grad_norm": 23.636079788208008,
"learning_rate": 1.712328781086131e-09,
"logits/chosen": -1.6355743408203125,
"logits/rejected": -1.6160838603973389,
"logps/chosen": -55.45972442626953,
"logps/rejected": -57.82331466674805,
"loss": 0.6592,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05385429412126541,
"rewards/margins": 0.07988177239894867,
"rewards/rejected": -0.026027489453554153,
"step": 2480
},
{
"epoch": 1.7939481268011528,
"grad_norm": 24.043498992919922,
"learning_rate": 1.59981475413547e-09,
"logits/chosen": -1.6116002798080444,
"logits/rejected": -1.594987154006958,
"logps/chosen": -49.88407516479492,
"logps/rejected": -53.558563232421875,
"loss": 0.6472,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.07869906723499298,
"rewards/margins": 0.10210321098566055,
"rewards/rejected": -0.023404140025377274,
"step": 2490
},
{
"epoch": 1.8011527377521612,
"grad_norm": 27.632863998413086,
"learning_rate": 1.491001806979772e-09,
"logits/chosen": -1.5927178859710693,
"logits/rejected": -1.5906251668930054,
"logps/chosen": -51.23817825317383,
"logps/rejected": -57.8389778137207,
"loss": 0.6476,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.05806579068303108,
"rewards/margins": 0.10276976972818375,
"rewards/rejected": -0.04470398277044296,
"step": 2500
},
{
"epoch": 1.8083573487031699,
"grad_norm": 30.61586570739746,
"learning_rate": 1.3859071499699698e-09,
"logits/chosen": -1.6454540491104126,
"logits/rejected": -1.6320326328277588,
"logps/chosen": -56.015342712402344,
"logps/rejected": -58.20482635498047,
"loss": 0.6633,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.04570765793323517,
"rewards/margins": 0.07200601696968079,
"rewards/rejected": -0.026298364624381065,
"step": 2510
},
{
"epoch": 1.8155619596541785,
"grad_norm": 22.36056137084961,
"learning_rate": 1.2845474053553156e-09,
"logits/chosen": -1.6226694583892822,
"logits/rejected": -1.6134302616119385,
"logps/chosen": -55.82828903198242,
"logps/rejected": -58.674468994140625,
"loss": 0.6533,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.04953853040933609,
"rewards/margins": 0.09255535155534744,
"rewards/rejected": -0.043016817420721054,
"step": 2520
},
{
"epoch": 1.8227665706051872,
"grad_norm": 26.279998779296875,
"learning_rate": 1.1869386046543222e-09,
"logits/chosen": -1.6014668941497803,
"logits/rejected": -1.59114670753479,
"logps/chosen": -50.55707931518555,
"logps/rejected": -54.89263153076172,
"loss": 0.6394,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": 0.080271877348423,
"rewards/margins": 0.12159235775470734,
"rewards/rejected": -0.04132048413157463,
"step": 2530
},
{
"epoch": 1.8299711815561959,
"grad_norm": 29.773160934448242,
"learning_rate": 1.0930961861191302e-09,
"logits/chosen": -1.6185270547866821,
"logits/rejected": -1.6171890497207642,
"logps/chosen": -45.08892059326172,
"logps/rejected": -50.504432678222656,
"loss": 0.6624,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.0479862280189991,
"rewards/margins": 0.07539573311805725,
"rewards/rejected": -0.02740950882434845,
"step": 2540
},
{
"epoch": 1.8371757925072045,
"grad_norm": 25.530391693115234,
"learning_rate": 1.003034992293733e-09,
"logits/chosen": -1.6437381505966187,
"logits/rejected": -1.6238796710968018,
"logps/chosen": -46.61026382446289,
"logps/rejected": -51.9169807434082,
"loss": 0.6639,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.06347519159317017,
"rewards/margins": 0.06826033443212509,
"rewards/rejected": -0.004785154014825821,
"step": 2550
},
{
"epoch": 1.8443804034582132,
"grad_norm": 24.680849075317383,
"learning_rate": 9.16769267666434e-10,
"logits/chosen": -1.6180617809295654,
"logits/rejected": -1.5980396270751953,
"logps/chosen": -50.46776580810547,
"logps/rejected": -51.920799255371094,
"loss": 0.6602,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.0773412436246872,
"rewards/margins": 0.07882087677717209,
"rewards/rejected": -0.0014796493342146277,
"step": 2560
},
{
"epoch": 1.8515850144092219,
"grad_norm": 37.89069747924805,
"learning_rate": 8.343126564168412e-10,
"logits/chosen": -1.6480731964111328,
"logits/rejected": -1.6331026554107666,
"logps/chosen": -56.44977951049805,
"logps/rejected": -60.410072326660156,
"loss": 0.6589,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.05443434789776802,
"rewards/margins": 0.07955367118120193,
"rewards/rejected": -0.025119328871369362,
"step": 2570
},
{
"epoch": 1.8587896253602305,
"grad_norm": 27.42110252380371,
"learning_rate": 7.55678200257856e-10,
"logits/chosen": -1.624707818031311,
"logits/rejected": -1.605147361755371,
"logps/chosen": -54.56407928466797,
"logps/rejected": -56.504920959472656,
"loss": 0.6623,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.05488407611846924,
"rewards/margins": 0.07069625705480576,
"rewards/rejected": -0.015812188386917114,
"step": 2580
},
{
"epoch": 1.8659942363112392,
"grad_norm": 31.333648681640625,
"learning_rate": 6.808783363729364e-10,
"logits/chosen": -1.5887932777404785,
"logits/rejected": -1.574791431427002,
"logps/chosen": -48.72455596923828,
"logps/rejected": -53.04241180419922,
"loss": 0.6551,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.05235203355550766,
"rewards/margins": 0.09002417325973511,
"rewards/rejected": -0.03767213225364685,
"step": 2590
},
{
"epoch": 1.8731988472622478,
"grad_norm": 31.04511260986328,
"learning_rate": 6.099248954489794e-10,
"logits/chosen": -1.6404969692230225,
"logits/rejected": -1.6319074630737305,
"logps/chosen": -50.55232620239258,
"logps/rejected": -53.26167678833008,
"loss": 0.6655,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.06544728577136993,
"rewards/margins": 0.06491607427597046,
"rewards/rejected": 0.0005312118446454406,
"step": 2600
},
{
"epoch": 1.8804034582132565,
"grad_norm": 27.33155632019043,
"learning_rate": 5.428290998051116e-10,
"logits/chosen": -1.6392921209335327,
"logits/rejected": -1.6253206729888916,
"logps/chosen": -49.612022399902344,
"logps/rejected": -52.06800079345703,
"loss": 0.6629,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0555211678147316,
"rewards/margins": 0.07389000803232193,
"rewards/rejected": -0.018368840217590332,
"step": 2610
},
{
"epoch": 1.8876080691642652,
"grad_norm": 24.837400436401367,
"learning_rate": 4.796015616177401e-10,
"logits/chosen": -1.597421407699585,
"logits/rejected": -1.5863425731658936,
"logps/chosen": -50.992122650146484,
"logps/rejected": -53.921241760253906,
"loss": 0.6523,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.06237851455807686,
"rewards/margins": 0.09439820796251297,
"rewards/rejected": -0.03201969712972641,
"step": 2620
},
{
"epoch": 1.8948126801152738,
"grad_norm": 22.60968017578125,
"learning_rate": 4.2025228124205335e-10,
"logits/chosen": -1.6710373163223267,
"logits/rejected": -1.6665174961090088,
"logps/chosen": -61.06689453125,
"logps/rejected": -62.720733642578125,
"loss": 0.6719,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.042091116309165955,
"rewards/margins": 0.04989165440201759,
"rewards/rejected": -0.007800539024174213,
"step": 2630
},
{
"epoch": 1.9020172910662825,
"grad_norm": 25.86883544921875,
"learning_rate": 3.64790645630339e-10,
"logits/chosen": -1.6532398462295532,
"logits/rejected": -1.6340618133544922,
"logps/chosen": -55.06281661987305,
"logps/rejected": -56.2059440612793,
"loss": 0.6654,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.0360020287334919,
"rewards/margins": 0.06869436055421829,
"rewards/rejected": -0.0326923243701458,
"step": 2640
},
{
"epoch": 1.9092219020172911,
"grad_norm": 36.909793853759766,
"learning_rate": 3.1322542684729945e-10,
"logits/chosen": -1.6118682622909546,
"logits/rejected": -1.5950068235397339,
"logps/chosen": -64.02647399902344,
"logps/rejected": -68.84220123291016,
"loss": 0.6553,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.046797286719083786,
"rewards/margins": 0.08603934943675995,
"rewards/rejected": -0.03924206644296646,
"step": 2650
},
{
"epoch": 1.9164265129682998,
"grad_norm": 22.410232543945312,
"learning_rate": 2.6556478068261447e-10,
"logits/chosen": -1.6788618564605713,
"logits/rejected": -1.6813141107559204,
"logps/chosen": -54.3393669128418,
"logps/rejected": -57.85358810424805,
"loss": 0.6593,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0549929141998291,
"rewards/margins": 0.07665206491947174,
"rewards/rejected": -0.02165914885699749,
"step": 2660
},
{
"epoch": 1.9236311239193085,
"grad_norm": 28.00406265258789,
"learning_rate": 2.2181624536098952e-10,
"logits/chosen": -1.6921355724334717,
"logits/rejected": -1.6832196712493896,
"logps/chosen": -51.8244514465332,
"logps/rejected": -56.756736755371094,
"loss": 0.6567,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.05170941352844238,
"rewards/margins": 0.08497369289398193,
"rewards/rejected": -0.03326428681612015,
"step": 2670
},
{
"epoch": 1.9308357348703171,
"grad_norm": 24.003829956054688,
"learning_rate": 1.819867403498737e-10,
"logits/chosen": -1.6733729839324951,
"logits/rejected": -1.6761459112167358,
"logps/chosen": -55.07045364379883,
"logps/rejected": -60.26169967651367,
"loss": 0.6731,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.05987237021327019,
"rewards/margins": 0.04959547519683838,
"rewards/rejected": 0.01027689315378666,
"step": 2680
},
{
"epoch": 1.9380403458213258,
"grad_norm": 30.30132293701172,
"learning_rate": 1.4608256526505157e-10,
"logits/chosen": -1.5777008533477783,
"logits/rejected": -1.569038987159729,
"logps/chosen": -60.8995475769043,
"logps/rejected": -62.53614044189453,
"loss": 0.671,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.04738503322005272,
"rewards/margins": 0.05407433584332466,
"rewards/rejected": -0.006689299829304218,
"step": 2690
},
{
"epoch": 1.9452449567723344,
"grad_norm": 34.3319206237793,
"learning_rate": 1.1410939887425141e-10,
"logits/chosen": -1.708012342453003,
"logits/rejected": -1.6876204013824463,
"logps/chosen": -53.042701721191406,
"logps/rejected": -55.08272171020508,
"loss": 0.6628,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.038316644728183746,
"rewards/margins": 0.07014557719230652,
"rewards/rejected": -0.031828928738832474,
"step": 2700
},
{
"epoch": 1.952449567723343,
"grad_norm": 27.495458602905273,
"learning_rate": 8.607229819898865e-11,
"logits/chosen": -1.6768684387207031,
"logits/rejected": -1.6618626117706299,
"logps/chosen": -54.03479766845703,
"logps/rejected": -56.34186553955078,
"loss": 0.6652,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.04240185394883156,
"rewards/margins": 0.06518776714801788,
"rewards/rejected": -0.022785909473896027,
"step": 2710
},
{
"epoch": 1.9596541786743515,
"grad_norm": 31.330108642578125,
"learning_rate": 6.19756977147029e-11,
"logits/chosen": -1.660762071609497,
"logits/rejected": -1.6496670246124268,
"logps/chosen": -48.30510711669922,
"logps/rejected": -53.958099365234375,
"loss": 0.66,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.058389853686094284,
"rewards/margins": 0.08087347447872162,
"rewards/rejected": -0.02248362824320793,
"step": 2720
},
{
"epoch": 1.9668587896253602,
"grad_norm": 30.01300811767578,
"learning_rate": 4.1823408649391265e-11,
"logits/chosen": -1.5987229347229004,
"logits/rejected": -1.592441439628601,
"logps/chosen": -52.67683792114258,
"logps/rejected": -55.06285858154297,
"loss": 0.6684,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.045865319669246674,
"rewards/margins": 0.05944997817277908,
"rewards/rejected": -0.013584655709564686,
"step": 2730
},
{
"epoch": 1.9740634005763689,
"grad_norm": 27.41546058654785,
"learning_rate": 2.5618618380812694e-11,
"logits/chosen": -1.725508451461792,
"logits/rejected": -1.7198139429092407,
"logps/chosen": -48.2393913269043,
"logps/rejected": -53.41130447387695,
"loss": 0.6594,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0503121018409729,
"rewards/margins": 0.07980336248874664,
"rewards/rejected": -0.029491260647773743,
"step": 2740
},
{
"epoch": 1.9812680115273775,
"grad_norm": 24.554004669189453,
"learning_rate": 1.3363889932338501e-11,
"logits/chosen": -1.6208444833755493,
"logits/rejected": -1.6235567331314087,
"logps/chosen": -53.99614334106445,
"logps/rejected": -59.610862731933594,
"loss": 0.666,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05232435464859009,
"rewards/margins": 0.061102528125047684,
"rewards/rejected": -0.008778175339102745,
"step": 2750
},
{
"epoch": 1.9884726224783862,
"grad_norm": 25.978759765625,
"learning_rate": 5.061161567596061e-12,
"logits/chosen": -1.7159192562103271,
"logits/rejected": -1.7112420797348022,
"logps/chosen": -54.39562225341797,
"logps/rejected": -58.398590087890625,
"loss": 0.6618,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05306984856724739,
"rewards/margins": 0.07363148033618927,
"rewards/rejected": -0.02056163363158703,
"step": 2760
},
{
"epoch": 1.9956772334293948,
"grad_norm": 23.055919647216797,
"learning_rate": 7.11746483889053e-13,
"logits/chosen": -1.6436536312103271,
"logits/rejected": -1.631366491317749,
"logps/chosen": -55.227935791015625,
"logps/rejected": -57.9941520690918,
"loss": 0.663,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.051635853946208954,
"rewards/margins": 0.07143048942089081,
"rewards/rejected": -0.019794631749391556,
"step": 2770
},
{
"epoch": 2.0,
"step": 2776,
"total_flos": 0.0,
"train_loss": 0.67492206965812,
"train_runtime": 4341.9182,
"train_samples_per_second": 10.227,
"train_steps_per_second": 0.639
}
],
"logging_steps": 10,
"max_steps": 2776,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}