sfulay's picture
Model save
f3b3d0c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988571428571429,
"eval_steps": 50,
"global_step": 437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022857142857142857,
"grad_norm": 5.965044878833196,
"learning_rate": 1.1363636363636363e-07,
"logits/chosen": -2.7006218433380127,
"logits/rejected": -2.6247599124908447,
"logps/chosen": -301.24932861328125,
"logps/rejected": -281.7940979003906,
"loss": 0.6931,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.0003684944240376353,
"rewards/margins": 0.0008126062457449734,
"rewards/rejected": -0.000444111879914999,
"step": 10
},
{
"epoch": 0.045714285714285714,
"grad_norm": 4.694626134382372,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -2.6410038471221924,
"logits/rejected": -2.60575008392334,
"logps/chosen": -278.92498779296875,
"logps/rejected": -254.63601684570312,
"loss": 0.6925,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.002462259028106928,
"rewards/margins": 0.0011314961593598127,
"rewards/rejected": 0.0013307628687471151,
"step": 20
},
{
"epoch": 0.06857142857142857,
"grad_norm": 5.220071225612144,
"learning_rate": 3.4090909090909085e-07,
"logits/chosen": -2.638200044631958,
"logits/rejected": -2.617208242416382,
"logps/chosen": -263.2459411621094,
"logps/rejected": -263.34710693359375,
"loss": 0.689,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.014371426776051521,
"rewards/margins": 0.007912042550742626,
"rewards/rejected": 0.006459384225308895,
"step": 30
},
{
"epoch": 0.09142857142857143,
"grad_norm": 5.914085075708232,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -2.64882493019104,
"logits/rejected": -2.585529327392578,
"logps/chosen": -290.2810974121094,
"logps/rejected": -268.34210205078125,
"loss": 0.6806,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.03716137260198593,
"rewards/margins": 0.0442696288228035,
"rewards/rejected": -0.007108256220817566,
"step": 40
},
{
"epoch": 0.11428571428571428,
"grad_norm": 8.967256960057256,
"learning_rate": 4.997124959943201e-07,
"logits/chosen": -2.6775121688842773,
"logits/rejected": -2.5971298217773438,
"logps/chosen": -293.7924499511719,
"logps/rejected": -254.38064575195312,
"loss": 0.6696,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.02517825737595558,
"rewards/margins": 0.1003413200378418,
"rewards/rejected": -0.07516306638717651,
"step": 50
},
{
"epoch": 0.11428571428571428,
"eval_logits/chosen": -2.5406415462493896,
"eval_logits/rejected": -2.4382479190826416,
"eval_logps/chosen": -276.4425964355469,
"eval_logps/rejected": -235.50723266601562,
"eval_loss": 0.658383309841156,
"eval_rewards/accuracies": 0.6853448152542114,
"eval_rewards/chosen": -0.008386622183024883,
"eval_rewards/margins": 0.1559244692325592,
"eval_rewards/rejected": -0.16431109607219696,
"eval_runtime": 91.7124,
"eval_samples_per_second": 19.965,
"eval_steps_per_second": 0.316,
"step": 50
},
{
"epoch": 0.13714285714285715,
"grad_norm": 7.378725048645318,
"learning_rate": 4.979579212164186e-07,
"logits/chosen": -2.578993320465088,
"logits/rejected": -2.4725637435913086,
"logps/chosen": -293.21600341796875,
"logps/rejected": -274.92535400390625,
"loss": 0.6509,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.1271006315946579,
"rewards/margins": 0.13663128018379211,
"rewards/rejected": -0.2637318968772888,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 7.529436455012959,
"learning_rate": 4.946196886175515e-07,
"logits/chosen": -2.5928056240081787,
"logits/rejected": -2.543529748916626,
"logps/chosen": -294.546630859375,
"logps/rejected": -301.3702697753906,
"loss": 0.6315,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.1876838505268097,
"rewards/margins": 0.2297508269548416,
"rewards/rejected": -0.4174346923828125,
"step": 70
},
{
"epoch": 0.18285714285714286,
"grad_norm": 12.054303957362464,
"learning_rate": 4.897191188239667e-07,
"logits/chosen": -2.6392509937286377,
"logits/rejected": -2.590977668762207,
"logps/chosen": -285.3960266113281,
"logps/rejected": -307.17535400390625,
"loss": 0.62,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.18367011845111847,
"rewards/margins": 0.33499467372894287,
"rewards/rejected": -0.5186647176742554,
"step": 80
},
{
"epoch": 0.2057142857142857,
"grad_norm": 13.273475435863975,
"learning_rate": 4.832875107981763e-07,
"logits/chosen": -2.7371668815612793,
"logits/rejected": -2.6849629878997803,
"logps/chosen": -296.71575927734375,
"logps/rejected": -316.90338134765625,
"loss": 0.6249,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.20430462062358856,
"rewards/margins": 0.40924978256225586,
"rewards/rejected": -0.6135543584823608,
"step": 90
},
{
"epoch": 0.22857142857142856,
"grad_norm": 15.686669316278751,
"learning_rate": 4.753659419387223e-07,
"logits/chosen": -2.769486665725708,
"logits/rejected": -2.6865835189819336,
"logps/chosen": -318.80413818359375,
"logps/rejected": -312.09326171875,
"loss": 0.6122,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.324177622795105,
"rewards/margins": 0.4622408449649811,
"rewards/rejected": -0.7864184975624084,
"step": 100
},
{
"epoch": 0.22857142857142856,
"eval_logits/chosen": -2.651167869567871,
"eval_logits/rejected": -2.5533361434936523,
"eval_logps/chosen": -316.30194091796875,
"eval_logps/rejected": -308.60577392578125,
"eval_loss": 0.6111233234405518,
"eval_rewards/accuracies": 0.6767241358757019,
"eval_rewards/chosen": -0.40698006749153137,
"eval_rewards/margins": 0.4883164167404175,
"eval_rewards/rejected": -0.8952965140342712,
"eval_runtime": 90.9103,
"eval_samples_per_second": 20.141,
"eval_steps_per_second": 0.319,
"step": 100
},
{
"epoch": 0.25142857142857145,
"grad_norm": 12.723184548250023,
"learning_rate": 4.660050057270191e-07,
"logits/chosen": -2.619276523590088,
"logits/rejected": -2.556680202484131,
"logps/chosen": -375.2064208984375,
"logps/rejected": -391.784423828125,
"loss": 0.6021,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.589028000831604,
"rewards/margins": 0.3497200608253479,
"rewards/rejected": -0.9387480020523071,
"step": 110
},
{
"epoch": 0.2742857142857143,
"grad_norm": 16.182958724416615,
"learning_rate": 4.5526448859687144e-07,
"logits/chosen": -1.8494535684585571,
"logits/rejected": -1.6301162242889404,
"logps/chosen": -390.48797607421875,
"logps/rejected": -364.620361328125,
"loss": 0.5814,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7958351969718933,
"rewards/margins": 0.5332263708114624,
"rewards/rejected": -1.329061508178711,
"step": 120
},
{
"epoch": 0.29714285714285715,
"grad_norm": 17.332692843610236,
"learning_rate": 4.432129880904388e-07,
"logits/chosen": -0.4575839638710022,
"logits/rejected": -0.06781496107578278,
"logps/chosen": -410.9315490722656,
"logps/rejected": -413.829833984375,
"loss": 0.5548,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.0577561855316162,
"rewards/margins": 0.5758394598960876,
"rewards/rejected": -1.6335957050323486,
"step": 130
},
{
"epoch": 0.32,
"grad_norm": 20.594750248375647,
"learning_rate": 4.299274747394055e-07,
"logits/chosen": 0.2059406340122223,
"logits/rejected": 0.5167960524559021,
"logps/chosen": -435.4883728027344,
"logps/rejected": -472.76092529296875,
"loss": 0.5654,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.200407862663269,
"rewards/margins": 0.8080868721008301,
"rewards/rejected": -2.0084948539733887,
"step": 140
},
{
"epoch": 0.34285714285714286,
"grad_norm": 16.29523919912318,
"learning_rate": 4.1549280046953653e-07,
"logits/chosen": -0.2454165518283844,
"logits/rejected": 0.22050300240516663,
"logps/chosen": -396.6532287597656,
"logps/rejected": -463.4326171875,
"loss": 0.5476,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0966728925704956,
"rewards/margins": 0.7746630311012268,
"rewards/rejected": -1.871335744857788,
"step": 150
},
{
"epoch": 0.34285714285714286,
"eval_logits/chosen": 0.14409177005290985,
"eval_logits/rejected": 0.9770079255104065,
"eval_logps/chosen": -409.03546142578125,
"eval_logps/rejected": -453.3369140625,
"eval_loss": 0.5582876801490784,
"eval_rewards/accuracies": 0.7370689511299133,
"eval_rewards/chosen": -1.3343148231506348,
"eval_rewards/margins": 1.0082927942276,
"eval_rewards/rejected": -2.3426077365875244,
"eval_runtime": 91.388,
"eval_samples_per_second": 20.035,
"eval_steps_per_second": 0.317,
"step": 150
},
{
"epoch": 0.3657142857142857,
"grad_norm": 31.42845724506196,
"learning_rate": 4.000011566683401e-07,
"logits/chosen": -0.0020641356240957975,
"logits/rejected": 0.659235954284668,
"logps/chosen": -442.47259521484375,
"logps/rejected": -490.87762451171875,
"loss": 0.5549,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4313229322433472,
"rewards/margins": 0.9210258722305298,
"rewards/rejected": -2.352349042892456,
"step": 160
},
{
"epoch": 0.38857142857142857,
"grad_norm": 21.881739335743443,
"learning_rate": 3.8355148537705047e-07,
"logits/chosen": -0.8011367917060852,
"logits/rejected": -0.18294472992420197,
"logps/chosen": -420.85791015625,
"logps/rejected": -446.387451171875,
"loss": 0.5563,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1675300598144531,
"rewards/margins": 0.6390342712402344,
"rewards/rejected": -1.8065645694732666,
"step": 170
},
{
"epoch": 0.4114285714285714,
"grad_norm": 24.301433957337014,
"learning_rate": 3.662488473675315e-07,
"logits/chosen": -0.6645376086235046,
"logits/rejected": 0.36614301800727844,
"logps/chosen": -447.889892578125,
"logps/rejected": -494.78070068359375,
"loss": 0.5498,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1771008968353271,
"rewards/margins": 1.1712000370025635,
"rewards/rejected": -2.3483011722564697,
"step": 180
},
{
"epoch": 0.4342857142857143,
"grad_norm": 18.603399872507342,
"learning_rate": 3.48203751140067e-07,
"logits/chosen": -0.08548859506845474,
"logits/rejected": 0.7475250959396362,
"logps/chosen": -421.85540771484375,
"logps/rejected": -453.6908264160156,
"loss": 0.5499,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4559787511825562,
"rewards/margins": 0.7359476089477539,
"rewards/rejected": -2.1919264793395996,
"step": 190
},
{
"epoch": 0.45714285714285713,
"grad_norm": 21.90453363461546,
"learning_rate": 3.2953144712759537e-07,
"logits/chosen": -0.9407933354377747,
"logits/rejected": -0.02539023384451866,
"logps/chosen": -380.4794616699219,
"logps/rejected": -437.4371643066406,
"loss": 0.5582,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1213675737380981,
"rewards/margins": 0.9641984701156616,
"rewards/rejected": -2.0855660438537598,
"step": 200
},
{
"epoch": 0.45714285714285713,
"eval_logits/chosen": -0.4975701570510864,
"eval_logits/rejected": 0.5624167919158936,
"eval_logps/chosen": -379.0511169433594,
"eval_logps/rejected": -433.3172912597656,
"eval_loss": 0.5498641729354858,
"eval_rewards/accuracies": 0.732758641242981,
"eval_rewards/chosen": -1.034471869468689,
"eval_rewards/margins": 1.107939600944519,
"eval_rewards/rejected": -2.142411708831787,
"eval_runtime": 90.2066,
"eval_samples_per_second": 20.298,
"eval_steps_per_second": 0.321,
"step": 200
},
{
"epoch": 0.48,
"grad_norm": 16.10426120639833,
"learning_rate": 3.103511916141658e-07,
"logits/chosen": 0.09185227006673813,
"logits/rejected": 0.8966398239135742,
"logps/chosen": -387.89202880859375,
"logps/rejected": -462.49932861328125,
"loss": 0.5404,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2443852424621582,
"rewards/margins": 0.9278079271316528,
"rewards/rejected": -2.1721930503845215,
"step": 210
},
{
"epoch": 0.5028571428571429,
"grad_norm": 18.780630904417688,
"learning_rate": 2.9078548506882117e-07,
"logits/chosen": 0.5002994537353516,
"logits/rejected": 1.4443576335906982,
"logps/chosen": -440.80279541015625,
"logps/rejected": -487.53485107421875,
"loss": 0.5609,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.5573679208755493,
"rewards/margins": 0.8561462163925171,
"rewards/rejected": -2.4135143756866455,
"step": 220
},
{
"epoch": 0.5257142857142857,
"grad_norm": 20.610433717594198,
"learning_rate": 2.709592897595191e-07,
"logits/chosen": 0.22773201763629913,
"logits/rejected": 1.2361242771148682,
"logps/chosen": -401.34228515625,
"logps/rejected": -446.8021545410156,
"loss": 0.5442,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.202831506729126,
"rewards/margins": 0.8723229169845581,
"rewards/rejected": -2.0751543045043945,
"step": 230
},
{
"epoch": 0.5485714285714286,
"grad_norm": 27.325375522779876,
"learning_rate": 2.509992316440332e-07,
"logits/chosen": 0.26873356103897095,
"logits/rejected": 1.303821325302124,
"logps/chosen": -431.5526428222656,
"logps/rejected": -526.184814453125,
"loss": 0.536,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.328427791595459,
"rewards/margins": 1.219201922416687,
"rewards/rejected": -2.5476298332214355,
"step": 240
},
{
"epoch": 0.5714285714285714,
"grad_norm": 18.92218062691862,
"learning_rate": 2.3103279163519918e-07,
"logits/chosen": -0.07236287742853165,
"logits/rejected": 0.5380650758743286,
"logps/chosen": -407.7901306152344,
"logps/rejected": -495.40777587890625,
"loss": 0.5503,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2716388702392578,
"rewards/margins": 0.980434238910675,
"rewards/rejected": -2.252073287963867,
"step": 250
},
{
"epoch": 0.5714285714285714,
"eval_logits/chosen": -0.37247952818870544,
"eval_logits/rejected": 0.7719168066978455,
"eval_logps/chosen": -392.6152038574219,
"eval_logps/rejected": -450.1522216796875,
"eval_loss": 0.5393335819244385,
"eval_rewards/accuracies": 0.7370689511299133,
"eval_rewards/chosen": -1.1701123714447021,
"eval_rewards/margins": 1.1406482458114624,
"eval_rewards/rejected": -2.310760498046875,
"eval_runtime": 90.9292,
"eval_samples_per_second": 20.137,
"eval_steps_per_second": 0.319,
"step": 250
},
{
"epoch": 0.5942857142857143,
"grad_norm": 25.541848941752068,
"learning_rate": 2.1118749140573358e-07,
"logits/chosen": 0.0009159505134448409,
"logits/rejected": 0.6376093626022339,
"logps/chosen": -426.14141845703125,
"logps/rejected": -502.9112243652344,
"loss": 0.5485,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.463547706604004,
"rewards/margins": 0.8321346044540405,
"rewards/rejected": -2.295682430267334,
"step": 260
},
{
"epoch": 0.6171428571428571,
"grad_norm": 23.51335121897504,
"learning_rate": 1.9159007893272703e-07,
"logits/chosen": 0.321635901927948,
"logits/rejected": 1.6592861413955688,
"logps/chosen": -413.24859619140625,
"logps/rejected": -473.6759338378906,
"loss": 0.5267,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4072272777557373,
"rewards/margins": 1.0244569778442383,
"rewards/rejected": -2.4316840171813965,
"step": 270
},
{
"epoch": 0.64,
"grad_norm": 27.142819787480168,
"learning_rate": 1.7236571898357766e-07,
"logits/chosen": 1.0628600120544434,
"logits/rejected": 2.0229506492614746,
"logps/chosen": -440.122314453125,
"logps/rejected": -543.1414794921875,
"loss": 0.5316,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6708418130874634,
"rewards/margins": 1.129504919052124,
"rewards/rejected": -2.8003463745117188,
"step": 280
},
{
"epoch": 0.6628571428571428,
"grad_norm": 24.13150363681131,
"learning_rate": 1.5363719371356882e-07,
"logits/chosen": 0.698092520236969,
"logits/rejected": 1.5312575101852417,
"logps/chosen": -450.4425354003906,
"logps/rejected": -515.0484008789062,
"loss": 0.5339,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4953665733337402,
"rewards/margins": 0.999632716178894,
"rewards/rejected": -2.4949991703033447,
"step": 290
},
{
"epoch": 0.6857142857142857,
"grad_norm": 17.486388226084866,
"learning_rate": 1.3552411848071565e-07,
"logits/chosen": 0.3839910626411438,
"logits/rejected": 1.8341293334960938,
"logps/chosen": -441.32183837890625,
"logps/rejected": -507.97894287109375,
"loss": 0.5224,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3941259384155273,
"rewards/margins": 1.1592432260513306,
"rewards/rejected": -2.5533692836761475,
"step": 300
},
{
"epoch": 0.6857142857142857,
"eval_logits/chosen": 0.18918734788894653,
"eval_logits/rejected": 1.70877206325531,
"eval_logps/chosen": -397.884033203125,
"eval_logps/rejected": -470.09490966796875,
"eval_loss": 0.5312153100967407,
"eval_rewards/accuracies": 0.7543103694915771,
"eval_rewards/chosen": -1.2228009700775146,
"eval_rewards/margins": 1.2873866558074951,
"eval_rewards/rejected": -2.510187864303589,
"eval_runtime": 92.3596,
"eval_samples_per_second": 19.825,
"eval_steps_per_second": 0.314,
"step": 300
},
{
"epoch": 0.7085714285714285,
"grad_norm": 20.49651474517604,
"learning_rate": 1.1814217788631473e-07,
"logits/chosen": 0.41669049859046936,
"logits/rejected": 1.394052505493164,
"logps/chosen": -400.6260986328125,
"logps/rejected": -474.28094482421875,
"loss": 0.5361,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4432194232940674,
"rewards/margins": 0.9276365041732788,
"rewards/rejected": -2.3708558082580566,
"step": 310
},
{
"epoch": 0.7314285714285714,
"grad_norm": 18.75776450561332,
"learning_rate": 1.0160238692045331e-07,
"logits/chosen": 0.7597023844718933,
"logits/rejected": 1.6351118087768555,
"logps/chosen": -413.95318603515625,
"logps/rejected": -488.90460205078125,
"loss": 0.542,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6967086791992188,
"rewards/margins": 0.782455563545227,
"rewards/rejected": -2.479163885116577,
"step": 320
},
{
"epoch": 0.7542857142857143,
"grad_norm": 18.561363930407463,
"learning_rate": 8.601038193139438e-08,
"logits/chosen": 0.14268045127391815,
"logits/rejected": 1.3421038389205933,
"logps/chosen": -447.97137451171875,
"logps/rejected": -503.50433349609375,
"loss": 0.5363,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4875319004058838,
"rewards/margins": 1.0696327686309814,
"rewards/rejected": -2.5571646690368652,
"step": 330
},
{
"epoch": 0.7771428571428571,
"grad_norm": 17.499558797451687,
"learning_rate": 7.146574594727572e-08,
"logits/chosen": 0.3810690939426422,
"logits/rejected": 1.2245051860809326,
"logps/chosen": -414.9021911621094,
"logps/rejected": -506.65045166015625,
"loss": 0.5285,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.489512324333191,
"rewards/margins": 1.1567548513412476,
"rewards/rejected": -2.6462674140930176,
"step": 340
},
{
"epoch": 0.8,
"grad_norm": 17.943689215599328,
"learning_rate": 5.8061372659157306e-08,
"logits/chosen": 0.24244177341461182,
"logits/rejected": 1.3491809368133545,
"logps/chosen": -441.5047912597656,
"logps/rejected": -494.35626220703125,
"loss": 0.5396,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.517975091934204,
"rewards/margins": 0.8826116323471069,
"rewards/rejected": -2.4005866050720215,
"step": 350
},
{
"epoch": 0.8,
"eval_logits/chosen": 0.4364562928676605,
"eval_logits/rejected": 1.9215292930603027,
"eval_logps/chosen": -420.2202453613281,
"eval_logps/rejected": -493.9275207519531,
"eval_loss": 0.5290318131446838,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -1.4461628198623657,
"eval_rewards/margins": 1.3023512363433838,
"eval_rewards/rejected": -2.748514175415039,
"eval_runtime": 91.6979,
"eval_samples_per_second": 19.968,
"eval_steps_per_second": 0.316,
"step": 350
},
{
"epoch": 0.8228571428571428,
"grad_norm": 16.780749890709036,
"learning_rate": 4.5882873127531614e-08,
"logits/chosen": 0.174576535820961,
"logits/rejected": 1.4981176853179932,
"logps/chosen": -435.602783203125,
"logps/rejected": -510.8885192871094,
"loss": 0.5205,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.501977801322937,
"rewards/margins": 1.0960423946380615,
"rewards/rejected": -2.598020076751709,
"step": 360
},
{
"epoch": 0.8457142857142858,
"grad_norm": 19.04569651937684,
"learning_rate": 3.500802900154412e-08,
"logits/chosen": 0.34421294927597046,
"logits/rejected": 1.787302017211914,
"logps/chosen": -412.97747802734375,
"logps/rejected": -499.79034423828125,
"loss": 0.528,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.4191606044769287,
"rewards/margins": 1.1945868730545044,
"rewards/rejected": -2.6137473583221436,
"step": 370
},
{
"epoch": 0.8685714285714285,
"grad_norm": 21.067585045477745,
"learning_rate": 2.550629574310309e-08,
"logits/chosen": 0.211051344871521,
"logits/rejected": 1.5275977849960327,
"logps/chosen": -486.8960876464844,
"logps/rejected": -515.337646484375,
"loss": 0.5294,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6883971691131592,
"rewards/margins": 0.8909848928451538,
"rewards/rejected": -2.5793819427490234,
"step": 380
},
{
"epoch": 0.8914285714285715,
"grad_norm": 21.227279903684668,
"learning_rate": 1.7438359028687983e-08,
"logits/chosen": 0.37176352739334106,
"logits/rejected": 1.208251714706421,
"logps/chosen": -453.6361389160156,
"logps/rejected": -538.0291748046875,
"loss": 0.5333,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4226316213607788,
"rewards/margins": 1.0099334716796875,
"rewards/rejected": -2.432565212249756,
"step": 390
},
{
"epoch": 0.9142857142857143,
"grad_norm": 35.72120712786558,
"learning_rate": 1.0855747162029361e-08,
"logits/chosen": 0.5662034749984741,
"logits/rejected": 1.0855852365493774,
"logps/chosen": -437.5174865722656,
"logps/rejected": -510.6676330566406,
"loss": 0.55,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.587842345237732,
"rewards/margins": 0.8530977368354797,
"rewards/rejected": -2.4409401416778564,
"step": 400
},
{
"epoch": 0.9142857142857143,
"eval_logits/chosen": 0.49114343523979187,
"eval_logits/rejected": 1.9856219291687012,
"eval_logps/chosen": -417.03155517578125,
"eval_logps/rejected": -493.2509765625,
"eval_loss": 0.5286471843719482,
"eval_rewards/accuracies": 0.7629310488700867,
"eval_rewards/chosen": -1.414276123046875,
"eval_rewards/margins": 1.3274718523025513,
"eval_rewards/rejected": -2.741748094558716,
"eval_runtime": 91.527,
"eval_samples_per_second": 20.005,
"eval_steps_per_second": 0.317,
"step": 400
},
{
"epoch": 0.9371428571428572,
"grad_norm": 21.313276080994388,
"learning_rate": 5.8005019731033615e-09,
"logits/chosen": 0.33736371994018555,
"logits/rejected": 1.3800859451293945,
"logps/chosen": -453.69744873046875,
"logps/rejected": -516.6829833984375,
"loss": 0.5264,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.6551754474639893,
"rewards/margins": 0.9143539667129517,
"rewards/rejected": -2.5695290565490723,
"step": 410
},
{
"epoch": 0.96,
"grad_norm": 19.39515946553055,
"learning_rate": 2.3049103053431886e-09,
"logits/chosen": 0.2167482078075409,
"logits/rejected": 1.6823341846466064,
"logps/chosen": -409.4588928222656,
"logps/rejected": -498.9947814941406,
"loss": 0.5293,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.267107605934143,
"rewards/margins": 1.3829718828201294,
"rewards/rejected": -2.6500792503356934,
"step": 420
},
{
"epoch": 0.9828571428571429,
"grad_norm": 20.626302042234812,
"learning_rate": 3.9129780600541397e-10,
"logits/chosen": 0.5624532699584961,
"logits/rejected": 1.5469181537628174,
"logps/chosen": -430.54388427734375,
"logps/rejected": -515.5368041992188,
"loss": 0.5296,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.483120083808899,
"rewards/margins": 1.0355522632598877,
"rewards/rejected": -2.518672466278076,
"step": 430
},
{
"epoch": 0.9988571428571429,
"step": 437,
"total_flos": 0.0,
"train_loss": 0.5693180419214803,
"train_runtime": 11386.9149,
"train_samples_per_second": 4.918,
"train_steps_per_second": 0.038
}
],
"logging_steps": 10,
"max_steps": 437,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}