{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988571428571429, "eval_steps": 50, "global_step": 437, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022857142857142857, "grad_norm": 5.965044878833196, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.7006218433380127, "logits/rejected": -2.6247599124908447, "logps/chosen": -301.24932861328125, "logps/rejected": -281.7940979003906, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0003684944240376353, "rewards/margins": 0.0008126062457449734, "rewards/rejected": -0.000444111879914999, "step": 10 }, { "epoch": 0.045714285714285714, "grad_norm": 4.694626134382372, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6410038471221924, "logits/rejected": -2.60575008392334, "logps/chosen": -278.92498779296875, "logps/rejected": -254.63601684570312, "loss": 0.6925, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.002462259028106928, "rewards/margins": 0.0011314961593598127, "rewards/rejected": 0.0013307628687471151, "step": 20 }, { "epoch": 0.06857142857142857, "grad_norm": 5.220071225612144, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.638200044631958, "logits/rejected": -2.617208242416382, "logps/chosen": -263.2459411621094, "logps/rejected": -263.34710693359375, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.014371426776051521, "rewards/margins": 0.007912042550742626, "rewards/rejected": 0.006459384225308895, "step": 30 }, { "epoch": 0.09142857142857143, "grad_norm": 5.914085075708232, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.64882493019104, "logits/rejected": -2.585529327392578, "logps/chosen": -290.2810974121094, "logps/rejected": -268.34210205078125, "loss": 0.6806, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03716137260198593, "rewards/margins": 0.0442696288228035, "rewards/rejected": -0.007108256220817566, "step": 40 }, { "epoch": 0.11428571428571428, "grad_norm": 8.967256960057256, "learning_rate": 4.997124959943201e-07, "logits/chosen": -2.6775121688842773, "logits/rejected": -2.5971298217773438, "logps/chosen": -293.7924499511719, "logps/rejected": -254.38064575195312, "loss": 0.6696, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.02517825737595558, "rewards/margins": 0.1003413200378418, "rewards/rejected": -0.07516306638717651, "step": 50 }, { "epoch": 0.11428571428571428, "eval_logits/chosen": -2.5406415462493896, "eval_logits/rejected": -2.4382479190826416, "eval_logps/chosen": -276.4425964355469, "eval_logps/rejected": -235.50723266601562, "eval_loss": 0.658383309841156, "eval_rewards/accuracies": 0.6853448152542114, "eval_rewards/chosen": -0.008386622183024883, "eval_rewards/margins": 0.1559244692325592, "eval_rewards/rejected": -0.16431109607219696, "eval_runtime": 91.7124, "eval_samples_per_second": 19.965, "eval_steps_per_second": 0.316, "step": 50 }, { "epoch": 0.13714285714285715, "grad_norm": 7.378725048645318, "learning_rate": 4.979579212164186e-07, "logits/chosen": -2.578993320465088, "logits/rejected": -2.4725637435913086, "logps/chosen": -293.21600341796875, "logps/rejected": -274.92535400390625, "loss": 0.6509, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.1271006315946579, "rewards/margins": 0.13663128018379211, "rewards/rejected": -0.2637318968772888, "step": 60 }, { "epoch": 0.16, "grad_norm": 7.529436455012959, "learning_rate": 4.946196886175515e-07, "logits/chosen": -2.5928056240081787, "logits/rejected": -2.543529748916626, "logps/chosen": -294.546630859375, "logps/rejected": -301.3702697753906, "loss": 0.6315, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1876838505268097, "rewards/margins": 0.2297508269548416, "rewards/rejected": -0.4174346923828125, "step": 70 }, { "epoch": 0.18285714285714286, "grad_norm": 12.054303957362464, "learning_rate": 4.897191188239667e-07, "logits/chosen": -2.6392509937286377, "logits/rejected": -2.590977668762207, "logps/chosen": -285.3960266113281, "logps/rejected": -307.17535400390625, "loss": 0.62, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.18367011845111847, "rewards/margins": 0.33499467372894287, "rewards/rejected": -0.5186647176742554, "step": 80 }, { "epoch": 0.2057142857142857, "grad_norm": 13.273475435863975, "learning_rate": 4.832875107981763e-07, "logits/chosen": -2.7371668815612793, "logits/rejected": -2.6849629878997803, "logps/chosen": -296.71575927734375, "logps/rejected": -316.90338134765625, "loss": 0.6249, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.20430462062358856, "rewards/margins": 0.40924978256225586, "rewards/rejected": -0.6135543584823608, "step": 90 }, { "epoch": 0.22857142857142856, "grad_norm": 15.686669316278751, "learning_rate": 4.753659419387223e-07, "logits/chosen": -2.769486665725708, "logits/rejected": -2.6865835189819336, "logps/chosen": -318.80413818359375, "logps/rejected": -312.09326171875, "loss": 0.6122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.324177622795105, "rewards/margins": 0.4622408449649811, "rewards/rejected": -0.7864184975624084, "step": 100 }, { "epoch": 0.22857142857142856, "eval_logits/chosen": -2.651167869567871, "eval_logits/rejected": -2.5533361434936523, "eval_logps/chosen": -316.30194091796875, "eval_logps/rejected": -308.60577392578125, "eval_loss": 0.6111233234405518, "eval_rewards/accuracies": 0.6767241358757019, "eval_rewards/chosen": -0.40698006749153137, "eval_rewards/margins": 0.4883164167404175, "eval_rewards/rejected": -0.8952965140342712, "eval_runtime": 90.9103, "eval_samples_per_second": 20.141, "eval_steps_per_second": 0.319, "step": 100 }, { "epoch": 0.25142857142857145, "grad_norm": 12.723184548250023, "learning_rate": 4.660050057270191e-07, "logits/chosen": -2.619276523590088, "logits/rejected": -2.556680202484131, "logps/chosen": -375.2064208984375, "logps/rejected": -391.784423828125, "loss": 0.6021, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.589028000831604, "rewards/margins": 0.3497200608253479, "rewards/rejected": -0.9387480020523071, "step": 110 }, { "epoch": 0.2742857142857143, "grad_norm": 16.182958724416615, "learning_rate": 4.5526448859687144e-07, "logits/chosen": -1.8494535684585571, "logits/rejected": -1.6301162242889404, "logps/chosen": -390.48797607421875, "logps/rejected": -364.620361328125, "loss": 0.5814, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7958351969718933, "rewards/margins": 0.5332263708114624, "rewards/rejected": -1.329061508178711, "step": 120 }, { "epoch": 0.29714285714285715, "grad_norm": 17.332692843610236, "learning_rate": 4.432129880904388e-07, "logits/chosen": -0.4575839638710022, "logits/rejected": -0.06781496107578278, "logps/chosen": -410.9315490722656, "logps/rejected": -413.829833984375, "loss": 0.5548, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.0577561855316162, "rewards/margins": 0.5758394598960876, "rewards/rejected": -1.6335957050323486, "step": 130 }, { "epoch": 0.32, "grad_norm": 20.594750248375647, "learning_rate": 4.299274747394055e-07, "logits/chosen": 0.2059406340122223, "logits/rejected": 0.5167960524559021, "logps/chosen": -435.4883728027344, "logps/rejected": -472.76092529296875, "loss": 0.5654, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.200407862663269, "rewards/margins": 0.8080868721008301, "rewards/rejected": -2.0084948539733887, "step": 140 }, { "epoch": 0.34285714285714286, "grad_norm": 16.29523919912318, "learning_rate": 4.1549280046953653e-07, "logits/chosen": -0.2454165518283844, "logits/rejected": 0.22050300240516663, "logps/chosen": -396.6532287597656, "logps/rejected": -463.4326171875, "loss": 0.5476, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0966728925704956, "rewards/margins": 0.7746630311012268, "rewards/rejected": -1.871335744857788, "step": 150 }, { "epoch": 0.34285714285714286, "eval_logits/chosen": 0.14409177005290985, "eval_logits/rejected": 0.9770079255104065, "eval_logps/chosen": -409.03546142578125, "eval_logps/rejected": -453.3369140625, "eval_loss": 0.5582876801490784, "eval_rewards/accuracies": 0.7370689511299133, "eval_rewards/chosen": -1.3343148231506348, "eval_rewards/margins": 1.0082927942276, "eval_rewards/rejected": -2.3426077365875244, "eval_runtime": 91.388, "eval_samples_per_second": 20.035, "eval_steps_per_second": 0.317, "step": 150 }, { "epoch": 0.3657142857142857, "grad_norm": 31.42845724506196, "learning_rate": 4.000011566683401e-07, "logits/chosen": -0.0020641356240957975, "logits/rejected": 0.659235954284668, "logps/chosen": -442.47259521484375, "logps/rejected": -490.87762451171875, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -1.4313229322433472, "rewards/margins": 0.9210258722305298, "rewards/rejected": -2.352349042892456, "step": 160 }, { "epoch": 0.38857142857142857, "grad_norm": 21.881739335743443, "learning_rate": 3.8355148537705047e-07, "logits/chosen": -0.8011367917060852, "logits/rejected": -0.18294472992420197, "logps/chosen": -420.85791015625, "logps/rejected": -446.387451171875, "loss": 0.5563, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1675300598144531, "rewards/margins": 0.6390342712402344, "rewards/rejected": -1.8065645694732666, "step": 170 }, { "epoch": 0.4114285714285714, "grad_norm": 24.301433957337014, "learning_rate": 3.662488473675315e-07, "logits/chosen": -0.6645376086235046, "logits/rejected": 0.36614301800727844, "logps/chosen": -447.889892578125, "logps/rejected": -494.78070068359375, "loss": 0.5498, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1771008968353271, "rewards/margins": 1.1712000370025635, "rewards/rejected": -2.3483011722564697, "step": 180 }, { "epoch": 0.4342857142857143, "grad_norm": 18.603399872507342, "learning_rate": 3.48203751140067e-07, "logits/chosen": -0.08548859506845474, "logits/rejected": 0.7475250959396362, "logps/chosen": -421.85540771484375, "logps/rejected": -453.6908264160156, "loss": 0.5499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4559787511825562, "rewards/margins": 0.7359476089477539, "rewards/rejected": -2.1919264793395996, "step": 190 }, { "epoch": 0.45714285714285713, "grad_norm": 21.90453363461546, "learning_rate": 3.2953144712759537e-07, "logits/chosen": -0.9407933354377747, "logits/rejected": -0.02539023384451866, "logps/chosen": -380.4794616699219, "logps/rejected": -437.4371643066406, "loss": 0.5582, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1213675737380981, "rewards/margins": 0.9641984701156616, "rewards/rejected": -2.0855660438537598, "step": 200 }, { "epoch": 0.45714285714285713, "eval_logits/chosen": -0.4975701570510864, "eval_logits/rejected": 0.5624167919158936, "eval_logps/chosen": -379.0511169433594, "eval_logps/rejected": -433.3172912597656, "eval_loss": 0.5498641729354858, "eval_rewards/accuracies": 0.732758641242981, "eval_rewards/chosen": -1.034471869468689, "eval_rewards/margins": 1.107939600944519, "eval_rewards/rejected": -2.142411708831787, "eval_runtime": 90.2066, "eval_samples_per_second": 20.298, "eval_steps_per_second": 0.321, "step": 200 }, { "epoch": 0.48, "grad_norm": 16.10426120639833, "learning_rate": 3.103511916141658e-07, "logits/chosen": 0.09185227006673813, "logits/rejected": 0.8966398239135742, "logps/chosen": -387.89202880859375, "logps/rejected": -462.49932861328125, "loss": 0.5404, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2443852424621582, "rewards/margins": 0.9278079271316528, "rewards/rejected": -2.1721930503845215, "step": 210 }, { "epoch": 0.5028571428571429, "grad_norm": 18.780630904417688, "learning_rate": 2.9078548506882117e-07, "logits/chosen": 0.5002994537353516, "logits/rejected": 1.4443576335906982, "logps/chosen": -440.80279541015625, "logps/rejected": -487.53485107421875, "loss": 0.5609, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.5573679208755493, "rewards/margins": 0.8561462163925171, "rewards/rejected": -2.4135143756866455, "step": 220 }, { "epoch": 0.5257142857142857, "grad_norm": 20.610433717594198, "learning_rate": 2.709592897595191e-07, "logits/chosen": 0.22773201763629913, "logits/rejected": 1.2361242771148682, "logps/chosen": -401.34228515625, "logps/rejected": -446.8021545410156, "loss": 0.5442, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.202831506729126, "rewards/margins": 0.8723229169845581, "rewards/rejected": -2.0751543045043945, "step": 230 }, { "epoch": 0.5485714285714286, "grad_norm": 27.325375522779876, "learning_rate": 2.509992316440332e-07, "logits/chosen": 0.26873356103897095, "logits/rejected": 1.303821325302124, "logps/chosen": -431.5526428222656, "logps/rejected": -526.184814453125, "loss": 0.536, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.328427791595459, "rewards/margins": 1.219201922416687, "rewards/rejected": -2.5476298332214355, "step": 240 }, { "epoch": 0.5714285714285714, "grad_norm": 18.92218062691862, "learning_rate": 2.3103279163519918e-07, "logits/chosen": -0.07236287742853165, "logits/rejected": 0.5380650758743286, "logps/chosen": -407.7901306152344, "logps/rejected": -495.40777587890625, "loss": 0.5503, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2716388702392578, "rewards/margins": 0.980434238910675, "rewards/rejected": -2.252073287963867, "step": 250 }, { "epoch": 0.5714285714285714, "eval_logits/chosen": -0.37247952818870544, "eval_logits/rejected": 0.7719168066978455, "eval_logps/chosen": -392.6152038574219, "eval_logps/rejected": -450.1522216796875, "eval_loss": 0.5393335819244385, "eval_rewards/accuracies": 0.7370689511299133, "eval_rewards/chosen": -1.1701123714447021, "eval_rewards/margins": 1.1406482458114624, "eval_rewards/rejected": -2.310760498046875, "eval_runtime": 90.9292, "eval_samples_per_second": 20.137, "eval_steps_per_second": 0.319, "step": 250 }, { "epoch": 0.5942857142857143, "grad_norm": 25.541848941752068, "learning_rate": 2.1118749140573358e-07, "logits/chosen": 0.0009159505134448409, "logits/rejected": 0.6376093626022339, "logps/chosen": -426.14141845703125, "logps/rejected": -502.9112243652344, "loss": 0.5485, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.463547706604004, "rewards/margins": 0.8321346044540405, "rewards/rejected": -2.295682430267334, "step": 260 }, { "epoch": 0.6171428571428571, "grad_norm": 23.51335121897504, "learning_rate": 1.9159007893272703e-07, "logits/chosen": 0.321635901927948, "logits/rejected": 1.6592861413955688, "logps/chosen": -413.24859619140625, "logps/rejected": -473.6759338378906, "loss": 0.5267, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4072272777557373, "rewards/margins": 1.0244569778442383, "rewards/rejected": -2.4316840171813965, "step": 270 }, { "epoch": 0.64, "grad_norm": 27.142819787480168, "learning_rate": 1.7236571898357766e-07, "logits/chosen": 1.0628600120544434, "logits/rejected": 2.0229506492614746, "logps/chosen": -440.122314453125, "logps/rejected": -543.1414794921875, "loss": 0.5316, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6708418130874634, "rewards/margins": 1.129504919052124, "rewards/rejected": -2.8003463745117188, "step": 280 }, { "epoch": 0.6628571428571428, "grad_norm": 24.13150363681131, "learning_rate": 1.5363719371356882e-07, "logits/chosen": 0.698092520236969, "logits/rejected": 1.5312575101852417, "logps/chosen": -450.4425354003906, "logps/rejected": -515.0484008789062, "loss": 0.5339, "rewards/accuracies": 0.75, "rewards/chosen": -1.4953665733337402, "rewards/margins": 0.999632716178894, "rewards/rejected": -2.4949991703033447, "step": 290 }, { "epoch": 0.6857142857142857, "grad_norm": 17.486388226084866, "learning_rate": 1.3552411848071565e-07, "logits/chosen": 0.3839910626411438, "logits/rejected": 1.8341293334960938, "logps/chosen": -441.32183837890625, "logps/rejected": -507.97894287109375, "loss": 0.5224, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3941259384155273, "rewards/margins": 1.1592432260513306, "rewards/rejected": -2.5533692836761475, "step": 300 }, { "epoch": 0.6857142857142857, "eval_logits/chosen": 0.18918734788894653, "eval_logits/rejected": 1.70877206325531, "eval_logps/chosen": -397.884033203125, "eval_logps/rejected": -470.09490966796875, "eval_loss": 0.5312153100967407, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -1.2228009700775146, "eval_rewards/margins": 1.2873866558074951, "eval_rewards/rejected": -2.510187864303589, "eval_runtime": 92.3596, "eval_samples_per_second": 19.825, "eval_steps_per_second": 0.314, "step": 300 }, { "epoch": 0.7085714285714285, "grad_norm": 20.49651474517604, "learning_rate": 1.1814217788631473e-07, "logits/chosen": 0.41669049859046936, "logits/rejected": 1.394052505493164, "logps/chosen": -400.6260986328125, "logps/rejected": -474.28094482421875, "loss": 0.5361, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4432194232940674, "rewards/margins": 0.9276365041732788, "rewards/rejected": -2.3708558082580566, "step": 310 }, { "epoch": 0.7314285714285714, "grad_norm": 18.75776450561332, "learning_rate": 1.0160238692045331e-07, "logits/chosen": 0.7597023844718933, "logits/rejected": 1.6351118087768555, "logps/chosen": -413.95318603515625, "logps/rejected": -488.90460205078125, "loss": 0.542, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6967086791992188, "rewards/margins": 0.782455563545227, "rewards/rejected": -2.479163885116577, "step": 320 }, { "epoch": 0.7542857142857143, "grad_norm": 18.561363930407463, "learning_rate": 8.601038193139438e-08, "logits/chosen": 0.14268045127391815, "logits/rejected": 1.3421038389205933, "logps/chosen": -447.97137451171875, "logps/rejected": -503.50433349609375, "loss": 0.5363, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4875319004058838, "rewards/margins": 1.0696327686309814, "rewards/rejected": -2.5571646690368652, "step": 330 }, { "epoch": 0.7771428571428571, "grad_norm": 17.499558797451687, "learning_rate": 7.146574594727572e-08, "logits/chosen": 0.3810690939426422, "logits/rejected": 1.2245051860809326, "logps/chosen": -414.9021911621094, "logps/rejected": -506.65045166015625, "loss": 0.5285, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.489512324333191, "rewards/margins": 1.1567548513412476, "rewards/rejected": -2.6462674140930176, "step": 340 }, { "epoch": 0.8, "grad_norm": 17.943689215599328, "learning_rate": 5.8061372659157306e-08, "logits/chosen": 0.24244177341461182, "logits/rejected": 1.3491809368133545, "logps/chosen": -441.5047912597656, "logps/rejected": -494.35626220703125, "loss": 0.5396, "rewards/accuracies": 0.6875, "rewards/chosen": -1.517975091934204, "rewards/margins": 0.8826116323471069, "rewards/rejected": -2.4005866050720215, "step": 350 }, { "epoch": 0.8, "eval_logits/chosen": 0.4364562928676605, "eval_logits/rejected": 1.9215292930603027, "eval_logps/chosen": -420.2202453613281, "eval_logps/rejected": -493.9275207519531, "eval_loss": 0.5290318131446838, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.4461628198623657, "eval_rewards/margins": 1.3023512363433838, "eval_rewards/rejected": -2.748514175415039, "eval_runtime": 91.6979, "eval_samples_per_second": 19.968, "eval_steps_per_second": 0.316, "step": 350 }, { "epoch": 0.8228571428571428, "grad_norm": 16.780749890709036, "learning_rate": 4.5882873127531614e-08, "logits/chosen": 0.174576535820961, "logits/rejected": 1.4981176853179932, "logps/chosen": -435.602783203125, "logps/rejected": -510.8885192871094, "loss": 0.5205, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.501977801322937, "rewards/margins": 1.0960423946380615, "rewards/rejected": -2.598020076751709, "step": 360 }, { "epoch": 0.8457142857142858, "grad_norm": 19.04569651937684, "learning_rate": 3.500802900154412e-08, "logits/chosen": 0.34421294927597046, "logits/rejected": 1.787302017211914, "logps/chosen": -412.97747802734375, "logps/rejected": -499.79034423828125, "loss": 0.528, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.4191606044769287, "rewards/margins": 1.1945868730545044, "rewards/rejected": -2.6137473583221436, "step": 370 }, { "epoch": 0.8685714285714285, "grad_norm": 21.067585045477745, "learning_rate": 2.550629574310309e-08, "logits/chosen": 0.211051344871521, "logits/rejected": 1.5275977849960327, "logps/chosen": -486.8960876464844, "logps/rejected": -515.337646484375, "loss": 0.5294, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6883971691131592, "rewards/margins": 0.8909848928451538, "rewards/rejected": -2.5793819427490234, "step": 380 }, { "epoch": 0.8914285714285715, "grad_norm": 21.227279903684668, "learning_rate": 1.7438359028687983e-08, "logits/chosen": 0.37176352739334106, "logits/rejected": 1.208251714706421, "logps/chosen": -453.6361389160156, "logps/rejected": -538.0291748046875, "loss": 0.5333, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4226316213607788, "rewards/margins": 1.0099334716796875, "rewards/rejected": -2.432565212249756, "step": 390 }, { "epoch": 0.9142857142857143, "grad_norm": 35.72120712786558, "learning_rate": 1.0855747162029361e-08, "logits/chosen": 0.5662034749984741, "logits/rejected": 1.0855852365493774, "logps/chosen": -437.5174865722656, "logps/rejected": -510.6676330566406, "loss": 0.55, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.587842345237732, "rewards/margins": 0.8530977368354797, "rewards/rejected": -2.4409401416778564, "step": 400 }, { "epoch": 0.9142857142857143, "eval_logits/chosen": 0.49114343523979187, "eval_logits/rejected": 1.9856219291687012, "eval_logps/chosen": -417.03155517578125, "eval_logps/rejected": -493.2509765625, "eval_loss": 0.5286471843719482, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -1.414276123046875, "eval_rewards/margins": 1.3274718523025513, "eval_rewards/rejected": -2.741748094558716, "eval_runtime": 91.527, "eval_samples_per_second": 20.005, "eval_steps_per_second": 0.317, "step": 400 }, { "epoch": 0.9371428571428572, "grad_norm": 21.313276080994388, "learning_rate": 5.8005019731033615e-09, "logits/chosen": 0.33736371994018555, "logits/rejected": 1.3800859451293945, "logps/chosen": -453.69744873046875, "logps/rejected": -516.6829833984375, "loss": 0.5264, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.6551754474639893, "rewards/margins": 0.9143539667129517, "rewards/rejected": -2.5695290565490723, "step": 410 }, { "epoch": 0.96, "grad_norm": 19.39515946553055, "learning_rate": 2.3049103053431886e-09, "logits/chosen": 0.2167482078075409, "logits/rejected": 1.6823341846466064, "logps/chosen": -409.4588928222656, "logps/rejected": -498.9947814941406, "loss": 0.5293, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.267107605934143, "rewards/margins": 1.3829718828201294, "rewards/rejected": -2.6500792503356934, "step": 420 }, { "epoch": 0.9828571428571429, "grad_norm": 20.626302042234812, "learning_rate": 3.9129780600541397e-10, "logits/chosen": 0.5624532699584961, "logits/rejected": 1.5469181537628174, "logps/chosen": -430.54388427734375, "logps/rejected": -515.5368041992188, "loss": 0.5296, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.483120083808899, "rewards/margins": 1.0355522632598877, "rewards/rejected": -2.518672466278076, "step": 430 }, { "epoch": 0.9988571428571429, "step": 437, "total_flos": 0.0, "train_loss": 0.5693180419214803, "train_runtime": 11386.9149, "train_samples_per_second": 4.918, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 437, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }