{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989071038251366, "eval_steps": 400, "global_step": 457, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01092896174863388, "grad_norm": 315.2451683922351, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -1.0065257549285889, "logits/rejected": -1.0008176565170288, "logps/chosen": -0.28065255284309387, "logps/rejected": -0.28539329767227173, "loss": 3.4114, "rewards/accuracies": 0.53125, "rewards/chosen": -2.806525468826294, "rewards/margins": 0.04740738496184349, "rewards/rejected": -2.8539328575134277, "semantic_entropy": 0.7513969540596008, "step": 5 }, { "epoch": 0.02185792349726776, "grad_norm": 181.1484663719842, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -1.0534369945526123, "logits/rejected": -1.0029994249343872, "logps/chosen": -0.2570807933807373, "logps/rejected": -0.27113229036331177, "loss": 3.3911, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.570807933807373, "rewards/margins": 0.14051488041877747, "rewards/rejected": -2.711322784423828, "semantic_entropy": 0.710273802280426, "step": 10 }, { "epoch": 0.03278688524590164, "grad_norm": 178.65067568018998, "learning_rate": 3.260869565217391e-07, "logits/chosen": -1.0082308053970337, "logits/rejected": -0.9609392285346985, "logps/chosen": -0.26744094491004944, "logps/rejected": -0.27332359552383423, "loss": 3.3533, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.6744089126586914, "rewards/margins": 0.05882669612765312, "rewards/rejected": -2.7332358360290527, "semantic_entropy": 0.7273439168930054, "step": 15 }, { "epoch": 0.04371584699453552, "grad_norm": 258.46792679447157, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.9462105631828308, "logits/rejected": -0.8957524299621582, "logps/chosen": -0.27257752418518066, "logps/rejected": -0.2848864197731018, "loss": 3.3976, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.7257750034332275, "rewards/margins": 0.12308906018733978, "rewards/rejected": -2.8488640785217285, "semantic_entropy": 0.7455072999000549, "step": 20 }, { "epoch": 0.0546448087431694, "grad_norm": 273.0706988791117, "learning_rate": 5.434782608695652e-07, "logits/chosen": -0.9422909617424011, "logits/rejected": -0.8697713613510132, "logps/chosen": -0.2761459946632385, "logps/rejected": -0.2941877543926239, "loss": 3.346, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.761460065841675, "rewards/margins": 0.18041765689849854, "rewards/rejected": -2.9418773651123047, "semantic_entropy": 0.7553174495697021, "step": 25 }, { "epoch": 0.06557377049180328, "grad_norm": 260.6158887162735, "learning_rate": 6.521739130434782e-07, "logits/chosen": -1.0548616647720337, "logits/rejected": -0.9892600774765015, "logps/chosen": -0.2682558596134186, "logps/rejected": -0.284037709236145, "loss": 3.4058, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.682558536529541, "rewards/margins": 0.1578185111284256, "rewards/rejected": -2.8403773307800293, "semantic_entropy": 0.7250551581382751, "step": 30 }, { "epoch": 0.07650273224043716, "grad_norm": 124.4121289092538, "learning_rate": 7.608695652173913e-07, "logits/chosen": -1.0096337795257568, "logits/rejected": -0.9423675537109375, "logps/chosen": -0.2600244879722595, "logps/rejected": -0.27900081872940063, "loss": 3.313, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.6002449989318848, "rewards/margins": 0.18976299464702606, "rewards/rejected": -2.790008068084717, "semantic_entropy": 0.7209498882293701, "step": 35 }, { "epoch": 0.08743169398907104, "grad_norm": 147.2873491379567, "learning_rate": 8.695652173913043e-07, "logits/chosen": -0.9600120782852173, "logits/rejected": -0.8983286619186401, "logps/chosen": -0.2835314869880676, "logps/rejected": -0.2980334460735321, "loss": 3.454, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.835314989089966, "rewards/margins": 0.1450195610523224, "rewards/rejected": -2.980334520339966, "semantic_entropy": 0.7609063386917114, "step": 40 }, { "epoch": 0.09836065573770492, "grad_norm": 93.94573860721647, "learning_rate": 9.782608695652173e-07, "logits/chosen": -1.0171349048614502, "logits/rejected": -0.9333709478378296, "logps/chosen": -0.28658169507980347, "logps/rejected": -0.3058907389640808, "loss": 3.1741, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.865816831588745, "rewards/margins": 0.1930905282497406, "rewards/rejected": -3.0589072704315186, "semantic_entropy": 0.7610034346580505, "step": 45 }, { "epoch": 0.1092896174863388, "grad_norm": 286.4609615603791, "learning_rate": 9.997663088532014e-07, "logits/chosen": -0.9543835520744324, "logits/rejected": -0.8730956315994263, "logps/chosen": -0.2823755145072937, "logps/rejected": -0.2902544140815735, "loss": 3.3278, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.8237555027008057, "rewards/margins": 0.07878823578357697, "rewards/rejected": -2.902543544769287, "semantic_entropy": 0.7529318332672119, "step": 50 }, { "epoch": 0.12021857923497267, "grad_norm": 122.74091082377944, "learning_rate": 9.98817312944725e-07, "logits/chosen": -0.9809161424636841, "logits/rejected": -0.8649328947067261, "logps/chosen": -0.2803560495376587, "logps/rejected": -0.3149644732475281, "loss": 3.1758, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.803560256958008, "rewards/margins": 0.3460845947265625, "rewards/rejected": -3.1496450901031494, "semantic_entropy": 0.7631497979164124, "step": 55 }, { "epoch": 0.13114754098360656, "grad_norm": 87.3444735452583, "learning_rate": 9.971397915250336e-07, "logits/chosen": -1.033070683479309, "logits/rejected": -0.9856836199760437, "logps/chosen": -0.2708079218864441, "logps/rejected": -0.31113672256469727, "loss": 3.0283, "rewards/accuracies": 0.59375, "rewards/chosen": -2.7080790996551514, "rewards/margins": 0.40328770875930786, "rewards/rejected": -3.1113669872283936, "semantic_entropy": 0.7593907713890076, "step": 60 }, { "epoch": 0.14207650273224043, "grad_norm": 162.53486819318002, "learning_rate": 9.94736194623663e-07, "logits/chosen": -1.0021493434906006, "logits/rejected": -0.9318512082099915, "logps/chosen": -0.31085288524627686, "logps/rejected": -0.3405633866786957, "loss": 3.2559, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.1085288524627686, "rewards/margins": 0.29710477590560913, "rewards/rejected": -3.4056334495544434, "semantic_entropy": 0.8082467317581177, "step": 65 }, { "epoch": 0.15300546448087432, "grad_norm": 348.65729079716573, "learning_rate": 9.916100327075037e-07, "logits/chosen": -0.9440506100654602, "logits/rejected": -0.9250672459602356, "logps/chosen": -0.29765281081199646, "logps/rejected": -0.3226909935474396, "loss": 3.0703, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.9765281677246094, "rewards/margins": 0.2503815293312073, "rewards/rejected": -3.22691011428833, "semantic_entropy": 0.7739163041114807, "step": 70 }, { "epoch": 0.16393442622950818, "grad_norm": 78.90080101900976, "learning_rate": 9.877658715537428e-07, "logits/chosen": -0.9282974004745483, "logits/rejected": -0.912223219871521, "logps/chosen": -0.3213742971420288, "logps/rejected": -0.351571649312973, "loss": 3.1657, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.213742733001709, "rewards/margins": 0.30197376012802124, "rewards/rejected": -3.515717029571533, "semantic_entropy": 0.8054312467575073, "step": 75 }, { "epoch": 0.17486338797814208, "grad_norm": 86.95229439852206, "learning_rate": 9.832093255815216e-07, "logits/chosen": -0.9330040812492371, "logits/rejected": -0.8699474334716797, "logps/chosen": -0.3141597807407379, "logps/rejected": -0.3350343406200409, "loss": 3.1277, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.1415977478027344, "rewards/margins": 0.20874571800231934, "rewards/rejected": -3.350343704223633, "semantic_entropy": 0.7814024686813354, "step": 80 }, { "epoch": 0.18579234972677597, "grad_norm": 129.31831333568232, "learning_rate": 9.779470496520441e-07, "logits/chosen": -0.9329907298088074, "logits/rejected": -0.8826324343681335, "logps/chosen": -0.31158381700515747, "logps/rejected": -0.3667066693305969, "loss": 2.9946, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.1158382892608643, "rewards/margins": 0.5512284636497498, "rewards/rejected": -3.667067050933838, "semantic_entropy": 0.802832305431366, "step": 85 }, { "epoch": 0.19672131147540983, "grad_norm": 101.69393552819945, "learning_rate": 9.719867293491144e-07, "logits/chosen": -1.0058772563934326, "logits/rejected": -0.9232236742973328, "logps/chosen": -0.33966144919395447, "logps/rejected": -0.37590381503105164, "loss": 3.0893, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3966145515441895, "rewards/margins": 0.3624236285686493, "rewards/rejected": -3.759038209915161, "semantic_entropy": 0.8507563471794128, "step": 90 }, { "epoch": 0.20765027322404372, "grad_norm": 91.30230852825771, "learning_rate": 9.653370697542987e-07, "logits/chosen": -0.9487398862838745, "logits/rejected": -0.9483828544616699, "logps/chosen": -0.3381520211696625, "logps/rejected": -0.362968385219574, "loss": 2.9005, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.3815205097198486, "rewards/margins": 0.24816343188285828, "rewards/rejected": -3.62968373298645, "semantic_entropy": 0.8359481692314148, "step": 95 }, { "epoch": 0.2185792349726776, "grad_norm": 108.89056387914384, "learning_rate": 9.580077827331037e-07, "logits/chosen": -0.9571771621704102, "logits/rejected": -0.9095252752304077, "logps/chosen": -0.3725859522819519, "logps/rejected": -0.4279399514198303, "loss": 2.8907, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.7258598804473877, "rewards/margins": 0.553540050983429, "rewards/rejected": -4.2793989181518555, "semantic_entropy": 0.8787097930908203, "step": 100 }, { "epoch": 0.22950819672131148, "grad_norm": 127.65723295420393, "learning_rate": 9.500095727507419e-07, "logits/chosen": -1.008998155593872, "logits/rejected": -0.9757212400436401, "logps/chosen": -0.3544849753379822, "logps/rejected": -0.3977915942668915, "loss": 2.9862, "rewards/accuracies": 0.625, "rewards/chosen": -3.5448498725891113, "rewards/margins": 0.4330664277076721, "rewards/rejected": -3.9779160022735596, "semantic_entropy": 0.8548823595046997, "step": 105 }, { "epoch": 0.24043715846994534, "grad_norm": 114.56272891566377, "learning_rate": 9.413541212382004e-07, "logits/chosen": -0.9939772486686707, "logits/rejected": -0.9762369990348816, "logps/chosen": -0.3617566227912903, "logps/rejected": -0.4416491985321045, "loss": 2.8457, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.617565870285034, "rewards/margins": 0.7989261746406555, "rewards/rejected": -4.416492462158203, "semantic_entropy": 0.8938226699829102, "step": 110 }, { "epoch": 0.25136612021857924, "grad_norm": 93.3647181422965, "learning_rate": 9.320540695314438e-07, "logits/chosen": -1.007943868637085, "logits/rejected": -0.9657400846481323, "logps/chosen": -0.3646220564842224, "logps/rejected": -0.46302324533462524, "loss": 2.7542, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.6462206840515137, "rewards/margins": 0.9840116500854492, "rewards/rejected": -4.630232334136963, "semantic_entropy": 0.878681480884552, "step": 115 }, { "epoch": 0.26229508196721313, "grad_norm": 83.92874813803249, "learning_rate": 9.221230004086721e-07, "logits/chosen": -1.0344518423080444, "logits/rejected": -0.9601195454597473, "logps/chosen": -0.37165606021881104, "logps/rejected": -0.42726248502731323, "loss": 2.8688, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.7165608406066895, "rewards/margins": 0.5560643076896667, "rewards/rejected": -4.272624969482422, "semantic_entropy": 0.8952873945236206, "step": 120 }, { "epoch": 0.273224043715847, "grad_norm": 190.904470995411, "learning_rate": 9.11575418252596e-07, "logits/chosen": -0.9347244501113892, "logits/rejected": -0.8975458145141602, "logps/chosen": -0.38535335659980774, "logps/rejected": -0.4541945457458496, "loss": 2.7207, "rewards/accuracies": 0.6875, "rewards/chosen": -3.853533983230591, "rewards/margins": 0.68841153383255, "rewards/rejected": -4.541945457458496, "semantic_entropy": 0.9004859924316406, "step": 125 }, { "epoch": 0.28415300546448086, "grad_norm": 92.76045922249655, "learning_rate": 9.004267278667031e-07, "logits/chosen": -0.9624107480049133, "logits/rejected": -0.9534618258476257, "logps/chosen": -0.4056780934333801, "logps/rejected": -0.5245551466941833, "loss": 2.7139, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.056780815124512, "rewards/margins": 1.1887714862823486, "rewards/rejected": -5.245552062988281, "semantic_entropy": 0.8837997317314148, "step": 130 }, { "epoch": 0.29508196721311475, "grad_norm": 73.9902335755549, "learning_rate": 8.886932119764565e-07, "logits/chosen": -1.0003821849822998, "logits/rejected": -0.9125338792800903, "logps/chosen": -0.3864729106426239, "logps/rejected": -0.4857531189918518, "loss": 2.6137, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.8647284507751465, "rewards/margins": 0.9928020238876343, "rewards/rejected": -4.8575310707092285, "semantic_entropy": 0.9038194417953491, "step": 135 }, { "epoch": 0.30601092896174864, "grad_norm": 86.17327659708472, "learning_rate": 8.763920074482809e-07, "logits/chosen": -0.9963301420211792, "logits/rejected": -0.9396141767501831, "logps/chosen": -0.4139133095741272, "logps/rejected": -0.5436104536056519, "loss": 2.4158, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.139132976531982, "rewards/margins": 1.296971082687378, "rewards/rejected": -5.4361042976379395, "semantic_entropy": 0.9314233064651489, "step": 140 }, { "epoch": 0.31693989071038253, "grad_norm": 95.91548705026597, "learning_rate": 8.635410802610723e-07, "logits/chosen": -0.9837471842765808, "logits/rejected": -0.96197909116745, "logps/chosen": -0.3892672657966614, "logps/rejected": -0.4463191032409668, "loss": 2.5469, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8926727771759033, "rewards/margins": 0.5705188512802124, "rewards/rejected": -4.463191986083984, "semantic_entropy": 0.9131715893745422, "step": 145 }, { "epoch": 0.32786885245901637, "grad_norm": 62.831854072245996, "learning_rate": 8.501591992667849e-07, "logits/chosen": -1.0432965755462646, "logits/rejected": -1.0063092708587646, "logps/chosen": -0.42201298475265503, "logps/rejected": -0.5824503302574158, "loss": 2.4081, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.22012996673584, "rewards/margins": 1.6043736934661865, "rewards/rejected": -5.8245038986206055, "semantic_entropy": 0.9204033613204956, "step": 150 }, { "epoch": 0.33879781420765026, "grad_norm": 71.34642946039108, "learning_rate": 8.362659087784152e-07, "logits/chosen": -1.0033342838287354, "logits/rejected": -0.943057656288147, "logps/chosen": -0.4163185656070709, "logps/rejected": -0.5151209831237793, "loss": 2.5079, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.163184642791748, "rewards/margins": 0.9880247116088867, "rewards/rejected": -5.151209831237793, "semantic_entropy": 0.9188691973686218, "step": 155 }, { "epoch": 0.34972677595628415, "grad_norm": 95.49736374278223, "learning_rate": 8.218815000254231e-07, "logits/chosen": -1.036727786064148, "logits/rejected": -0.9749704599380493, "logps/chosen": -0.46870869398117065, "logps/rejected": -0.5485578775405884, "loss": 2.5679, "rewards/accuracies": 0.6875, "rewards/chosen": -4.68708610534668, "rewards/margins": 0.7984916567802429, "rewards/rejected": -5.4855780601501465, "semantic_entropy": 0.9369996786117554, "step": 160 }, { "epoch": 0.36065573770491804, "grad_norm": 90.06304805222751, "learning_rate": 8.07026981518276e-07, "logits/chosen": -1.0219743251800537, "logits/rejected": -0.9637954831123352, "logps/chosen": -0.4483868181705475, "logps/rejected": -0.5347827076911926, "loss": 2.5189, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.48386812210083, "rewards/margins": 0.8639583587646484, "rewards/rejected": -5.3478264808654785, "semantic_entropy": 0.9511811137199402, "step": 165 }, { "epoch": 0.37158469945355194, "grad_norm": 99.59126068255084, "learning_rate": 7.917240483654e-07, "logits/chosen": -1.01731276512146, "logits/rejected": -0.9495924115180969, "logps/chosen": -0.44470348954200745, "logps/rejected": -0.5198506712913513, "loss": 2.6168, "rewards/accuracies": 0.6875, "rewards/chosen": -4.447035312652588, "rewards/margins": 0.7514716386795044, "rewards/rejected": -5.198506832122803, "semantic_entropy": 0.9501636624336243, "step": 170 }, { "epoch": 0.3825136612021858, "grad_norm": 91.44599232569668, "learning_rate": 7.759950505873521e-07, "logits/chosen": -1.067455768585205, "logits/rejected": -1.031198263168335, "logps/chosen": -0.4639251232147217, "logps/rejected": -0.5280762910842896, "loss": 2.4665, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.639250755310059, "rewards/margins": 0.6415112614631653, "rewards/rejected": -5.280762672424316, "semantic_entropy": 0.9261935949325562, "step": 175 }, { "epoch": 0.39344262295081966, "grad_norm": 90.82885753738844, "learning_rate": 7.598629604744872e-07, "logits/chosen": -1.0707954168319702, "logits/rejected": -1.0595567226409912, "logps/chosen": -0.43981847167015076, "logps/rejected": -0.5758017897605896, "loss": 2.3437, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -4.398184776306152, "rewards/margins": 1.3598332405090332, "rewards/rejected": -5.7580180168151855, "semantic_entropy": 0.966151237487793, "step": 180 }, { "epoch": 0.40437158469945356, "grad_norm": 67.92403731887116, "learning_rate": 7.433513390357989e-07, "logits/chosen": -1.108884572982788, "logits/rejected": -1.1143901348114014, "logps/chosen": -0.46474918723106384, "logps/rejected": -0.5912537574768066, "loss": 2.3628, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -4.647492408752441, "rewards/margins": 1.265044927597046, "rewards/rejected": -5.912537097930908, "semantic_entropy": 0.9750612378120422, "step": 185 }, { "epoch": 0.41530054644808745, "grad_norm": 108.44092763175198, "learning_rate": 7.264843015879321e-07, "logits/chosen": -1.1020928621292114, "logits/rejected": -1.0545780658721924, "logps/chosen": -0.4519892632961273, "logps/rejected": -0.6003154516220093, "loss": 2.4166, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -4.51989221572876, "rewards/margins": 1.483262062072754, "rewards/rejected": -6.003154754638672, "semantic_entropy": 0.9666361808776855, "step": 190 }, { "epoch": 0.4262295081967213, "grad_norm": 83.4733675057699, "learning_rate": 7.092864825346266e-07, "logits/chosen": -1.129482626914978, "logits/rejected": -1.0993843078613281, "logps/chosen": -0.5358282327651978, "logps/rejected": -0.7053772211074829, "loss": 2.467, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.358282566070557, "rewards/margins": 1.6954904794692993, "rewards/rejected": -7.053772926330566, "semantic_entropy": 0.9725033044815063, "step": 195 }, { "epoch": 0.4371584699453552, "grad_norm": 84.36629950168863, "learning_rate": 6.917829993880302e-07, "logits/chosen": -1.1204617023468018, "logits/rejected": -1.0279228687286377, "logps/chosen": -0.5025330185890198, "logps/rejected": -0.6288330554962158, "loss": 2.3371, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.025330543518066, "rewards/margins": 1.263000249862671, "rewards/rejected": -6.288330078125, "semantic_entropy": 0.9786258935928345, "step": 200 }, { "epoch": 0.44808743169398907, "grad_norm": 69.08292746057649, "learning_rate": 6.739994160844309e-07, "logits/chosen": -1.0733792781829834, "logits/rejected": -1.0833173990249634, "logps/chosen": -0.4674602448940277, "logps/rejected": -0.6061697006225586, "loss": 2.2298, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.67460298538208, "rewards/margins": 1.3870941400527954, "rewards/rejected": -6.061697006225586, "semantic_entropy": 0.9808717966079712, "step": 205 }, { "epoch": 0.45901639344262296, "grad_norm": 173.88772431903314, "learning_rate": 6.559617056479827e-07, "logits/chosen": -1.1001962423324585, "logits/rejected": -1.0926573276519775, "logps/chosen": -0.5027323961257935, "logps/rejected": -0.6716328263282776, "loss": 2.2974, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.027324676513672, "rewards/margins": 1.6890045404434204, "rewards/rejected": -6.7163286209106445, "semantic_entropy": 0.9590319395065308, "step": 210 }, { "epoch": 0.46994535519125685, "grad_norm": 94.73792447265116, "learning_rate": 6.376962122569567e-07, "logits/chosen": -1.1140978336334229, "logits/rejected": -1.0545861721038818, "logps/chosen": -0.5114679336547852, "logps/rejected": -0.6812509298324585, "loss": 2.4425, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.114679336547852, "rewards/margins": 1.6978304386138916, "rewards/rejected": -6.812510013580322, "semantic_entropy": 1.0018761157989502, "step": 215 }, { "epoch": 0.4808743169398907, "grad_norm": 92.5570818396132, "learning_rate": 6.192296127679192e-07, "logits/chosen": -1.1659886837005615, "logits/rejected": -1.1348073482513428, "logps/chosen": -0.5338795781135559, "logps/rejected": -0.6664601564407349, "loss": 2.2908, "rewards/accuracies": 0.71875, "rewards/chosen": -5.338796138763428, "rewards/margins": 1.325805425643921, "rewards/rejected": -6.664601802825928, "semantic_entropy": 0.9806681871414185, "step": 220 }, { "epoch": 0.4918032786885246, "grad_norm": 69.72669762595787, "learning_rate": 6.005888777540319e-07, "logits/chosen": -1.1951611042022705, "logits/rejected": -1.1498881578445435, "logps/chosen": -0.5304981470108032, "logps/rejected": -0.6832343935966492, "loss": 2.335, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.304981708526611, "rewards/margins": 1.5273630619049072, "rewards/rejected": -6.832344055175781, "semantic_entropy": 1.0007470846176147, "step": 225 }, { "epoch": 0.5027322404371585, "grad_norm": 75.21829123168051, "learning_rate": 5.818012321143773e-07, "logits/chosen": -1.0969598293304443, "logits/rejected": -1.0912028551101685, "logps/chosen": -0.5201154351234436, "logps/rejected": -0.7016697525978088, "loss": 2.2528, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.2011542320251465, "rewards/margins": 1.8155431747436523, "rewards/rejected": -7.016697883605957, "semantic_entropy": 0.9929972887039185, "step": 230 }, { "epoch": 0.5136612021857924, "grad_norm": 83.71823549926938, "learning_rate": 5.628941153118388e-07, "logits/chosen": -1.1018104553222656, "logits/rejected": -1.0575555562973022, "logps/chosen": -0.5212110280990601, "logps/rejected": -0.664139986038208, "loss": 2.2375, "rewards/accuracies": 0.78125, "rewards/chosen": -5.21211051940918, "rewards/margins": 1.4292891025543213, "rewards/rejected": -6.641399383544922, "semantic_entropy": 0.9852234125137329, "step": 235 }, { "epoch": 0.5245901639344263, "grad_norm": 69.35272919599275, "learning_rate": 5.438951412976098e-07, "logits/chosen": -1.1364176273345947, "logits/rejected": -1.141788125038147, "logps/chosen": -0.49681615829467773, "logps/rejected": -0.6832265853881836, "loss": 2.0805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -4.9681620597839355, "rewards/margins": 1.8641045093536377, "rewards/rejected": -6.832266330718994, "semantic_entropy": 1.0116462707519531, "step": 240 }, { "epoch": 0.5355191256830601, "grad_norm": 71.19254935234711, "learning_rate": 5.248320581808619e-07, "logits/chosen": -1.0613957643508911, "logits/rejected": -1.0091025829315186, "logps/chosen": -0.5141640901565552, "logps/rejected": -0.6968377828598022, "loss": 2.1843, "rewards/accuracies": 0.75, "rewards/chosen": -5.141640663146973, "rewards/margins": 1.8267381191253662, "rewards/rejected": -6.96837854385376, "semantic_entropy": 0.9718330502510071, "step": 245 }, { "epoch": 0.546448087431694, "grad_norm": 70.83927823916859, "learning_rate": 5.057327077024744e-07, "logits/chosen": -1.146533727645874, "logits/rejected": -1.1047497987747192, "logps/chosen": -0.5028788447380066, "logps/rejected": -0.6331702470779419, "loss": 2.2718, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.028789043426514, "rewards/margins": 1.3029136657714844, "rewards/rejected": -6.331702709197998, "semantic_entropy": 0.9849420785903931, "step": 250 }, { "epoch": 0.5573770491803278, "grad_norm": 78.29453714448445, "learning_rate": 4.866249845720132e-07, "logits/chosen": -1.1301579475402832, "logits/rejected": -1.091973900794983, "logps/chosen": -0.555388331413269, "logps/rejected": -0.7187477946281433, "loss": 2.1692, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.553883075714111, "rewards/margins": 1.633594274520874, "rewards/rejected": -7.187478065490723, "semantic_entropy": 0.9999436140060425, "step": 255 }, { "epoch": 0.5683060109289617, "grad_norm": 74.66530559540921, "learning_rate": 4.675367957273505e-07, "logits/chosen": -1.096861720085144, "logits/rejected": -1.0846450328826904, "logps/chosen": -0.5131552815437317, "logps/rejected": -0.671288251876831, "loss": 2.1911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.131552219390869, "rewards/margins": 1.5813300609588623, "rewards/rejected": -6.712882041931152, "semantic_entropy": 0.9927156567573547, "step": 260 }, { "epoch": 0.5792349726775956, "grad_norm": 122.60471760919329, "learning_rate": 4.4849601957642285e-07, "logits/chosen": -1.124089002609253, "logits/rejected": -1.0828189849853516, "logps/chosen": -0.5186060070991516, "logps/rejected": -0.6825847625732422, "loss": 2.1964, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.186059474945068, "rewards/margins": 1.6397874355316162, "rewards/rejected": -6.8258466720581055, "semantic_entropy": 0.9880490303039551, "step": 265 }, { "epoch": 0.5901639344262295, "grad_norm": 86.34568213442027, "learning_rate": 4.295304652806592e-07, "logits/chosen": -1.1392979621887207, "logits/rejected": -1.1078673601150513, "logps/chosen": -0.5172940492630005, "logps/rejected": -0.699386477470398, "loss": 2.0791, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.172940254211426, "rewards/margins": 1.8209247589111328, "rewards/rejected": -6.993865013122559, "semantic_entropy": 0.9866276979446411, "step": 270 }, { "epoch": 0.6010928961748634, "grad_norm": 58.20062582439532, "learning_rate": 4.106678321395433e-07, "logits/chosen": -1.1032135486602783, "logits/rejected": -1.0302824974060059, "logps/chosen": -0.5297619104385376, "logps/rejected": -0.627161979675293, "loss": 2.1916, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.2976179122924805, "rewards/margins": 0.9740018844604492, "rewards/rejected": -6.271620750427246, "semantic_entropy": 0.9879854917526245, "step": 275 }, { "epoch": 0.6120218579234973, "grad_norm": 84.89751844734043, "learning_rate": 3.9193566913562915e-07, "logits/chosen": -1.0617036819458008, "logits/rejected": -1.0624239444732666, "logps/chosen": -0.5223734378814697, "logps/rejected": -0.7275804281234741, "loss": 2.1763, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.223733901977539, "rewards/margins": 2.0520694255828857, "rewards/rejected": -7.275804042816162, "semantic_entropy": 0.986528754234314, "step": 280 }, { "epoch": 0.6229508196721312, "grad_norm": 72.19485427611927, "learning_rate": 3.7336133469909623e-07, "logits/chosen": -1.1958709955215454, "logits/rejected": -1.1594369411468506, "logps/chosen": -0.5087668895721436, "logps/rejected": -0.702479898929596, "loss": 2.1358, "rewards/accuracies": 0.8125, "rewards/chosen": -5.0876688957214355, "rewards/margins": 1.9371296167373657, "rewards/rejected": -7.024799346923828, "semantic_entropy": 0.9978361129760742, "step": 285 }, { "epoch": 0.6338797814207651, "grad_norm": 81.98995443073827, "learning_rate": 3.549719567506076e-07, "logits/chosen": -1.1317315101623535, "logits/rejected": -1.0870287418365479, "logps/chosen": -0.5346897840499878, "logps/rejected": -0.6969183087348938, "loss": 2.1523, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.346898078918457, "rewards/margins": 1.62228524684906, "rewards/rejected": -6.969183444976807, "semantic_entropy": 1.0123668909072876, "step": 290 }, { "epoch": 0.644808743169399, "grad_norm": 67.90034025619751, "learning_rate": 3.3679439308082774e-07, "logits/chosen": -1.115994930267334, "logits/rejected": -1.1152690649032593, "logps/chosen": -0.5274439454078674, "logps/rejected": -0.7239035367965698, "loss": 1.9278, "rewards/accuracies": 0.8125, "rewards/chosen": -5.274438381195068, "rewards/margins": 1.9645967483520508, "rewards/rejected": -7.239035606384277, "semantic_entropy": 1.0061827898025513, "step": 295 }, { "epoch": 0.6557377049180327, "grad_norm": 70.85439265034472, "learning_rate": 3.1885519212446716e-07, "logits/chosen": -1.144639253616333, "logits/rejected": -1.1228580474853516, "logps/chosen": -0.542576253414154, "logps/rejected": -0.7291213274002075, "loss": 2.0159, "rewards/accuracies": 0.8125, "rewards/chosen": -5.42576265335083, "rewards/margins": 1.865450143814087, "rewards/rejected": -7.291213035583496, "semantic_entropy": 0.9855409860610962, "step": 300 }, { "epoch": 0.6666666666666666, "grad_norm": 76.04305212768787, "learning_rate": 3.0118055418614295e-07, "logits/chosen": -1.1450592279434204, "logits/rejected": -1.0869606733322144, "logps/chosen": -0.5319762229919434, "logps/rejected": -0.7148723006248474, "loss": 2.1436, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.319762706756592, "rewards/margins": 1.8289600610733032, "rewards/rejected": -7.1487226486206055, "semantic_entropy": 1.007246732711792, "step": 305 }, { "epoch": 0.6775956284153005, "grad_norm": 108.42660481521786, "learning_rate": 2.83796293174686e-07, "logits/chosen": -1.0885827541351318, "logits/rejected": -1.092543363571167, "logps/chosen": -0.5401273369789124, "logps/rejected": -0.748576283454895, "loss": 2.2592, "rewards/accuracies": 0.75, "rewards/chosen": -5.401273250579834, "rewards/margins": 2.084489107131958, "rewards/rejected": -7.485762119293213, "semantic_entropy": 0.9948571920394897, "step": 310 }, { "epoch": 0.6885245901639344, "grad_norm": 70.86094179855972, "learning_rate": 2.6672779890178046e-07, "logits/chosen": -1.1491663455963135, "logits/rejected": -1.1490873098373413, "logps/chosen": -0.5631974935531616, "logps/rejected": -0.6903260946273804, "loss": 2.1308, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -5.631974697113037, "rewards/margins": 1.2712849378585815, "rewards/rejected": -6.903260231018066, "semantic_entropy": 0.9926439523696899, "step": 315 }, { "epoch": 0.6994535519125683, "grad_norm": 57.13767602235213, "learning_rate": 2.500000000000001e-07, "logits/chosen": -1.2022249698638916, "logits/rejected": -1.1517468690872192, "logps/chosen": -0.5420448184013367, "logps/rejected": -0.747357964515686, "loss": 2.0999, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.42044734954834, "rewards/margins": 2.053131580352783, "rewards/rejected": -7.473579406738281, "semantic_entropy": 1.00899338722229, "step": 320 }, { "epoch": 0.7103825136612022, "grad_norm": 82.09122654285451, "learning_rate": 2.3363732751439923e-07, "logits/chosen": -1.1618945598602295, "logits/rejected": -1.143754243850708, "logps/chosen": -0.5291402339935303, "logps/rejected": -0.7228410243988037, "loss": 2.0999, "rewards/accuracies": 0.78125, "rewards/chosen": -5.291402339935303, "rewards/margins": 1.9370079040527344, "rewards/rejected": -7.228410243988037, "semantic_entropy": 1.0088245868682861, "step": 325 }, { "epoch": 0.7213114754098361, "grad_norm": 62.18032792736575, "learning_rate": 2.1766367922083283e-07, "logits/chosen": -1.112157940864563, "logits/rejected": -1.0798307657241821, "logps/chosen": -0.4986172318458557, "logps/rejected": -0.7466678023338318, "loss": 2.0623, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -4.986172676086426, "rewards/margins": 2.4805047512054443, "rewards/rejected": -7.466677188873291, "semantic_entropy": 0.98463374376297, "step": 330 }, { "epoch": 0.73224043715847, "grad_norm": 77.93323322539872, "learning_rate": 2.021023847231202e-07, "logits/chosen": -1.1002051830291748, "logits/rejected": -1.0612647533416748, "logps/chosen": -0.5647180080413818, "logps/rejected": -0.758574366569519, "loss": 2.0578, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -5.647180557250977, "rewards/margins": 1.938563346862793, "rewards/rejected": -7.5857439041137695, "semantic_entropy": 0.9851423501968384, "step": 335 }, { "epoch": 0.7431693989071039, "grad_norm": 86.17026425537334, "learning_rate": 1.869761713800254e-07, "logits/chosen": -1.1088799238204956, "logits/rejected": -1.064263939857483, "logps/chosen": -0.5516521334648132, "logps/rejected": -0.7183089256286621, "loss": 2.1191, "rewards/accuracies": 0.71875, "rewards/chosen": -5.516521453857422, "rewards/margins": 1.6665668487548828, "rewards/rejected": -7.183088779449463, "semantic_entropy": 0.9953676462173462, "step": 340 }, { "epoch": 0.7540983606557377, "grad_norm": 80.58633018790611, "learning_rate": 1.7230713111182164e-07, "logits/chosen": -1.156589150428772, "logits/rejected": -1.1543285846710205, "logps/chosen": -0.5463498830795288, "logps/rejected": -0.7534428238868713, "loss": 2.1553, "rewards/accuracies": 0.8125, "rewards/chosen": -5.463499546051025, "rewards/margins": 2.0709292888641357, "rewards/rejected": -7.534428596496582, "semantic_entropy": 0.9916456341743469, "step": 345 }, { "epoch": 0.7650273224043715, "grad_norm": 89.6142113445473, "learning_rate": 1.5811668813491696e-07, "logits/chosen": -1.1436890363693237, "logits/rejected": -1.124874234199524, "logps/chosen": -0.5143482685089111, "logps/rejected": -0.6786841154098511, "loss": 2.0898, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.143482685089111, "rewards/margins": 1.643358588218689, "rewards/rejected": -6.786840915679932, "semantic_entropy": 0.9892457127571106, "step": 350 }, { "epoch": 0.7759562841530054, "grad_norm": 66.31246023221092, "learning_rate": 1.4442556767166369e-07, "logits/chosen": -1.1231715679168701, "logits/rejected": -1.0935585498809814, "logps/chosen": -0.5266932845115662, "logps/rejected": -0.6977185010910034, "loss": 2.0776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.266932487487793, "rewards/margins": 1.7102525234222412, "rewards/rejected": -6.9771857261657715, "semantic_entropy": 1.0020959377288818, "step": 355 }, { "epoch": 0.7868852459016393, "grad_norm": 83.110534751007, "learning_rate": 1.312537656810549e-07, "logits/chosen": -1.0739078521728516, "logits/rejected": -1.0743262767791748, "logps/chosen": -0.5362976789474487, "logps/rejected": -0.714411735534668, "loss": 2.1365, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.362977504730225, "rewards/margins": 1.7811400890350342, "rewards/rejected": -7.144117832183838, "semantic_entropy": 0.9827717542648315, "step": 360 }, { "epoch": 0.7978142076502732, "grad_norm": 141.91877356203636, "learning_rate": 1.1862051965451214e-07, "logits/chosen": -1.1579176187515259, "logits/rejected": -1.1566094160079956, "logps/chosen": -0.5423863530158997, "logps/rejected": -0.7343495488166809, "loss": 2.054, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.423863410949707, "rewards/margins": 1.9196319580078125, "rewards/rejected": -7.3434953689575195, "semantic_entropy": 1.011482834815979, "step": 365 }, { "epoch": 0.8087431693989071, "grad_norm": 78.59703524378126, "learning_rate": 1.0654428051942138e-07, "logits/chosen": -1.165038824081421, "logits/rejected": -1.1290335655212402, "logps/chosen": -0.5577388405799866, "logps/rejected": -0.7825115919113159, "loss": 2.1829, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.577388286590576, "rewards/margins": 2.247727632522583, "rewards/rejected": -7.825116157531738, "semantic_entropy": 1.0018393993377686, "step": 370 }, { "epoch": 0.819672131147541, "grad_norm": 63.6353735816688, "learning_rate": 9.504268569144763e-08, "logits/chosen": -1.2013657093048096, "logits/rejected": -1.1338837146759033, "logps/chosen": -0.5316141843795776, "logps/rejected": -0.7204681038856506, "loss": 2.0911, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.316141605377197, "rewards/margins": 1.8885393142700195, "rewards/rejected": -7.204681396484375, "semantic_entropy": 1.0054622888565063, "step": 375 }, { "epoch": 0.8306010928961749, "grad_norm": 64.71607446988344, "learning_rate": 8.413253331499049e-08, "logits/chosen": -1.0807088613510132, "logits/rejected": -1.102399230003357, "logps/chosen": -0.549530029296875, "logps/rejected": -0.7361005544662476, "loss": 2.0374, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.49530029296875, "rewards/margins": 1.8657052516937256, "rewards/rejected": -7.361004829406738, "semantic_entropy": 1.0044524669647217, "step": 380 }, { "epoch": 0.8415300546448088, "grad_norm": 67.03443291040514, "learning_rate": 7.382975772939865e-08, "logits/chosen": -1.1790930032730103, "logits/rejected": -1.1615774631500244, "logps/chosen": -0.590388834476471, "logps/rejected": -0.7754439115524292, "loss": 2.1706, "rewards/accuracies": 0.84375, "rewards/chosen": -5.903887748718262, "rewards/margins": 1.850551962852478, "rewards/rejected": -7.754439353942871, "semantic_entropy": 1.0115418434143066, "step": 385 }, { "epoch": 0.8524590163934426, "grad_norm": 103.88519596336293, "learning_rate": 6.414940619677734e-08, "logits/chosen": -1.166526436805725, "logits/rejected": -1.1425046920776367, "logps/chosen": -0.5350316762924194, "logps/rejected": -0.7514439821243286, "loss": 2.1188, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -5.350315570831299, "rewards/margins": 2.164124011993408, "rewards/rejected": -7.514439582824707, "semantic_entropy": 1.0120559930801392, "step": 390 }, { "epoch": 0.8633879781420765, "grad_norm": 71.31872002268815, "learning_rate": 5.5105616925376296e-08, "logits/chosen": -1.1460245847702026, "logits/rejected": -1.1267726421356201, "logps/chosen": -0.5442631244659424, "logps/rejected": -0.6925864815711975, "loss": 2.0494, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -5.442631721496582, "rewards/margins": 1.4832336902618408, "rewards/rejected": -6.925864219665527, "semantic_entropy": 1.0200846195220947, "step": 395 }, { "epoch": 0.8743169398907104, "grad_norm": 75.02341041681119, "learning_rate": 4.6711598420656976e-08, "logits/chosen": -1.0774163007736206, "logits/rejected": -1.0491969585418701, "logps/chosen": -0.5711244344711304, "logps/rejected": -0.7721945643424988, "loss": 2.0424, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.711243629455566, "rewards/margins": 2.010702133178711, "rewards/rejected": -7.721946716308594, "semantic_entropy": 0.9843025207519531, "step": 400 }, { "epoch": 0.8743169398907104, "eval_logits/chosen": -1.373081088066101, "eval_logits/rejected": -1.332649827003479, "eval_logps/chosen": -0.5314387083053589, "eval_logps/rejected": -0.7141891121864319, "eval_loss": 2.0935795307159424, "eval_rewards/accuracies": 0.7771084308624268, "eval_rewards/chosen": -5.314386367797852, "eval_rewards/margins": 1.8275047540664673, "eval_rewards/rejected": -7.1418914794921875, "eval_runtime": 37.9946, "eval_samples_per_second": 34.689, "eval_semantic_entropy": 0.9976296424865723, "eval_steps_per_second": 2.185, "step": 400 }, { "epoch": 0.8852459016393442, "grad_norm": 68.55432141696805, "learning_rate": 3.897961019419516e-08, "logits/chosen": -1.1141546964645386, "logits/rejected": -1.046690583229065, "logps/chosen": -0.5127943754196167, "logps/rejected": -0.656581699848175, "loss": 1.9895, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -5.127943992614746, "rewards/margins": 1.4378730058670044, "rewards/rejected": -6.565816402435303, "semantic_entropy": 1.010331630706787, "step": 405 }, { "epoch": 0.8961748633879781, "grad_norm": 78.80129315841073, "learning_rate": 3.192094485859526e-08, "logits/chosen": -1.1211316585540771, "logits/rejected": -1.1407296657562256, "logps/chosen": -0.5510164499282837, "logps/rejected": -0.7787143588066101, "loss": 2.0771, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.5101637840271, "rewards/margins": 2.2769789695739746, "rewards/rejected": -7.787143707275391, "semantic_entropy": 0.9897591471672058, "step": 410 }, { "epoch": 0.907103825136612, "grad_norm": 63.07603955603986, "learning_rate": 2.5545911634565265e-08, "logits/chosen": -1.1598929166793823, "logits/rejected": -1.1571664810180664, "logps/chosen": -0.5616727471351624, "logps/rejected": -0.7835390567779541, "loss": 2.1028, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -5.616726875305176, "rewards/margins": 2.2186641693115234, "rewards/rejected": -7.835390567779541, "semantic_entropy": 0.9976503252983093, "step": 415 }, { "epoch": 0.9180327868852459, "grad_norm": 89.74277499235377, "learning_rate": 1.9863821294241522e-08, "logits/chosen": -1.1581684350967407, "logits/rejected": -1.1270772218704224, "logps/chosen": -0.5167144536972046, "logps/rejected": -0.7354345321655273, "loss": 1.9853, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.167144298553467, "rewards/margins": 2.1872007846832275, "rewards/rejected": -7.354344844818115, "semantic_entropy": 1.002636194229126, "step": 420 }, { "epoch": 0.9289617486338798, "grad_norm": 71.82508706026904, "learning_rate": 1.4882972562753615e-08, "logits/chosen": -1.1459519863128662, "logits/rejected": -1.1357475519180298, "logps/chosen": -0.5768141746520996, "logps/rejected": -0.7887662053108215, "loss": 2.1228, "rewards/accuracies": 0.78125, "rewards/chosen": -5.768141746520996, "rewards/margins": 2.1195199489593506, "rewards/rejected": -7.887660980224609, "semantic_entropy": 0.9999387860298157, "step": 425 }, { "epoch": 0.9398907103825137, "grad_norm": 91.24129023810345, "learning_rate": 1.0610639997888915e-08, "logits/chosen": -1.0857610702514648, "logits/rejected": -1.0863049030303955, "logps/chosen": -0.520858883857727, "logps/rejected": -0.7221606969833374, "loss": 1.9229, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.208588600158691, "rewards/margins": 2.0130181312561035, "rewards/rejected": -7.221606254577637, "semantic_entropy": 1.0139762163162231, "step": 430 }, { "epoch": 0.9508196721311475, "grad_norm": 88.53510645254667, "learning_rate": 7.053063365559997e-09, "logits/chosen": -1.147918939590454, "logits/rejected": -1.1737738847732544, "logps/chosen": -0.5408393740653992, "logps/rejected": -0.7648274898529053, "loss": 2.0597, "rewards/accuracies": 0.84375, "rewards/chosen": -5.408394813537598, "rewards/margins": 2.2398805618286133, "rewards/rejected": -7.6482744216918945, "semantic_entropy": 1.0074278116226196, "step": 435 }, { "epoch": 0.9617486338797814, "grad_norm": 77.47275941246323, "learning_rate": 4.215438526591064e-09, "logits/chosen": -1.08914053440094, "logits/rejected": -1.0488555431365967, "logps/chosen": -0.5592411160469055, "logps/rejected": -0.7156537175178528, "loss": 2.0346, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.592411518096924, "rewards/margins": 1.5641257762908936, "rewards/rejected": -7.156537055969238, "semantic_entropy": 0.9892482757568359, "step": 440 }, { "epoch": 0.9726775956284153, "grad_norm": 85.37542604306078, "learning_rate": 2.1019098481337426e-09, "logits/chosen": -1.1617168188095093, "logits/rejected": -1.1342122554779053, "logps/chosen": -0.5354763865470886, "logps/rejected": -0.7130267024040222, "loss": 2.0073, "rewards/accuracies": 0.8125, "rewards/chosen": -5.354763984680176, "rewards/margins": 1.7755035161972046, "rewards/rejected": -7.130267143249512, "semantic_entropy": 1.0182139873504639, "step": 445 }, { "epoch": 0.9836065573770492, "grad_norm": 92.55545564226749, "learning_rate": 7.155641507955445e-10, "logits/chosen": -1.0736119747161865, "logits/rejected": -1.0667097568511963, "logps/chosen": -0.5805756449699402, "logps/rejected": -0.7572126388549805, "loss": 2.146, "rewards/accuracies": 0.78125, "rewards/chosen": -5.805756568908691, "rewards/margins": 1.7663694620132446, "rewards/rejected": -7.5721259117126465, "semantic_entropy": 0.9835384488105774, "step": 450 }, { "epoch": 0.994535519125683, "grad_norm": 79.92510322372067, "learning_rate": 5.842620032053824e-11, "logits/chosen": -1.0938892364501953, "logits/rejected": -1.0882636308670044, "logps/chosen": -0.5764094591140747, "logps/rejected": -0.7290435433387756, "loss": 2.187, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.764094829559326, "rewards/margins": 1.5263407230377197, "rewards/rejected": -7.290434837341309, "semantic_entropy": 1.0177193880081177, "step": 455 }, { "epoch": 0.9989071038251366, "step": 457, "total_flos": 0.0, "train_loss": 2.4655840506438875, "train_runtime": 5955.1851, "train_samples_per_second": 9.833, "train_steps_per_second": 0.077 } ], "logging_steps": 5, "max_steps": 457, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }