{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.3054830287206268e-08, "logits/chosen": 0.2694149911403656, "logits/rejected": 0.46839016675949097, "logps/chosen": -5021.38671875, "logps/rejected": -3781.455322265625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.3054830287206266e-07, "logits/chosen": 0.37451040744781494, "logits/rejected": 0.4508368670940399, "logps/chosen": -3258.931884765625, "logps/rejected": -2797.27099609375, "loss": 0.6921, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.008748779073357582, "rewards/margins": 0.004036817234009504, "rewards/rejected": 0.0047119613736867905, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.610966057441253e-07, "logits/chosen": 0.39052361249923706, "logits/rejected": 0.42785000801086426, "logps/chosen": -3101.714111328125, "logps/rejected": -2925.632080078125, "loss": 0.6918, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0038903425447642803, "rewards/margins": 0.0028132761362940073, "rewards/rejected": 0.001077066408470273, "step": 20 }, { "epoch": 0.01, "eval_logits/chosen": 0.40100428462028503, "eval_logits/rejected": 0.4756467342376709, "eval_logps/chosen": -3243.883056640625, "eval_logps/rejected": -2749.733154296875, "eval_loss": 0.69266676902771, "eval_rewards/accuracies": 0.49900001287460327, "eval_rewards/chosen": 0.010503383353352547, "eval_rewards/margins": 0.0016427963273599744, "eval_rewards/rejected": 0.008860588073730469, "eval_runtime": 435.6166, "eval_samples_per_second": 4.591, "eval_steps_per_second": 1.148, "step": 20 }, { "epoch": 0.01, "learning_rate": 3.9164490861618804e-07, "logits/chosen": 0.3738934397697449, "logits/rejected": 0.4079577326774597, "logps/chosen": -3332.12060546875, "logps/rejected": -3117.1923828125, "loss": 0.6918, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.02959837019443512, "rewards/margins": 0.0013076241593807936, "rewards/rejected": 0.028290744870901108, "step": 30 }, { "epoch": 0.01, "learning_rate": 5.221932114882506e-07, "logits/chosen": 0.3971901535987854, "logits/rejected": 0.45270150899887085, "logps/chosen": -3383.08837890625, "logps/rejected": -2872.220703125, "loss": 0.6888, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.08587168902158737, "rewards/margins": 0.014594366773962975, "rewards/rejected": 0.07127733528614044, "step": 40 }, { "epoch": 0.01, "eval_logits/chosen": 0.39199602603912354, "eval_logits/rejected": 0.4667048454284668, "eval_logps/chosen": -3234.010009765625, "eval_logps/rejected": -2741.609619140625, "eval_loss": 0.6864708662033081, "eval_rewards/accuracies": 0.5569999814033508, "eval_rewards/chosen": 0.10923188179731369, "eval_rewards/margins": 0.019136928021907806, "eval_rewards/rejected": 0.09009493887424469, "eval_runtime": 431.2443, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.159, "step": 40 }, { "epoch": 0.01, "learning_rate": 6.527415143603135e-07, "logits/chosen": 0.5431323647499084, "logits/rejected": 0.5686339139938354, "logps/chosen": -2714.167236328125, "logps/rejected": -2374.578857421875, "loss": 0.6844, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1448867917060852, "rewards/margins": 0.02292322739958763, "rewards/rejected": 0.12196356058120728, "step": 50 }, { "epoch": 0.02, "learning_rate": 7.832898172323761e-07, "logits/chosen": 0.3579309582710266, "logits/rejected": 0.43882283568382263, "logps/chosen": -3377.47314453125, "logps/rejected": -3146.041259765625, "loss": 0.6812, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3199368119239807, "rewards/margins": 0.0335029773414135, "rewards/rejected": 0.2864338755607605, "step": 60 }, { "epoch": 0.02, "eval_logits/chosen": 0.3634220361709595, "eval_logits/rejected": 0.438355416059494, "eval_logps/chosen": -3210.522705078125, "eval_logps/rejected": -2721.70361328125, "eval_loss": 0.6778392195701599, "eval_rewards/accuracies": 0.5580000281333923, "eval_rewards/chosen": 0.3441031277179718, "eval_rewards/margins": 0.05494501441717148, "eval_rewards/rejected": 0.2891581058502197, "eval_runtime": 434.0538, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 60 }, { "epoch": 0.02, "learning_rate": 9.138381201044387e-07, "logits/chosen": 0.3434237837791443, "logits/rejected": 0.361980676651001, "logps/chosen": -2560.92919921875, "logps/rejected": -2396.83349609375, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.28493043780326843, "rewards/margins": 0.02466559410095215, "rewards/rejected": 0.26026487350463867, "step": 70 }, { "epoch": 0.02, "learning_rate": 1.0443864229765013e-06, "logits/chosen": 0.49326092004776, "logits/rejected": 0.464181125164032, "logps/chosen": -2978.31494140625, "logps/rejected": -2870.066650390625, "loss": 0.6845, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.428170382976532, "rewards/margins": 0.042138513177633286, "rewards/rejected": 0.3860318958759308, "step": 80 }, { "epoch": 0.02, "eval_logits/chosen": 0.34676727652549744, "eval_logits/rejected": 0.4217112958431244, "eval_logps/chosen": -3194.867431640625, "eval_logps/rejected": -2708.706298828125, "eval_loss": 0.675126850605011, "eval_rewards/accuracies": 0.5529999732971191, "eval_rewards/chosen": 0.5006561279296875, "eval_rewards/margins": 0.08152718842029572, "eval_rewards/rejected": 0.419128954410553, "eval_runtime": 432.9627, "eval_samples_per_second": 4.619, "eval_steps_per_second": 1.155, "step": 80 }, { "epoch": 0.02, "learning_rate": 1.1749347258485642e-06, "logits/chosen": 0.410836398601532, "logits/rejected": 0.3978697657585144, "logps/chosen": -2950.85009765625, "logps/rejected": -2410.281982421875, "loss": 0.6524, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.45929017663002014, "rewards/margins": 0.09510257840156555, "rewards/rejected": 0.364187628030777, "step": 90 }, { "epoch": 0.03, "learning_rate": 1.305483028720627e-06, "logits/chosen": 0.35857781767845154, "logits/rejected": 0.4451242983341217, "logps/chosen": -2968.312744140625, "logps/rejected": -2463.664794921875, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5796502828598022, "rewards/margins": 0.1427154839038849, "rewards/rejected": 0.43693476915359497, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": 0.31475889682769775, "eval_logits/rejected": 0.3896203935146332, "eval_logps/chosen": -3175.369384765625, "eval_logps/rejected": -2692.428955078125, "eval_loss": 0.6733439564704895, "eval_rewards/accuracies": 0.550000011920929, "eval_rewards/chosen": 0.6956388354301453, "eval_rewards/margins": 0.11373422294855118, "eval_rewards/rejected": 0.5819045901298523, "eval_runtime": 431.2279, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.159, "step": 100 }, { "epoch": 0.03, "learning_rate": 1.4360313315926894e-06, "logits/chosen": 0.4156278669834137, "logits/rejected": 0.4004064202308655, "logps/chosen": -3027.26611328125, "logps/rejected": -3013.200439453125, "loss": 0.7238, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5847059488296509, "rewards/margins": -0.019619522616267204, "rewards/rejected": 0.6043254733085632, "step": 110 }, { "epoch": 0.03, "learning_rate": 1.5665796344647521e-06, "logits/chosen": 0.30818819999694824, "logits/rejected": 0.36611881852149963, "logps/chosen": -3279.479736328125, "logps/rejected": -2957.90576171875, "loss": 0.6642, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.5687682032585144, "rewards/margins": 0.04552336782217026, "rewards/rejected": 0.5232447385787964, "step": 120 }, { "epoch": 0.03, "eval_logits/chosen": 0.34161919355392456, "eval_logits/rejected": 0.41587868332862854, "eval_logps/chosen": -3192.629638671875, "eval_logps/rejected": -2707.4033203125, "eval_loss": 0.6705058217048645, "eval_rewards/accuracies": 0.5709999799728394, "eval_rewards/chosen": 0.5230329632759094, "eval_rewards/margins": 0.09087348729372025, "eval_rewards/rejected": 0.4321594834327698, "eval_runtime": 434.683, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 120 }, { "epoch": 0.03, "learning_rate": 1.6971279373368146e-06, "logits/chosen": 0.3913455009460449, "logits/rejected": 0.4176483750343323, "logps/chosen": -3376.279296875, "logps/rejected": -2870.35302734375, "loss": 0.6675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5320384502410889, "rewards/margins": 0.11943557113409042, "rewards/rejected": 0.412602961063385, "step": 130 }, { "epoch": 0.04, "learning_rate": 1.8276762402088774e-06, "logits/chosen": 0.2895314693450928, "logits/rejected": 0.35183554887771606, "logps/chosen": -3270.6298828125, "logps/rejected": -2973.749267578125, "loss": 0.6701, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.6199338436126709, "rewards/margins": 0.07197347283363342, "rewards/rejected": 0.5479603409767151, "step": 140 }, { "epoch": 0.04, "eval_logits/chosen": 0.3169664144515991, "eval_logits/rejected": 0.39726728200912476, "eval_logps/chosen": -3186.456787109375, "eval_logps/rejected": -2702.371826171875, "eval_loss": 0.6715799570083618, "eval_rewards/accuracies": 0.5709999799728394, "eval_rewards/chosen": 0.5847654342651367, "eval_rewards/margins": 0.10229046642780304, "eval_rewards/rejected": 0.4824749827384949, "eval_runtime": 430.5695, "eval_samples_per_second": 4.645, "eval_steps_per_second": 1.161, "step": 140 }, { "epoch": 0.04, "learning_rate": 1.9582245430809403e-06, "logits/chosen": 0.3584805130958557, "logits/rejected": 0.460904598236084, "logps/chosen": -3165.745361328125, "logps/rejected": -2749.85498046875, "loss": 0.706, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6630610227584839, "rewards/margins": 0.0910097137093544, "rewards/rejected": 0.5720512270927429, "step": 150 }, { "epoch": 0.04, "learning_rate": 2.0887728459530026e-06, "logits/chosen": 0.38657838106155396, "logits/rejected": 0.36648914217948914, "logps/chosen": -3097.75390625, "logps/rejected": -3017.968505859375, "loss": 0.7142, "rewards/accuracies": 0.4375, "rewards/chosen": 0.5332592129707336, "rewards/margins": -0.04311450943350792, "rewards/rejected": 0.5763736963272095, "step": 160 }, { "epoch": 0.04, "eval_logits/chosen": 0.33465275168418884, "eval_logits/rejected": 0.41513919830322266, "eval_logps/chosen": -3200.787353515625, "eval_logps/rejected": -2715.60205078125, "eval_loss": 0.6676562428474426, "eval_rewards/accuracies": 0.5849999785423279, "eval_rewards/chosen": 0.44146081805229187, "eval_rewards/margins": 0.09128639101982117, "eval_rewards/rejected": 0.3501743972301483, "eval_runtime": 434.9503, "eval_samples_per_second": 4.598, "eval_steps_per_second": 1.15, "step": 160 }, { "epoch": 0.04, "learning_rate": 2.2193211488250653e-06, "logits/chosen": 0.3804462254047394, "logits/rejected": 0.4036695957183838, "logps/chosen": -3374.642578125, "logps/rejected": -3233.76806640625, "loss": 0.6789, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.28807249665260315, "rewards/margins": -0.00100223277695477, "rewards/rejected": 0.28907471895217896, "step": 170 }, { "epoch": 0.05, "learning_rate": 2.3498694516971284e-06, "logits/chosen": 0.33502498269081116, "logits/rejected": 0.36636993288993835, "logps/chosen": -2849.62841796875, "logps/rejected": -2697.879638671875, "loss": 0.6615, "rewards/accuracies": 0.625, "rewards/chosen": 0.2737550139427185, "rewards/margins": 0.05823422595858574, "rewards/rejected": 0.21552081406116486, "step": 180 }, { "epoch": 0.05, "eval_logits/chosen": 0.33259251713752747, "eval_logits/rejected": 0.41091641783714294, "eval_logps/chosen": -3189.158935546875, "eval_logps/rejected": -2706.587158203125, "eval_loss": 0.6624520421028137, "eval_rewards/accuracies": 0.5989999771118164, "eval_rewards/chosen": 0.5577443242073059, "eval_rewards/margins": 0.11742062121629715, "eval_rewards/rejected": 0.44032371044158936, "eval_runtime": 432.3403, "eval_samples_per_second": 4.626, "eval_steps_per_second": 1.156, "step": 180 }, { "epoch": 0.05, "learning_rate": 2.4804177545691907e-06, "logits/chosen": 0.3444517254829407, "logits/rejected": 0.4270195960998535, "logps/chosen": -3378.428466796875, "logps/rejected": -3194.337890625, "loss": 0.6629, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6465830206871033, "rewards/margins": 0.10114260017871857, "rewards/rejected": 0.5454403758049011, "step": 190 }, { "epoch": 0.05, "learning_rate": 2.610966057441254e-06, "logits/chosen": 0.3368102014064789, "logits/rejected": 0.4088291525840759, "logps/chosen": -3595.17822265625, "logps/rejected": -2792.098876953125, "loss": 0.6665, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9071613550186157, "rewards/margins": 0.2651379108428955, "rewards/rejected": 0.6420234441757202, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": 0.34202033281326294, "eval_logits/rejected": 0.4160567820072174, "eval_logps/chosen": -3151.239990234375, "eval_logps/rejected": -2677.22509765625, "eval_loss": 0.6630626916885376, "eval_rewards/accuracies": 0.5860000252723694, "eval_rewards/chosen": 0.9369348287582397, "eval_rewards/margins": 0.20299381017684937, "eval_rewards/rejected": 0.7339408993721008, "eval_runtime": 433.1077, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.154, "step": 200 }, { "epoch": 0.05, "learning_rate": 2.741514360313316e-06, "logits/chosen": 0.37777179479599, "logits/rejected": 0.3763605058193207, "logps/chosen": -3412.768798828125, "logps/rejected": -2964.60498046875, "loss": 0.7269, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.9503151774406433, "rewards/margins": 0.08324390649795532, "rewards/rejected": 0.8670713305473328, "step": 210 }, { "epoch": 0.06, "learning_rate": 2.872062663185379e-06, "logits/chosen": 0.41821736097335815, "logits/rejected": 0.44176244735717773, "logps/chosen": -3309.788330078125, "logps/rejected": -2743.70166015625, "loss": 0.6708, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.551195502281189, "rewards/margins": 0.13858731091022491, "rewards/rejected": 0.41260820627212524, "step": 220 }, { "epoch": 0.06, "eval_logits/chosen": 0.42348435521125793, "eval_logits/rejected": 0.4935801327228546, "eval_logps/chosen": -3198.706298828125, "eval_logps/rejected": -2718.924560546875, "eval_loss": 0.6642667651176453, "eval_rewards/accuracies": 0.5920000076293945, "eval_rewards/chosen": 0.46227169036865234, "eval_rewards/margins": 0.14532099664211273, "eval_rewards/rejected": 0.3169506788253784, "eval_runtime": 434.2445, "eval_samples_per_second": 4.606, "eval_steps_per_second": 1.151, "step": 220 }, { "epoch": 0.06, "learning_rate": 3.0026109660574416e-06, "logits/chosen": 0.5542663335800171, "logits/rejected": 0.6077815294265747, "logps/chosen": -2831.956787109375, "logps/rejected": -2464.8876953125, "loss": 0.6798, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.3584771156311035, "rewards/margins": 0.11843252182006836, "rewards/rejected": 0.24004459381103516, "step": 230 }, { "epoch": 0.06, "learning_rate": 3.1331592689295043e-06, "logits/chosen": 0.341006338596344, "logits/rejected": 0.3886900246143341, "logps/chosen": -3320.176513671875, "logps/rejected": -2941.79736328125, "loss": 0.683, "rewards/accuracies": 0.5625, "rewards/chosen": 0.6637318134307861, "rewards/margins": 0.16271236538887024, "rewards/rejected": 0.5010194182395935, "step": 240 }, { "epoch": 0.06, "eval_logits/chosen": 0.3650403320789337, "eval_logits/rejected": 0.4461342692375183, "eval_logps/chosen": -3192.144287109375, "eval_logps/rejected": -2712.76220703125, "eval_loss": 0.6630212664604187, "eval_rewards/accuracies": 0.6159999966621399, "eval_rewards/chosen": 0.5278900861740112, "eval_rewards/margins": 0.14932134747505188, "eval_rewards/rejected": 0.37856873869895935, "eval_runtime": 431.407, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 240 }, { "epoch": 0.07, "learning_rate": 3.263707571801567e-06, "logits/chosen": 0.30492913722991943, "logits/rejected": 0.3574830889701843, "logps/chosen": -3402.729248046875, "logps/rejected": -3008.567626953125, "loss": 0.7329, "rewards/accuracies": 0.6875, "rewards/chosen": 0.7251652479171753, "rewards/margins": 0.1593145728111267, "rewards/rejected": 0.5658507347106934, "step": 250 }, { "epoch": 0.07, "learning_rate": 3.3942558746736293e-06, "logits/chosen": 0.4338935911655426, "logits/rejected": 0.52497798204422, "logps/chosen": -3234.494140625, "logps/rejected": -2531.953857421875, "loss": 0.6545, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7618404626846313, "rewards/margins": 0.2613573670387268, "rewards/rejected": 0.5004830956459045, "step": 260 }, { "epoch": 0.07, "eval_logits/chosen": 0.3391437232494354, "eval_logits/rejected": 0.42327457666397095, "eval_logps/chosen": -3174.3583984375, "eval_logps/rejected": -2696.804931640625, "eval_loss": 0.6642194390296936, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": 0.7057454586029053, "eval_rewards/margins": 0.16760458052158356, "eval_rewards/rejected": 0.5381408929824829, "eval_runtime": 435.7139, "eval_samples_per_second": 4.59, "eval_steps_per_second": 1.148, "step": 260 }, { "epoch": 0.07, "learning_rate": 3.524804177545692e-06, "logits/chosen": 0.34105032682418823, "logits/rejected": 0.40103426575660706, "logps/chosen": -3716.36474609375, "logps/rejected": -3368.737548828125, "loss": 0.6835, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.8102656602859497, "rewards/margins": 0.009944954887032509, "rewards/rejected": 0.8003206253051758, "step": 270 }, { "epoch": 0.07, "learning_rate": 3.6553524804177547e-06, "logits/chosen": 0.360210120677948, "logits/rejected": 0.3961041271686554, "logps/chosen": -3065.91064453125, "logps/rejected": -2726.56494140625, "loss": 0.6447, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6820917725563049, "rewards/margins": 0.16703075170516968, "rewards/rejected": 0.5150610208511353, "step": 280 }, { "epoch": 0.07, "eval_logits/chosen": 0.2968996465206146, "eval_logits/rejected": 0.37403151392936707, "eval_logps/chosen": -3146.64453125, "eval_logps/rejected": -2673.731689453125, "eval_loss": 0.6696676015853882, "eval_rewards/accuracies": 0.6039999723434448, "eval_rewards/chosen": 0.9828857779502869, "eval_rewards/margins": 0.21401001513004303, "eval_rewards/rejected": 0.7688757181167603, "eval_runtime": 431.3843, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 280 }, { "epoch": 0.08, "learning_rate": 3.7859007832898174e-06, "logits/chosen": 0.20792098343372345, "logits/rejected": 0.29871290922164917, "logps/chosen": -3625.885986328125, "logps/rejected": -3123.01123046875, "loss": 0.6437, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9595824480056763, "rewards/margins": 0.23611263930797577, "rewards/rejected": 0.7234698534011841, "step": 290 }, { "epoch": 0.08, "learning_rate": 3.9164490861618806e-06, "logits/chosen": 0.2793983817100525, "logits/rejected": 0.37193626165390015, "logps/chosen": -3584.172607421875, "logps/rejected": -2952.966064453125, "loss": 0.6532, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1887600421905518, "rewards/margins": 0.3551940321922302, "rewards/rejected": 0.8335660099983215, "step": 300 }, { "epoch": 0.08, "eval_logits/chosen": 0.3240261673927307, "eval_logits/rejected": 0.3932121694087982, "eval_logps/chosen": -3135.05517578125, "eval_logps/rejected": -2668.265380859375, "eval_loss": 0.6842317581176758, "eval_rewards/accuracies": 0.6159999966621399, "eval_rewards/chosen": 1.098778486251831, "eval_rewards/margins": 0.2752384543418884, "eval_rewards/rejected": 0.8235400319099426, "eval_runtime": 434.3508, "eval_samples_per_second": 4.605, "eval_steps_per_second": 1.151, "step": 300 }, { "epoch": 0.08, "learning_rate": 4.046997389033943e-06, "logits/chosen": 0.38430434465408325, "logits/rejected": 0.3882272243499756, "logps/chosen": -3151.79345703125, "logps/rejected": -2963.14013671875, "loss": 0.6307, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.9992223978042603, "rewards/margins": 0.25411438941955566, "rewards/rejected": 0.7451080083847046, "step": 310 }, { "epoch": 0.08, "learning_rate": 4.177545691906005e-06, "logits/chosen": 0.36302921175956726, "logits/rejected": 0.3614574074745178, "logps/chosen": -3660.998046875, "logps/rejected": -3047.01318359375, "loss": 0.6508, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9046725034713745, "rewards/margins": 0.43586355447769165, "rewards/rejected": 0.46880897879600525, "step": 320 }, { "epoch": 0.08, "eval_logits/chosen": 0.32564598321914673, "eval_logits/rejected": 0.410008043050766, "eval_logps/chosen": -3195.15966796875, "eval_logps/rejected": -2718.756103515625, "eval_loss": 0.6766108274459839, "eval_rewards/accuracies": 0.6110000014305115, "eval_rewards/chosen": 0.49773287773132324, "eval_rewards/margins": 0.17909826338291168, "eval_rewards/rejected": 0.31863459944725037, "eval_runtime": 433.0283, "eval_samples_per_second": 4.619, "eval_steps_per_second": 1.155, "step": 320 }, { "epoch": 0.09, "learning_rate": 4.308093994778068e-06, "logits/chosen": 0.28110283613204956, "logits/rejected": 0.4024382531642914, "logps/chosen": -3275.461181640625, "logps/rejected": -2713.1103515625, "loss": 0.7264, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.33772045373916626, "rewards/margins": 0.12221743166446686, "rewards/rejected": 0.21550297737121582, "step": 330 }, { "epoch": 0.09, "learning_rate": 4.4386422976501306e-06, "logits/chosen": 0.3178556561470032, "logits/rejected": 0.3595356345176697, "logps/chosen": -2875.369873046875, "logps/rejected": -2567.747802734375, "loss": 0.6363, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4873991906642914, "rewards/margins": 0.2501722276210785, "rewards/rejected": 0.2372269183397293, "step": 340 }, { "epoch": 0.09, "eval_logits/chosen": 0.27454981207847595, "eval_logits/rejected": 0.3598467707633972, "eval_logps/chosen": -3178.899169921875, "eval_logps/rejected": -2700.798095703125, "eval_loss": 0.6837542653083801, "eval_rewards/accuracies": 0.5950000286102295, "eval_rewards/chosen": 0.6603414416313171, "eval_rewards/margins": 0.16213050484657288, "eval_rewards/rejected": 0.49821093678474426, "eval_runtime": 431.9254, "eval_samples_per_second": 4.63, "eval_steps_per_second": 1.158, "step": 340 }, { "epoch": 0.09, "learning_rate": 4.569190600522193e-06, "logits/chosen": 0.21587708592414856, "logits/rejected": 0.31181126832962036, "logps/chosen": -2707.907958984375, "logps/rejected": -2417.74267578125, "loss": 0.6793, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7874638438224792, "rewards/margins": 0.18165403604507446, "rewards/rejected": 0.6058098077774048, "step": 350 }, { "epoch": 0.09, "learning_rate": 4.699738903394257e-06, "logits/chosen": 0.18711814284324646, "logits/rejected": 0.27558764815330505, "logps/chosen": -3153.45068359375, "logps/rejected": -2709.81982421875, "loss": 0.7016, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3306450843811035, "rewards/margins": 0.30943673849105835, "rewards/rejected": 1.0212082862854004, "step": 360 }, { "epoch": 0.09, "eval_logits/chosen": 0.22326692938804626, "eval_logits/rejected": 0.3153199851512909, "eval_logps/chosen": -3134.056884765625, "eval_logps/rejected": -2665.273193359375, "eval_loss": 0.6748794317245483, "eval_rewards/accuracies": 0.6150000095367432, "eval_rewards/chosen": 1.1087632179260254, "eval_rewards/margins": 0.2553001046180725, "eval_rewards/rejected": 0.8534631133079529, "eval_runtime": 435.0603, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 360 }, { "epoch": 0.1, "learning_rate": 4.8302872062663196e-06, "logits/chosen": 0.1838264763355255, "logits/rejected": 0.22554358839988708, "logps/chosen": -3298.046142578125, "logps/rejected": -2745.108154296875, "loss": 0.6573, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2793898582458496, "rewards/margins": 0.3967159688472748, "rewards/rejected": 0.8826737403869629, "step": 370 }, { "epoch": 0.1, "learning_rate": 4.9608355091383814e-06, "logits/chosen": 0.31761807203292847, "logits/rejected": 0.37144431471824646, "logps/chosen": -2525.34423828125, "logps/rejected": -2111.93701171875, "loss": 0.6508, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.6076975464820862, "rewards/margins": 0.23200814425945282, "rewards/rejected": 0.3756893575191498, "step": 380 }, { "epoch": 0.1, "eval_logits/chosen": 0.27832531929016113, "eval_logits/rejected": 0.363128125667572, "eval_logps/chosen": -3161.513427734375, "eval_logps/rejected": -2690.803955078125, "eval_loss": 0.6655476093292236, "eval_rewards/accuracies": 0.6169999837875366, "eval_rewards/chosen": 0.8341966271400452, "eval_rewards/margins": 0.2360440045595169, "eval_rewards/rejected": 0.5981525778770447, "eval_runtime": 430.7287, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 380 }, { "epoch": 0.1, "learning_rate": 4.9999488562447675e-06, "logits/chosen": 0.2528289556503296, "logits/rejected": 0.35706380009651184, "logps/chosen": -3318.848388671875, "logps/rejected": -2748.4873046875, "loss": 0.6925, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9206016659736633, "rewards/margins": 0.2647760808467865, "rewards/rejected": 0.6558254957199097, "step": 390 }, { "epoch": 0.1, "learning_rate": 4.999698361256577e-06, "logits/chosen": 0.21857766807079315, "logits/rejected": 0.2511882781982422, "logps/chosen": -3432.91162109375, "logps/rejected": -3010.167724609375, "loss": 0.7066, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.8730987310409546, "rewards/margins": 0.13953432440757751, "rewards/rejected": 0.7335644960403442, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": 0.30806076526641846, "eval_logits/rejected": 0.3912850022315979, "eval_logps/chosen": -3199.071044921875, "eval_logps/rejected": -2720.57763671875, "eval_loss": 0.6643412709236145, "eval_rewards/accuracies": 0.609000027179718, "eval_rewards/chosen": 0.45861876010894775, "eval_rewards/margins": 0.15820595622062683, "eval_rewards/rejected": 0.3004128336906433, "eval_runtime": 435.0758, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 400 }, { "epoch": 0.11, "learning_rate": 4.999239142174581e-06, "logits/chosen": 0.2581877112388611, "logits/rejected": 0.30333179235458374, "logps/chosen": -3176.662353515625, "logps/rejected": -2681.770751953125, "loss": 0.6811, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.29091495275497437, "rewards/margins": 0.09103190153837204, "rewards/rejected": 0.19988304376602173, "step": 410 }, { "epoch": 0.11, "learning_rate": 4.99857123734344e-06, "logits/chosen": 0.15652363002300262, "logits/rejected": 0.19975371658802032, "logps/chosen": -3313.223388671875, "logps/rejected": -2639.69140625, "loss": 0.6569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.215486764907837, "rewards/margins": 0.3324063718318939, "rewards/rejected": 0.8830803036689758, "step": 420 }, { "epoch": 0.11, "eval_logits/chosen": 0.09962375462055206, "eval_logits/rejected": 0.19952738285064697, "eval_logps/chosen": -3070.3232421875, "eval_logps/rejected": -2609.653564453125, "eval_loss": 0.6894924640655518, "eval_rewards/accuracies": 0.597000002861023, "eval_rewards/chosen": 1.7460988759994507, "eval_rewards/margins": 0.33644339442253113, "eval_rewards/rejected": 1.4096555709838867, "eval_runtime": 431.3663, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 420 }, { "epoch": 0.11, "learning_rate": 4.997694702533016e-06, "logits/chosen": 0.16685494780540466, "logits/rejected": 0.28268295526504517, "logps/chosen": -3612.405029296875, "logps/rejected": -3180.85986328125, "loss": 0.6458, "rewards/accuracies": 0.5625, "rewards/chosen": 1.9067745208740234, "rewards/margins": 0.47571372985839844, "rewards/rejected": 1.4310609102249146, "step": 430 }, { "epoch": 0.12, "learning_rate": 4.996609610933713e-06, "logits/chosen": 0.30861029028892517, "logits/rejected": 0.42410510778427124, "logps/chosen": -3421.16845703125, "logps/rejected": -3205.45654296875, "loss": 0.6971, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.5985921621322632, "rewards/margins": 0.06879931688308716, "rewards/rejected": 0.529792845249176, "step": 440 }, { "epoch": 0.12, "eval_logits/chosen": 0.3794691264629364, "eval_logits/rejected": 0.46541422605514526, "eval_logps/chosen": -3213.872314453125, "eval_logps/rejected": -2735.197021484375, "eval_loss": 0.6804001331329346, "eval_rewards/accuracies": 0.597000002861023, "eval_rewards/chosen": 0.31060782074928284, "eval_rewards/margins": 0.1563836634159088, "eval_rewards/rejected": 0.15422417223453522, "eval_runtime": 433.6266, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 440 }, { "epoch": 0.12, "learning_rate": 4.995316053150366e-06, "logits/chosen": 0.38915303349494934, "logits/rejected": 0.4440118670463562, "logps/chosen": -3600.672607421875, "logps/rejected": -3162.372314453125, "loss": 0.6995, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.5723387002944946, "rewards/margins": 0.13602381944656372, "rewards/rejected": 0.4363149106502533, "step": 450 }, { "epoch": 0.12, "learning_rate": 4.9938141371946815e-06, "logits/chosen": 0.3789653778076172, "logits/rejected": 0.4040400981903076, "logps/chosen": -3390.50146484375, "logps/rejected": -2924.3310546875, "loss": 0.7179, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.7771019339561462, "rewards/margins": 0.21817466616630554, "rewards/rejected": 0.5589272975921631, "step": 460 }, { "epoch": 0.12, "eval_logits/chosen": 0.37551364302635193, "eval_logits/rejected": 0.45338889956474304, "eval_logps/chosen": -3205.85107421875, "eval_logps/rejected": -2729.13623046875, "eval_loss": 0.6707738041877747, "eval_rewards/accuracies": 0.6060000061988831, "eval_rewards/chosen": 0.39082252979278564, "eval_rewards/margins": 0.17599175870418549, "eval_rewards/rejected": 0.21483078598976135, "eval_runtime": 433.1249, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.154, "step": 460 }, { "epoch": 0.12, "learning_rate": 4.992103988476206e-06, "logits/chosen": 0.34160929918289185, "logits/rejected": 0.35172420740127563, "logps/chosen": -3291.15673828125, "logps/rejected": -2719.785888671875, "loss": 0.663, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.4612928330898285, "rewards/margins": 0.12932494282722473, "rewards/rejected": 0.33196789026260376, "step": 470 }, { "epoch": 0.13, "learning_rate": 4.990185749791866e-06, "logits/chosen": 0.2565078139305115, "logits/rejected": 0.3810094892978668, "logps/chosen": -3320.289794921875, "logps/rejected": -3023.98876953125, "loss": 0.6713, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.1720809936523438, "rewards/margins": 0.198554128408432, "rewards/rejected": 0.9735267758369446, "step": 480 }, { "epoch": 0.13, "eval_logits/chosen": 0.20718203485012054, "eval_logits/rejected": 0.2920670807361603, "eval_logps/chosen": -3128.837890625, "eval_logps/rejected": -2660.428955078125, "eval_loss": 0.6652926206588745, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": 1.1609543561935425, "eval_rewards/margins": 0.2590506076812744, "eval_rewards/rejected": 0.9019039273262024, "eval_runtime": 431.6312, "eval_samples_per_second": 4.634, "eval_steps_per_second": 1.158, "step": 480 }, { "epoch": 0.13, "learning_rate": 4.9880595813140395e-06, "logits/chosen": 0.26255783438682556, "logits/rejected": 0.33088934421539307, "logps/chosen": -3076.095458984375, "logps/rejected": -2723.376708984375, "loss": 0.683, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1001404523849487, "rewards/margins": 0.16311264038085938, "rewards/rejected": 0.9370278120040894, "step": 490 }, { "epoch": 0.13, "learning_rate": 4.985725660577184e-06, "logits/chosen": 0.312019407749176, "logits/rejected": 0.28921788930892944, "logps/chosen": -3122.08984375, "logps/rejected": -3067.213623046875, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": 0.8809388279914856, "rewards/margins": 0.07867269217967987, "rewards/rejected": 0.8022662401199341, "step": 500 }, { "epoch": 0.13, "eval_logits/chosen": 0.27516239881515503, "eval_logits/rejected": 0.3597623407840729, "eval_logps/chosen": -3162.54443359375, "eval_logps/rejected": -2688.715576171875, "eval_loss": 0.6618161797523499, "eval_rewards/accuracies": 0.6230000257492065, "eval_rewards/chosen": 0.823884904384613, "eval_rewards/margins": 0.20484650135040283, "eval_rewards/rejected": 0.6190384030342102, "eval_runtime": 434.9405, "eval_samples_per_second": 4.598, "eval_steps_per_second": 1.15, "step": 500 }, { "epoch": 0.13, "learning_rate": 4.983184182463009e-06, "logits/chosen": 0.28617388010025024, "logits/rejected": 0.33343711495399475, "logps/chosen": -3116.06396484375, "logps/rejected": -2760.382080078125, "loss": 0.641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6751779317855835, "rewards/margins": 0.2610264718532562, "rewards/rejected": 0.4141514301300049, "step": 510 }, { "epoch": 0.14, "learning_rate": 4.980435359184203e-06, "logits/chosen": 0.16974309086799622, "logits/rejected": 0.18319687247276306, "logps/chosen": -3604.00244140625, "logps/rejected": -3465.17919921875, "loss": 0.6805, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.3318134546279907, "rewards/margins": 0.19661560654640198, "rewards/rejected": 1.1351979970932007, "step": 520 }, { "epoch": 0.14, "eval_logits/chosen": 0.21993647515773773, "eval_logits/rejected": 0.3035815358161926, "eval_logps/chosen": -3128.94287109375, "eval_logps/rejected": -2659.617431640625, "eval_loss": 0.6631866693496704, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": 1.1599045991897583, "eval_rewards/margins": 0.2498857080936432, "eval_rewards/rejected": 0.9100189208984375, "eval_runtime": 430.7467, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 520 }, { "epoch": 0.14, "learning_rate": 4.9774794202667236e-06, "logits/chosen": 0.33694523572921753, "logits/rejected": 0.33044418692588806, "logps/chosen": -2800.47802734375, "logps/rejected": -2482.7021484375, "loss": 0.6692, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.9492979049682617, "rewards/margins": 0.19272860884666443, "rewards/rejected": 0.7565691471099854, "step": 530 }, { "epoch": 0.14, "learning_rate": 4.974316612530615e-06, "logits/chosen": 0.21310079097747803, "logits/rejected": 0.2969479560852051, "logps/chosen": -3240.87841796875, "logps/rejected": -2858.770263671875, "loss": 0.6669, "rewards/accuracies": 0.625, "rewards/chosen": 0.5399230718612671, "rewards/margins": 0.14048579335212708, "rewards/rejected": 0.3994373679161072, "step": 540 }, { "epoch": 0.14, "eval_logits/chosen": 0.3261869549751282, "eval_logits/rejected": 0.3959881663322449, "eval_logps/chosen": -3202.12353515625, "eval_logps/rejected": -2723.495361328125, "eval_loss": 0.676192045211792, "eval_rewards/accuracies": 0.6010000109672546, "eval_rewards/chosen": 0.42809563875198364, "eval_rewards/margins": 0.15685473382472992, "eval_rewards/rejected": 0.2712409198284149, "eval_runtime": 435.0288, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 540 }, { "epoch": 0.14, "learning_rate": 4.970947200069416e-06, "logits/chosen": 0.3193834125995636, "logits/rejected": 0.3038922846317291, "logps/chosen": -2899.6494140625, "logps/rejected": -2558.14111328125, "loss": 0.6359, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.7902882099151611, "rewards/margins": 0.25909554958343506, "rewards/rejected": 0.5311926603317261, "step": 550 }, { "epoch": 0.15, "learning_rate": 4.967371464228096e-06, "logits/chosen": 0.20563605427742004, "logits/rejected": 0.23766914010047913, "logps/chosen": -2873.71044921875, "logps/rejected": -2691.78173828125, "loss": 0.7231, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 1.5348114967346191, "rewards/margins": 0.08968158066272736, "rewards/rejected": 1.4451299905776978, "step": 560 }, { "epoch": 0.15, "eval_logits/chosen": 0.12118327617645264, "eval_logits/rejected": 0.20613756775856018, "eval_logps/chosen": -3081.10888671875, "eval_logps/rejected": -2620.840087890625, "eval_loss": 0.6818861365318298, "eval_rewards/accuracies": 0.6100000143051147, "eval_rewards/chosen": 1.6382437944412231, "eval_rewards/margins": 0.3404523432254791, "eval_rewards/rejected": 1.2977914810180664, "eval_runtime": 431.3913, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 560 }, { "epoch": 0.15, "learning_rate": 4.963589703579569e-06, "logits/chosen": 0.19758854806423187, "logits/rejected": 0.2117747813463211, "logps/chosen": -3392.86962890625, "logps/rejected": -2948.943603515625, "loss": 0.6466, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.487652063369751, "rewards/margins": 0.3400648236274719, "rewards/rejected": 1.1475870609283447, "step": 570 }, { "epoch": 0.15, "learning_rate": 4.9596022338997615e-06, "logits/chosen": 0.33713430166244507, "logits/rejected": 0.3380606472492218, "logps/chosen": -2983.394287109375, "logps/rejected": -2583.453369140625, "loss": 0.6914, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.9762035608291626, "rewards/margins": 0.1640392541885376, "rewards/rejected": 0.812164306640625, "step": 580 }, { "epoch": 0.15, "eval_logits/chosen": 0.2803965210914612, "eval_logits/rejected": 0.36015668511390686, "eval_logps/chosen": -3171.76513671875, "eval_logps/rejected": -2699.477294921875, "eval_loss": 0.6667001843452454, "eval_rewards/accuracies": 0.6119999885559082, "eval_rewards/chosen": 0.7316786646842957, "eval_rewards/margins": 0.22025705873966217, "eval_rewards/rejected": 0.5114216208457947, "eval_runtime": 433.9233, "eval_samples_per_second": 4.609, "eval_steps_per_second": 1.152, "step": 580 }, { "epoch": 0.15, "learning_rate": 4.955409388141243e-06, "logits/chosen": 0.2904849946498871, "logits/rejected": 0.2874545454978943, "logps/chosen": -3328.39501953125, "logps/rejected": -3027.83349609375, "loss": 0.6921, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.6120063066482544, "rewards/margins": 0.13958485424518585, "rewards/rejected": 0.47242140769958496, "step": 590 }, { "epoch": 0.16, "learning_rate": 4.951011516405429e-06, "logits/chosen": 0.25706946849823, "logits/rejected": 0.26872220635414124, "logps/chosen": -2869.29052734375, "logps/rejected": -2644.3203125, "loss": 0.6744, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.9242690801620483, "rewards/margins": 0.14932402968406677, "rewards/rejected": 0.774945080280304, "step": 600 }, { "epoch": 0.16, "eval_logits/chosen": 0.2001418173313141, "eval_logits/rejected": 0.2892521917819977, "eval_logps/chosen": -3113.716552734375, "eval_logps/rejected": -2648.575439453125, "eval_loss": 0.6655290722846985, "eval_rewards/accuracies": 0.6140000224113464, "eval_rewards/chosen": 1.3121711015701294, "eval_rewards/margins": 0.29173266887664795, "eval_rewards/rejected": 1.0204384326934814, "eval_runtime": 433.3468, "eval_samples_per_second": 4.615, "eval_steps_per_second": 1.154, "step": 600 }, { "epoch": 0.16, "learning_rate": 4.946408985913344e-06, "logits/chosen": 0.12893471121788025, "logits/rejected": 0.14496776461601257, "logps/chosen": -3610.104736328125, "logps/rejected": -2941.44384765625, "loss": 0.6127, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5392391681671143, "rewards/margins": 0.4305228590965271, "rewards/rejected": 1.108716368675232, "step": 610 }, { "epoch": 0.16, "learning_rate": 4.941602180974958e-06, "logits/chosen": 0.1892281323671341, "logits/rejected": 0.2857258915901184, "logps/chosen": -3099.654052734375, "logps/rejected": -2563.149658203125, "loss": 0.7202, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.5326813459396362, "rewards/margins": 0.2934575080871582, "rewards/rejected": 1.239223837852478, "step": 620 }, { "epoch": 0.16, "eval_logits/chosen": 0.21555908024311066, "eval_logits/rejected": 0.3039129674434662, "eval_logps/chosen": -3107.617919921875, "eval_logps/rejected": -2643.658447265625, "eval_loss": 0.6703886985778809, "eval_rewards/accuracies": 0.6190000176429749, "eval_rewards/chosen": 1.3731536865234375, "eval_rewards/margins": 0.3035494387149811, "eval_rewards/rejected": 1.0696042776107788, "eval_runtime": 431.3933, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 620 }, { "epoch": 0.16, "learning_rate": 4.936591502957101e-06, "logits/chosen": 0.26240888237953186, "logits/rejected": 0.22847862541675568, "logps/chosen": -2799.74072265625, "logps/rejected": -2490.33935546875, "loss": 0.7035, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9977954626083374, "rewards/margins": 0.15218262374401093, "rewards/rejected": 0.8456128835678101, "step": 630 }, { "epoch": 0.17, "learning_rate": 4.931377370249946e-06, "logits/chosen": 0.2967272400856018, "logits/rejected": 0.30584752559661865, "logps/chosen": -2839.68115234375, "logps/rejected": -2731.757080078125, "loss": 0.6505, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.79596346616745, "rewards/margins": 0.09385130554437637, "rewards/rejected": 0.702112078666687, "step": 640 }, { "epoch": 0.17, "eval_logits/chosen": 0.20528021454811096, "eval_logits/rejected": 0.29464566707611084, "eval_logps/chosen": -3136.512451171875, "eval_logps/rejected": -2666.355712890625, "eval_loss": 0.6630944609642029, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": 1.0842074155807495, "eval_rewards/margins": 0.24157202243804932, "eval_rewards/rejected": 0.842635452747345, "eval_runtime": 435.4988, "eval_samples_per_second": 4.592, "eval_steps_per_second": 1.148, "step": 640 }, { "epoch": 0.17, "learning_rate": 4.925960218232073e-06, "logits/chosen": 0.2488943636417389, "logits/rejected": 0.2558088004589081, "logps/chosen": -2828.72900390625, "logps/rejected": -2741.37109375, "loss": 0.6761, "rewards/accuracies": 0.625, "rewards/chosen": 0.8986973762512207, "rewards/margins": 0.2228974997997284, "rewards/rejected": 0.6757997870445251, "step": 650 }, { "epoch": 0.17, "learning_rate": 4.920340499234116e-06, "logits/chosen": 0.30085256695747375, "logits/rejected": 0.3155014216899872, "logps/chosen": -2791.52392578125, "logps/rejected": -2781.51708984375, "loss": 0.6678, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.44950413703918457, "rewards/margins": 0.1757611185312271, "rewards/rejected": 0.27374306321144104, "step": 660 }, { "epoch": 0.17, "eval_logits/chosen": 0.21014481782913208, "eval_logits/rejected": 0.30193254351615906, "eval_logps/chosen": -3173.929443359375, "eval_logps/rejected": -2697.19091796875, "eval_loss": 0.6687774658203125, "eval_rewards/accuracies": 0.6169999837875366, "eval_rewards/chosen": 0.7100350260734558, "eval_rewards/margins": 0.17575141787528992, "eval_rewards/rejected": 0.5342835187911987, "eval_runtime": 431.2093, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.16, "step": 660 }, { "epoch": 0.18, "learning_rate": 4.914518682500995e-06, "logits/chosen": 0.16767539083957672, "logits/rejected": 0.30946797132492065, "logps/chosen": -3541.389404296875, "logps/rejected": -2814.685546875, "loss": 0.6718, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.1290301084518433, "rewards/margins": 0.1281205117702484, "rewards/rejected": 1.000909686088562, "step": 670 }, { "epoch": 0.18, "learning_rate": 4.9084952541527315e-06, "logits/chosen": 0.09088976681232452, "logits/rejected": 0.19814464449882507, "logps/chosen": -3544.16455078125, "logps/rejected": -2842.513671875, "loss": 0.6905, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3866424560546875, "rewards/margins": 0.24147526919841766, "rewards/rejected": 1.1451671123504639, "step": 680 }, { "epoch": 0.18, "eval_logits/chosen": 0.17350909113883972, "eval_logits/rejected": 0.2673546373844147, "eval_logps/chosen": -3132.293701171875, "eval_logps/rejected": -2662.89794921875, "eval_loss": 0.6600972414016724, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": 1.1263943910598755, "eval_rewards/margins": 0.24918213486671448, "eval_rewards/rejected": 0.8772122263908386, "eval_runtime": 434.669, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 680 }, { "epoch": 0.18, "learning_rate": 4.902270717143858e-06, "logits/chosen": 0.3163197934627533, "logits/rejected": 0.31124523282051086, "logps/chosen": -2872.3564453125, "logps/rejected": -2590.56005859375, "loss": 0.6616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9659813046455383, "rewards/margins": 0.14309151470661163, "rewards/rejected": 0.8228896856307983, "step": 690 }, { "epoch": 0.18, "learning_rate": 4.895845591221427e-06, "logits/chosen": 0.233978271484375, "logits/rejected": 0.35231074690818787, "logps/chosen": -3650.08447265625, "logps/rejected": -2916.44970703125, "loss": 0.6414, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.914772629737854, "rewards/margins": 0.27519088983535767, "rewards/rejected": 0.6395816802978516, "step": 700 }, { "epoch": 0.18, "eval_logits/chosen": 0.25088560581207275, "eval_logits/rejected": 0.3400818407535553, "eval_logps/chosen": -3167.74267578125, "eval_logps/rejected": -2694.656494140625, "eval_loss": 0.668441891670227, "eval_rewards/accuracies": 0.628000020980835, "eval_rewards/chosen": 0.7719046473503113, "eval_rewards/margins": 0.21227821707725525, "eval_rewards/rejected": 0.5596264004707336, "eval_runtime": 431.993, "eval_samples_per_second": 4.63, "eval_steps_per_second": 1.157, "step": 700 }, { "epoch": 0.19, "learning_rate": 4.8892204128816e-06, "logits/chosen": 0.33243808150291443, "logits/rejected": 0.2517126798629761, "logps/chosen": -2662.500732421875, "logps/rejected": -2700.250732421875, "loss": 0.7106, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.617062509059906, "rewards/margins": 0.02785428799688816, "rewards/rejected": 0.58920818567276, "step": 710 }, { "epoch": 0.19, "learning_rate": 4.882395735324864e-06, "logits/chosen": 0.20111370086669922, "logits/rejected": 0.28446730971336365, "logps/chosen": -2576.430908203125, "logps/rejected": -2290.6083984375, "loss": 0.6752, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.0430291891098022, "rewards/margins": 0.23782393336296082, "rewards/rejected": 0.8052053451538086, "step": 720 }, { "epoch": 0.19, "eval_logits/chosen": 0.0784836933016777, "eval_logits/rejected": 0.17869125306606293, "eval_logps/chosen": -3057.898681640625, "eval_logps/rejected": -2602.08544921875, "eval_loss": 0.6932120323181152, "eval_rewards/accuracies": 0.6140000224113464, "eval_rewards/chosen": 1.8703466653823853, "eval_rewards/margins": 0.3850066363811493, "eval_rewards/rejected": 1.4853399991989136, "eval_runtime": 432.7836, "eval_samples_per_second": 4.621, "eval_steps_per_second": 1.155, "step": 720 }, { "epoch": 0.19, "learning_rate": 4.87537212840983e-06, "logits/chosen": 0.10039131343364716, "logits/rejected": 0.17558032274246216, "logps/chosen": -2662.52001953125, "logps/rejected": -2358.291259765625, "loss": 0.6791, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.687911033630371, "rewards/margins": 0.2881425619125366, "rewards/rejected": 1.3997684717178345, "step": 730 }, { "epoch": 0.19, "learning_rate": 4.8681501786056545e-06, "logits/chosen": 0.21189598739147186, "logits/rejected": 0.3067930340766907, "logps/chosen": -3321.143798828125, "logps/rejected": -2917.879638671875, "loss": 0.6982, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.0170387029647827, "rewards/margins": 0.1190987080335617, "rewards/rejected": 0.8979401588439941, "step": 740 }, { "epoch": 0.19, "eval_logits/chosen": 0.2655080556869507, "eval_logits/rejected": 0.35751500725746155, "eval_logps/chosen": -3198.267578125, "eval_logps/rejected": -2721.14990234375, "eval_loss": 0.6774183511734009, "eval_rewards/accuracies": 0.6159999966621399, "eval_rewards/chosen": 0.4666571617126465, "eval_rewards/margins": 0.17196469008922577, "eval_rewards/rejected": 0.2946925163269043, "eval_runtime": 434.1835, "eval_samples_per_second": 4.606, "eval_steps_per_second": 1.152, "step": 740 }, { "epoch": 0.2, "learning_rate": 4.860730488943068e-06, "logits/chosen": 0.2675749957561493, "logits/rejected": 0.36448100209236145, "logps/chosen": -3268.106689453125, "logps/rejected": -2704.97216796875, "loss": 0.6287, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.6579660773277283, "rewards/margins": 0.30677348375320435, "rewards/rejected": 0.3511926233768463, "step": 750 }, { "epoch": 0.2, "learning_rate": 4.853113678964022e-06, "logits/chosen": 0.21715252101421356, "logits/rejected": 0.25422555208206177, "logps/chosen": -2893.327392578125, "logps/rejected": -2555.432861328125, "loss": 0.6149, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1357202529907227, "rewards/margins": 0.2503480315208435, "rewards/rejected": 0.8853722810745239, "step": 760 }, { "epoch": 0.2, "eval_logits/chosen": 0.10560925304889679, "eval_logits/rejected": 0.20155802369117737, "eval_logps/chosen": -3092.660400390625, "eval_logps/rejected": -2632.17431640625, "eval_loss": 0.6715385913848877, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": 1.522727131843567, "eval_rewards/margins": 0.33827662467956543, "eval_rewards/rejected": 1.184450387954712, "eval_runtime": 430.8936, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 760 }, { "epoch": 0.2, "learning_rate": 4.845300384669958e-06, "logits/chosen": 0.24156585335731506, "logits/rejected": 0.21839673817157745, "logps/chosen": -2626.412353515625, "logps/rejected": -2703.326171875, "loss": 0.7032, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.2415733337402344, "rewards/margins": 0.12165530025959015, "rewards/rejected": 1.1199181079864502, "step": 770 }, { "epoch": 0.2, "learning_rate": 4.837291258468701e-06, "logits/chosen": 0.2679353654384613, "logits/rejected": 0.3077923357486725, "logps/chosen": -2966.36669921875, "logps/rejected": -2496.802734375, "loss": 0.6568, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3959789574146271, "rewards/margins": 0.2111394852399826, "rewards/rejected": 0.18483945727348328, "step": 780 }, { "epoch": 0.2, "eval_logits/chosen": 0.3384398818016052, "eval_logits/rejected": 0.42943423986434937, "eval_logps/chosen": -3226.049072265625, "eval_logps/rejected": -2751.242919921875, "eval_loss": 0.6975247859954834, "eval_rewards/accuracies": 0.6019999980926514, "eval_rewards/chosen": 0.18884114921092987, "eval_rewards/margins": 0.1950789839029312, "eval_rewards/rejected": -0.006237812340259552, "eval_runtime": 435.2674, "eval_samples_per_second": 4.595, "eval_steps_per_second": 1.149, "step": 780 }, { "epoch": 0.21, "learning_rate": 4.829086969119984e-06, "logits/chosen": 0.3434927761554718, "logits/rejected": 0.414896160364151, "logps/chosen": -3324.159423828125, "logps/rejected": -3029.572998046875, "loss": 0.662, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.6215066909790039, "rewards/margins": 0.18717338144779205, "rewards/rejected": 0.43433332443237305, "step": 790 }, { "epoch": 0.21, "learning_rate": 4.820688201679605e-06, "logits/chosen": 0.15142004191875458, "logits/rejected": 0.2572762370109558, "logps/chosen": -3121.619140625, "logps/rejected": -2530.41064453125, "loss": 0.633, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.6333844661712646, "rewards/margins": 0.5365501642227173, "rewards/rejected": 1.096834421157837, "step": 800 }, { "epoch": 0.21, "eval_logits/chosen": 0.04355182498693466, "eval_logits/rejected": 0.15217523276805878, "eval_logps/chosen": -3037.4560546875, "eval_logps/rejected": -2588.680419921875, "eval_loss": 0.6989004611968994, "eval_rewards/accuracies": 0.6129999756813049, "eval_rewards/chosen": 2.07477068901062, "eval_rewards/margins": 0.4553816318511963, "eval_rewards/rejected": 1.6193891763687134, "eval_runtime": 431.1831, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.16, "step": 800 }, { "epoch": 0.21, "learning_rate": 4.8120956574422315e-06, "logits/chosen": 0.06384239345788956, "logits/rejected": 0.15334098041057587, "logps/chosen": -2776.998046875, "logps/rejected": -2482.7626953125, "loss": 0.7912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.6437005996704102, "rewards/margins": 0.4083956182003021, "rewards/rejected": 1.2353050708770752, "step": 810 }, { "epoch": 0.21, "learning_rate": 4.803310053882831e-06, "logits/chosen": 0.22994089126586914, "logits/rejected": 0.32230791449546814, "logps/chosen": -3148.53271484375, "logps/rejected": -2464.07470703125, "loss": 0.6907, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.031940221786499, "rewards/margins": 0.3243027329444885, "rewards/rejected": 0.7076374292373657, "step": 820 }, { "epoch": 0.21, "eval_logits/chosen": 0.20369070768356323, "eval_logits/rejected": 0.30359917879104614, "eval_logps/chosen": -3135.479248046875, "eval_logps/rejected": -2669.955322265625, "eval_loss": 0.6632094979286194, "eval_rewards/accuracies": 0.6349999904632568, "eval_rewards/chosen": 1.0945394039154053, "eval_rewards/margins": 0.2878988981246948, "eval_rewards/rejected": 0.806640625, "eval_runtime": 433.9401, "eval_samples_per_second": 4.609, "eval_steps_per_second": 1.152, "step": 820 }, { "epoch": 0.22, "learning_rate": 4.794332124596775e-06, "logits/chosen": 0.279374361038208, "logits/rejected": 0.3658539652824402, "logps/chosen": -3161.129638671875, "logps/rejected": -2649.42236328125, "loss": 0.7311, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9364837408065796, "rewards/margins": 0.24851250648498535, "rewards/rejected": 0.6879712343215942, "step": 830 }, { "epoch": 0.22, "learning_rate": 4.785162619238575e-06, "logits/chosen": 0.2936154007911682, "logits/rejected": 0.308160662651062, "logps/chosen": -3774.05126953125, "logps/rejected": -3430.507080078125, "loss": 0.6582, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8875568509101868, "rewards/margins": 0.2365831881761551, "rewards/rejected": 0.6509736776351929, "step": 840 }, { "epoch": 0.22, "eval_logits/chosen": 0.21733371913433075, "eval_logits/rejected": 0.31188175082206726, "eval_logps/chosen": -3159.10205078125, "eval_logps/rejected": -2688.943603515625, "eval_loss": 0.6571035981178284, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": 0.8583118319511414, "eval_rewards/margins": 0.24155481159687042, "eval_rewards/rejected": 0.6167570352554321, "eval_runtime": 433.0715, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.155, "step": 840 }, { "epoch": 0.22, "learning_rate": 4.775802303459288e-06, "logits/chosen": 0.12405607849359512, "logits/rejected": 0.21875357627868652, "logps/chosen": -4006.57958984375, "logps/rejected": -3388.118408203125, "loss": 0.6344, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1925489902496338, "rewards/margins": 0.3031531274318695, "rewards/rejected": 0.8893957138061523, "step": 850 }, { "epoch": 0.23, "learning_rate": 4.766251958842589e-06, "logits/chosen": 0.29462534189224243, "logits/rejected": 0.26128581166267395, "logps/chosen": -3372.448486328125, "logps/rejected": -3381.94384765625, "loss": 0.6568, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.7879565954208374, "rewards/margins": 0.18704822659492493, "rewards/rejected": 0.6009083986282349, "step": 860 }, { "epoch": 0.23, "eval_logits/chosen": 0.2591753602027893, "eval_logits/rejected": 0.351222962141037, "eval_logps/chosen": -3199.35107421875, "eval_logps/rejected": -2722.352294921875, "eval_loss": 0.6718289852142334, "eval_rewards/accuracies": 0.609000027179718, "eval_rewards/chosen": 0.45582208037376404, "eval_rewards/margins": 0.17315183579921722, "eval_rewards/rejected": 0.282670259475708, "eval_runtime": 431.3986, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 860 }, { "epoch": 0.23, "learning_rate": 4.7565123828395066e-06, "logits/chosen": 0.23790264129638672, "logits/rejected": 0.2792920470237732, "logps/chosen": -3101.481689453125, "logps/rejected": -2638.35009765625, "loss": 0.662, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4960171580314636, "rewards/margins": 0.15477773547172546, "rewards/rejected": 0.34123939275741577, "step": 870 }, { "epoch": 0.23, "learning_rate": 4.746584388701831e-06, "logits/chosen": 0.17238274216651917, "logits/rejected": 0.27201011776924133, "logps/chosen": -2726.42724609375, "logps/rejected": -2329.327392578125, "loss": 0.6589, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0839899778366089, "rewards/margins": 0.30773013830184937, "rewards/rejected": 0.77625972032547, "step": 880 }, { "epoch": 0.23, "eval_logits/chosen": 0.11082763969898224, "eval_logits/rejected": 0.21104219555854797, "eval_logps/chosen": -3112.243408203125, "eval_logps/rejected": -2649.617919921875, "eval_loss": 0.6679428815841675, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 1.3268980979919434, "eval_rewards/margins": 0.3168840706348419, "eval_rewards/rejected": 1.0100139379501343, "eval_runtime": 435.2277, "eval_samples_per_second": 4.595, "eval_steps_per_second": 1.149, "step": 880 }, { "epoch": 0.23, "learning_rate": 4.736468805414218e-06, "logits/chosen": 0.13169807195663452, "logits/rejected": 0.15527954697608948, "logps/chosen": -2806.538818359375, "logps/rejected": -2544.407470703125, "loss": 0.6733, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9121415019035339, "rewards/margins": 0.24330878257751465, "rewards/rejected": 0.6688327789306641, "step": 890 }, { "epoch": 0.24, "learning_rate": 4.7261664776249595e-06, "logits/chosen": 0.15172605216503143, "logits/rejected": 0.24370086193084717, "logps/chosen": -3430.317138671875, "logps/rejected": -3009.8486328125, "loss": 0.6371, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.371455192565918, "rewards/margins": 0.3049808144569397, "rewards/rejected": 1.066474437713623, "step": 900 }, { "epoch": 0.24, "eval_logits/chosen": 0.13766679167747498, "eval_logits/rejected": 0.23073351383209229, "eval_logps/chosen": -3126.612060546875, "eval_logps/rejected": -2663.31201171875, "eval_loss": 0.6656347513198853, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": 1.183210849761963, "eval_rewards/margins": 0.31014153361320496, "eval_rewards/rejected": 0.8730692863464355, "eval_runtime": 430.873, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 900 }, { "epoch": 0.24, "learning_rate": 4.715678265575463e-06, "logits/chosen": 0.11664383113384247, "logits/rejected": 0.22525322437286377, "logps/chosen": -3194.677734375, "logps/rejected": -2560.40478515625, "loss": 0.622, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.33279550075531, "rewards/margins": 0.5007774829864502, "rewards/rejected": 0.8320180773735046, "step": 910 }, { "epoch": 0.24, "learning_rate": 4.705005045028415e-06, "logits/chosen": 0.126219242811203, "logits/rejected": 0.19556903839111328, "logps/chosen": -3781.857421875, "logps/rejected": -3192.947021484375, "loss": 0.7471, "rewards/accuracies": 0.5, "rewards/chosen": 1.1316674947738647, "rewards/margins": 0.12889644503593445, "rewards/rejected": 1.0027711391448975, "step": 920 }, { "epoch": 0.24, "eval_logits/chosen": 0.20772652328014374, "eval_logits/rejected": 0.29164254665374756, "eval_logps/chosen": -3161.26611328125, "eval_logps/rejected": -2692.1220703125, "eval_loss": 0.6692809462547302, "eval_rewards/accuracies": 0.6389999985694885, "eval_rewards/chosen": 0.8366723656654358, "eval_rewards/margins": 0.25169962644577026, "eval_rewards/rejected": 0.5849726796150208, "eval_runtime": 434.7785, "eval_samples_per_second": 4.6, "eval_steps_per_second": 1.15, "step": 920 }, { "epoch": 0.24, "learning_rate": 4.694147707194659e-06, "logits/chosen": 0.25285905599594116, "logits/rejected": 0.23385247588157654, "logps/chosen": -3193.215576171875, "logps/rejected": -2985.509521484375, "loss": 0.6622, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7524423599243164, "rewards/margins": 0.061927784234285355, "rewards/rejected": 0.6905146241188049, "step": 930 }, { "epoch": 0.25, "learning_rate": 4.683107158658782e-06, "logits/chosen": 0.1771703064441681, "logits/rejected": 0.3061208724975586, "logps/chosen": -3459.796875, "logps/rejected": -2903.66455078125, "loss": 0.6415, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.1246603727340698, "rewards/margins": 0.3370293378829956, "rewards/rejected": 0.787631094455719, "step": 940 }, { "epoch": 0.25, "eval_logits/chosen": 0.14409470558166504, "eval_logits/rejected": 0.2346661388874054, "eval_logps/chosen": -3137.30859375, "eval_logps/rejected": -2669.684326171875, "eval_loss": 0.6632093787193298, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": 1.0762486457824707, "eval_rewards/margins": 0.266898512840271, "eval_rewards/rejected": 0.8093501329421997, "eval_runtime": 432.0831, "eval_samples_per_second": 4.629, "eval_steps_per_second": 1.157, "step": 940 }, { "epoch": 0.25, "learning_rate": 4.671884321303407e-06, "logits/chosen": -0.031478021293878555, "logits/rejected": 0.11315950006246567, "logps/chosen": -3788.88818359375, "logps/rejected": -2727.018310546875, "loss": 0.6442, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.5508688688278198, "rewards/margins": 0.3115304112434387, "rewards/rejected": 1.2393386363983154, "step": 950 }, { "epoch": 0.25, "learning_rate": 4.660480132232224e-06, "logits/chosen": 0.04831144958734512, "logits/rejected": 0.02368408441543579, "logps/chosen": -2767.36376953125, "logps/rejected": -2663.06005859375, "loss": 0.7267, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.8313884735107422, "rewards/margins": 0.12214634567499161, "rewards/rejected": 1.7092422246932983, "step": 960 }, { "epoch": 0.25, "eval_logits/chosen": -0.025592295452952385, "eval_logits/rejected": 0.07426931709051132, "eval_logps/chosen": -3041.255859375, "eval_logps/rejected": -2584.757080078125, "eval_loss": 0.6971449851989746, "eval_rewards/accuracies": 0.5929999947547913, "eval_rewards/chosen": 2.036773920059204, "eval_rewards/margins": 0.3781488537788391, "eval_rewards/rejected": 1.6586247682571411, "eval_runtime": 433.0712, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.155, "step": 960 }, { "epoch": 0.25, "learning_rate": 4.6488955436917414e-06, "logits/chosen": 0.08942289650440216, "logits/rejected": 0.11886408179998398, "logps/chosen": -3139.666259765625, "logps/rejected": -3032.911376953125, "loss": 0.7523, "rewards/accuracies": 0.5625, "rewards/chosen": 1.6085468530654907, "rewards/margins": 0.2415986955165863, "rewards/rejected": 1.366947889328003, "step": 970 }, { "epoch": 0.26, "learning_rate": 4.6371315229917644e-06, "logits/chosen": 0.20894484221935272, "logits/rejected": 0.2383333146572113, "logps/chosen": -3313.9296875, "logps/rejected": -3033.85302734375, "loss": 0.6586, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5171154737472534, "rewards/margins": 0.23641736805438995, "rewards/rejected": 0.28069809079170227, "step": 980 }, { "epoch": 0.26, "eval_logits/chosen": 0.23801569640636444, "eval_logits/rejected": 0.32683467864990234, "eval_logps/chosen": -3207.21044921875, "eval_logps/rejected": -2726.409423828125, "eval_loss": 0.6856436729431152, "eval_rewards/accuracies": 0.609000027179718, "eval_rewards/chosen": 0.37722790241241455, "eval_rewards/margins": 0.13513070344924927, "eval_rewards/rejected": 0.24209719896316528, "eval_runtime": 434.2681, "eval_samples_per_second": 4.605, "eval_steps_per_second": 1.151, "step": 980 }, { "epoch": 0.26, "learning_rate": 4.625189052424638e-06, "logits/chosen": 0.27509307861328125, "logits/rejected": 0.26890403032302856, "logps/chosen": -2852.1162109375, "logps/rejected": -2854.47021484375, "loss": 0.6474, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4414089620113373, "rewards/margins": 0.10975675284862518, "rewards/rejected": 0.3316522538661957, "step": 990 }, { "epoch": 0.26, "learning_rate": 4.613069129183218e-06, "logits/chosen": 0.1874045729637146, "logits/rejected": 0.2115097939968109, "logps/chosen": -2742.56689453125, "logps/rejected": -2470.456787109375, "loss": 0.7058, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8734512329101562, "rewards/margins": 0.15519645810127258, "rewards/rejected": 0.718254804611206, "step": 1000 }, { "epoch": 0.26, "eval_logits/chosen": 0.13200362026691437, "eval_logits/rejected": 0.22639912366867065, "eval_logps/chosen": -3141.533447265625, "eval_logps/rejected": -2670.741943359375, "eval_loss": 0.6664600968360901, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": 1.03399658203125, "eval_rewards/margins": 0.23522353172302246, "eval_rewards/rejected": 0.798772931098938, "eval_runtime": 430.8982, "eval_samples_per_second": 4.641, "eval_steps_per_second": 1.16, "step": 1000 }, { "epoch": 0.26, "learning_rate": 4.600772765277607e-06, "logits/chosen": 0.10840918868780136, "logits/rejected": 0.22101978957653046, "logps/chosen": -3185.760498046875, "logps/rejected": -2951.656005859375, "loss": 0.6651, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8249515295028687, "rewards/margins": 0.16047556698322296, "rewards/rejected": 0.6644760370254517, "step": 1010 }, { "epoch": 0.27, "learning_rate": 4.588300987450652e-06, "logits/chosen": 0.3151921331882477, "logits/rejected": 0.32274752855300903, "logps/chosen": -2889.594482421875, "logps/rejected": -2711.18505859375, "loss": 0.6562, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6317747831344604, "rewards/margins": 0.2206035554409027, "rewards/rejected": 0.41117119789123535, "step": 1020 }, { "epoch": 0.27, "eval_logits/chosen": 0.21916908025741577, "eval_logits/rejected": 0.3141496479511261, "eval_logps/chosen": -3201.3095703125, "eval_logps/rejected": -2724.30908203125, "eval_loss": 0.6731351613998413, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": 0.4362356662750244, "eval_rewards/margins": 0.1731313318014145, "eval_rewards/rejected": 0.2631043791770935, "eval_runtime": 435.2198, "eval_samples_per_second": 4.595, "eval_steps_per_second": 1.149, "step": 1020 }, { "epoch": 0.27, "learning_rate": 4.5756548370922136e-06, "logits/chosen": 0.17107148468494415, "logits/rejected": 0.3339146077632904, "logps/chosen": -2937.3583984375, "logps/rejected": -2453.65771484375, "loss": 0.643, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.4791443347930908, "rewards/margins": 0.2664182484149933, "rewards/rejected": 0.21272608637809753, "step": 1030 }, { "epoch": 0.27, "learning_rate": 4.562835370152206e-06, "logits/chosen": 0.17145976424217224, "logits/rejected": 0.24029257893562317, "logps/chosen": -2986.204345703125, "logps/rejected": -2653.17041015625, "loss": 0.6695, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8505908846855164, "rewards/margins": 0.20952963829040527, "rewards/rejected": 0.6410611867904663, "step": 1040 }, { "epoch": 0.27, "eval_logits/chosen": 0.15215829014778137, "eval_logits/rejected": 0.2495543509721756, "eval_logps/chosen": -3154.933837890625, "eval_logps/rejected": -2685.94091796875, "eval_loss": 0.6666401624679565, "eval_rewards/accuracies": 0.6240000128746033, "eval_rewards/chosen": 0.8999937176704407, "eval_rewards/margins": 0.2532128095626831, "eval_rewards/rejected": 0.6467809081077576, "eval_runtime": 431.5073, "eval_samples_per_second": 4.635, "eval_steps_per_second": 1.159, "step": 1040 }, { "epoch": 0.27, "learning_rate": 4.54984365705243e-06, "logits/chosen": 0.16127470135688782, "logits/rejected": 0.1686708927154541, "logps/chosen": -2859.669677734375, "logps/rejected": -2438.57958984375, "loss": 0.6431, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8572206497192383, "rewards/margins": 0.2656770348548889, "rewards/rejected": 0.5915436148643494, "step": 1050 }, { "epoch": 0.28, "learning_rate": 4.536680782597191e-06, "logits/chosen": 0.2011353224515915, "logits/rejected": 0.22867946326732635, "logps/chosen": -2675.025146484375, "logps/rejected": -2079.73876953125, "loss": 0.6998, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8153513669967651, "rewards/margins": 0.28279685974121094, "rewards/rejected": 0.5325545072555542, "step": 1060 }, { "epoch": 0.28, "eval_logits/chosen": 0.128602534532547, "eval_logits/rejected": 0.2293267548084259, "eval_logps/chosen": -3148.851806640625, "eval_logps/rejected": -2680.230224609375, "eval_loss": 0.6631279587745667, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": 0.9608126282691956, "eval_rewards/margins": 0.2569228410720825, "eval_rewards/rejected": 0.703889787197113, "eval_runtime": 432.7419, "eval_samples_per_second": 4.622, "eval_steps_per_second": 1.155, "step": 1060 }, { "epoch": 0.28, "learning_rate": 4.523347845882718e-06, "logits/chosen": 0.10225598514080048, "logits/rejected": 0.22114522755146027, "logps/chosen": -3210.76953125, "logps/rejected": -2655.65966796875, "loss": 0.6593, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9413834810256958, "rewards/margins": 0.3570558428764343, "rewards/rejected": 0.5843275189399719, "step": 1070 }, { "epoch": 0.28, "learning_rate": 4.50984596020539e-06, "logits/chosen": 0.12361164391040802, "logits/rejected": 0.16664133965969086, "logps/chosen": -3124.489501953125, "logps/rejected": -2776.021484375, "loss": 0.6467, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9287630915641785, "rewards/margins": 0.29461583495140076, "rewards/rejected": 0.6341472864151001, "step": 1080 }, { "epoch": 0.28, "eval_logits/chosen": 0.154340460896492, "eval_logits/rejected": 0.2534354329109192, "eval_logps/chosen": -3152.224853515625, "eval_logps/rejected": -2682.678955078125, "eval_loss": 0.661058247089386, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": 0.9270836710929871, "eval_rewards/margins": 0.24768143892288208, "eval_rewards/rejected": 0.679402232170105, "eval_runtime": 433.3415, "eval_samples_per_second": 4.615, "eval_steps_per_second": 1.154, "step": 1080 }, { "epoch": 0.29, "learning_rate": 4.4961762529687745e-06, "logits/chosen": 0.28299981355667114, "logits/rejected": 0.2669948935508728, "logps/chosen": -3039.753173828125, "logps/rejected": -2915.790283203125, "loss": 0.6562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9180598258972168, "rewards/margins": 0.2806738018989563, "rewards/rejected": 0.6373860239982605, "step": 1090 }, { "epoch": 0.29, "learning_rate": 4.482339865589492e-06, "logits/chosen": 0.22564205527305603, "logits/rejected": 0.3158654272556305, "logps/chosen": -3112.56884765625, "logps/rejected": -2838.22216796875, "loss": 0.7014, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.43064600229263306, "rewards/margins": 0.17393958568572998, "rewards/rejected": 0.2567063570022583, "step": 1100 }, { "epoch": 0.29, "eval_logits/chosen": 0.3112128674983978, "eval_logits/rejected": 0.4019556939601898, "eval_logps/chosen": -3227.002197265625, "eval_logps/rejected": -2748.674560546875, "eval_loss": 0.6915711760520935, "eval_rewards/accuracies": 0.597000002861023, "eval_rewards/chosen": 0.1793079376220703, "eval_rewards/margins": 0.1598605066537857, "eval_rewards/rejected": 0.019447432830929756, "eval_runtime": 430.4162, "eval_samples_per_second": 4.647, "eval_steps_per_second": 1.162, "step": 1100 }, { "epoch": 0.29, "learning_rate": 4.468337953401909e-06, "logits/chosen": 0.15705159306526184, "logits/rejected": 0.19908219575881958, "logps/chosen": -3910.09619140625, "logps/rejected": -3509.895263671875, "loss": 0.6996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.48222607374191284, "rewards/margins": 0.18784353137016296, "rewards/rejected": 0.2943825125694275, "step": 1110 }, { "epoch": 0.29, "learning_rate": 4.45417168556166e-06, "logits/chosen": 0.26801520586013794, "logits/rejected": 0.2657167315483093, "logps/chosen": -3047.155517578125, "logps/rejected": -2690.32763671875, "loss": 0.6383, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.015018105506897, "rewards/margins": 0.33848801255226135, "rewards/rejected": 0.6765300631523132, "step": 1120 }, { "epoch": 0.29, "eval_logits/chosen": 0.1309698075056076, "eval_logits/rejected": 0.22458001971244812, "eval_logps/chosen": -3120.439697265625, "eval_logps/rejected": -2656.01025390625, "eval_loss": 0.6645776629447937, "eval_rewards/accuracies": 0.6190000176429749, "eval_rewards/chosen": 1.2449334859848022, "eval_rewards/margins": 0.2988436222076416, "eval_rewards/rejected": 0.9460898041725159, "eval_runtime": 434.7267, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 1120 }, { "epoch": 0.3, "learning_rate": 4.439842244948036e-06, "logits/chosen": 0.043300751596689224, "logits/rejected": 0.14321494102478027, "logps/chosen": -3138.837890625, "logps/rejected": -2762.51904296875, "loss": 0.6755, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.188788652420044, "rewards/margins": 0.21242380142211914, "rewards/rejected": 0.97636479139328, "step": 1130 }, { "epoch": 0.3, "learning_rate": 4.425350828065204e-06, "logits/chosen": 0.2044309675693512, "logits/rejected": 0.24734990298748016, "logps/chosen": -3106.46142578125, "logps/rejected": -2511.349853515625, "loss": 0.6594, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1759591102600098, "rewards/margins": 0.30159974098205566, "rewards/rejected": 0.8743594288825989, "step": 1140 }, { "epoch": 0.3, "eval_logits/chosen": 0.13717390596866608, "eval_logits/rejected": 0.22943007946014404, "eval_logps/chosen": -3123.19384765625, "eval_logps/rejected": -2657.951904296875, "eval_loss": 0.6694162487983704, "eval_rewards/accuracies": 0.625, "eval_rewards/chosen": 1.2173967361450195, "eval_rewards/margins": 0.2907230257987976, "eval_rewards/rejected": 0.9266735315322876, "eval_runtime": 430.5073, "eval_samples_per_second": 4.646, "eval_steps_per_second": 1.161, "step": 1140 }, { "epoch": 0.3, "learning_rate": 4.410698644942303e-06, "logits/chosen": 0.16843870282173157, "logits/rejected": 0.22185227274894714, "logps/chosen": -3160.94970703125, "logps/rejected": -2581.95654296875, "loss": 0.6228, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.375333547592163, "rewards/margins": 0.49348753690719604, "rewards/rejected": 0.8818461298942566, "step": 1150 }, { "epoch": 0.3, "learning_rate": 4.395886919032406e-06, "logits/chosen": 0.24170048534870148, "logits/rejected": 0.3024105429649353, "logps/chosen": -3298.40380859375, "logps/rejected": -2940.814697265625, "loss": 0.6662, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.3273108005523682, "rewards/margins": 0.2847282290458679, "rewards/rejected": 1.0425827503204346, "step": 1160 }, { "epoch": 0.3, "eval_logits/chosen": 0.266377329826355, "eval_logits/rejected": 0.3541879653930664, "eval_logps/chosen": -3166.857177734375, "eval_logps/rejected": -2698.607421875, "eval_loss": 0.6691722273826599, "eval_rewards/accuracies": 0.6340000033378601, "eval_rewards/chosen": 0.7807608246803284, "eval_rewards/margins": 0.26064351201057434, "eval_rewards/rejected": 0.5201172232627869, "eval_runtime": 433.9773, "eval_samples_per_second": 4.609, "eval_steps_per_second": 1.152, "step": 1160 }, { "epoch": 0.31, "learning_rate": 4.380916887110366e-06, "logits/chosen": 0.25344404578208923, "logits/rejected": 0.3945383131504059, "logps/chosen": -3154.962890625, "logps/rejected": -2672.71044921875, "loss": 0.6724, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8532293438911438, "rewards/margins": 0.1568954885005951, "rewards/rejected": 0.6963338255882263, "step": 1170 }, { "epoch": 0.31, "learning_rate": 4.365789799169539e-06, "logits/chosen": 0.2550129294395447, "logits/rejected": 0.3834827244281769, "logps/chosen": -3319.89453125, "logps/rejected": -2614.60986328125, "loss": 0.6439, "rewards/accuracies": 0.625, "rewards/chosen": 0.9404705762863159, "rewards/margins": 0.23398876190185547, "rewards/rejected": 0.7064818739891052, "step": 1180 }, { "epoch": 0.31, "eval_logits/chosen": 0.28002381324768066, "eval_logits/rejected": 0.3655260503292084, "eval_logps/chosen": -3153.010986328125, "eval_logps/rejected": -2688.39501953125, "eval_loss": 0.6643928289413452, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 0.9192204475402832, "eval_rewards/margins": 0.29697781801223755, "eval_rewards/rejected": 0.6222426891326904, "eval_runtime": 431.7775, "eval_samples_per_second": 4.632, "eval_steps_per_second": 1.158, "step": 1180 }, { "epoch": 0.31, "learning_rate": 4.350506918317416e-06, "logits/chosen": 0.17394372820854187, "logits/rejected": 0.2910707890987396, "logps/chosen": -3182.14599609375, "logps/rejected": -2645.91455078125, "loss": 0.6448, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.355242133140564, "rewards/margins": 0.3299023509025574, "rewards/rejected": 1.0253398418426514, "step": 1190 }, { "epoch": 0.31, "learning_rate": 4.335069520670149e-06, "logits/chosen": 0.18607555329799652, "logits/rejected": 0.26543331146240234, "logps/chosen": -2805.07421875, "logps/rejected": -2525.427001953125, "loss": 0.6218, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.29789137840271, "rewards/margins": 0.3404329717159271, "rewards/rejected": 0.95745849609375, "step": 1200 }, { "epoch": 0.31, "eval_logits/chosen": 0.21157526969909668, "eval_logits/rejected": 0.3050297796726227, "eval_logps/chosen": -3136.6796875, "eval_logps/rejected": -2674.114013671875, "eval_loss": 0.6585681438446045, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 1.082531452178955, "eval_rewards/margins": 0.31747734546661377, "eval_rewards/rejected": 0.7650541067123413, "eval_runtime": 431.8268, "eval_samples_per_second": 4.631, "eval_steps_per_second": 1.158, "step": 1200 }, { "epoch": 0.32, "learning_rate": 4.319478895246e-06, "logits/chosen": 0.3427901864051819, "logits/rejected": 0.355103075504303, "logps/chosen": -2543.169677734375, "logps/rejected": -2401.9912109375, "loss": 0.6925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.6026889085769653, "rewards/margins": 0.16290698945522308, "rewards/rejected": 0.43978196382522583, "step": 1210 }, { "epoch": 0.32, "learning_rate": 4.303736343857704e-06, "logits/chosen": 0.37430500984191895, "logits/rejected": 0.3719637393951416, "logps/chosen": -2915.022705078125, "logps/rejected": -2745.69384765625, "loss": 0.68, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.35964491963386536, "rewards/margins": 0.14721675217151642, "rewards/rejected": 0.21242818236351013, "step": 1220 }, { "epoch": 0.32, "eval_logits/chosen": 0.20581862330436707, "eval_logits/rejected": 0.30018508434295654, "eval_logps/chosen": -3145.6201171875, "eval_logps/rejected": -2680.749267578125, "eval_loss": 0.6571410298347473, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": 0.9931336641311646, "eval_rewards/margins": 0.2944355607032776, "eval_rewards/rejected": 0.6986980438232422, "eval_runtime": 434.0722, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 1220 }, { "epoch": 0.32, "learning_rate": 4.287843181003772e-06, "logits/chosen": 0.28214162588119507, "logits/rejected": 0.3284228444099426, "logps/chosen": -2877.477294921875, "logps/rejected": -2346.061767578125, "loss": 0.61, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.1273117065429688, "rewards/margins": 0.3056851327419281, "rewards/rejected": 0.8216265439987183, "step": 1230 }, { "epoch": 0.32, "learning_rate": 4.27180073375873e-06, "logits/chosen": 0.20327293872833252, "logits/rejected": 0.2174561470746994, "logps/chosen": -3113.965576171875, "logps/rejected": -2962.91357421875, "loss": 0.631, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.647871732711792, "rewards/margins": 0.39653071761131287, "rewards/rejected": 1.2513409852981567, "step": 1240 }, { "epoch": 0.32, "eval_logits/chosen": 0.12976451218128204, "eval_logits/rejected": 0.22259633243083954, "eval_logps/chosen": -3100.83984375, "eval_logps/rejected": -2641.633056640625, "eval_loss": 0.6605857610702515, "eval_rewards/accuracies": 0.6449999809265137, "eval_rewards/chosen": 1.4409338235855103, "eval_rewards/margins": 0.351070374250412, "eval_rewards/rejected": 1.0898635387420654, "eval_runtime": 430.0537, "eval_samples_per_second": 4.651, "eval_steps_per_second": 1.163, "step": 1240 }, { "epoch": 0.33, "learning_rate": 4.255610341662304e-06, "logits/chosen": 0.18046480417251587, "logits/rejected": 0.1754683405160904, "logps/chosen": -3002.83056640625, "logps/rejected": -2788.439697265625, "loss": 0.6006, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3609216213226318, "rewards/margins": 0.3701387047767639, "rewards/rejected": 0.9907829165458679, "step": 1250 }, { "epoch": 0.33, "learning_rate": 4.2392733566075764e-06, "logits/chosen": 0.03303495794534683, "logits/rejected": 0.0659671276807785, "logps/chosen": -3335.93603515625, "logps/rejected": -3016.701416015625, "loss": 0.6553, "rewards/accuracies": 0.6875, "rewards/chosen": 1.5000711679458618, "rewards/margins": 0.4075811505317688, "rewards/rejected": 1.0924898386001587, "step": 1260 }, { "epoch": 0.33, "eval_logits/chosen": 0.08770661801099777, "eval_logits/rejected": 0.18526098132133484, "eval_logps/chosen": -3105.521484375, "eval_logps/rejected": -2646.45556640625, "eval_loss": 0.6755309700965881, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": 1.394115924835205, "eval_rewards/margins": 0.35247930884361267, "eval_rewards/rejected": 1.04163658618927, "eval_runtime": 434.6252, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.15, "step": 1260 }, { "epoch": 0.33, "learning_rate": 4.2227911427280975e-06, "logits/chosen": 0.12640723586082458, "logits/rejected": 0.21180033683776855, "logps/chosen": -3275.716796875, "logps/rejected": -2503.45751953125, "loss": 0.6975, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3040649890899658, "rewards/margins": 0.4380066990852356, "rewards/rejected": 0.866058349609375, "step": 1270 }, { "epoch": 0.33, "learning_rate": 4.206165076283983e-06, "logits/chosen": -0.034152328968048096, "logits/rejected": 0.06710993498563766, "logps/chosen": -3643.556640625, "logps/rejected": -2971.960693359375, "loss": 0.656, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.739383339881897, "rewards/margins": 0.2324363887310028, "rewards/rejected": 1.5069469213485718, "step": 1280 }, { "epoch": 0.33, "eval_logits/chosen": 0.03427112475037575, "eval_logits/rejected": 0.1332724541425705, "eval_logps/chosen": -3082.835205078125, "eval_logps/rejected": -2625.012939453125, "eval_loss": 0.6741769909858704, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.6209807395935059, "eval_rewards/margins": 0.3649192750453949, "eval_rewards/rejected": 1.2560611963272095, "eval_runtime": 431.1687, "eval_samples_per_second": 4.639, "eval_steps_per_second": 1.16, "step": 1280 }, { "epoch": 0.34, "learning_rate": 4.189396545546995e-06, "logits/chosen": -0.0017816796898841858, "logits/rejected": 0.06524969637393951, "logps/chosen": -2891.66455078125, "logps/rejected": -2555.768798828125, "loss": 0.7392, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.5242462158203125, "rewards/margins": 0.28190702199935913, "rewards/rejected": 1.2423391342163086, "step": 1290 }, { "epoch": 0.34, "learning_rate": 4.172486950684627e-06, "logits/chosen": -0.01402245182543993, "logits/rejected": 0.09218231588602066, "logps/chosen": -3317.37890625, "logps/rejected": -2525.61572265625, "loss": 0.6968, "rewards/accuracies": 0.5625, "rewards/chosen": 1.613255262374878, "rewards/margins": 0.329478919506073, "rewards/rejected": 1.2837764024734497, "step": 1300 }, { "epoch": 0.34, "eval_logits/chosen": 0.04399670287966728, "eval_logits/rejected": 0.14183732867240906, "eval_logps/chosen": -3089.2763671875, "eval_logps/rejected": -2628.070556640625, "eval_loss": 0.6620357632637024, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": 1.5565693378448486, "eval_rewards/margins": 0.33108285069465637, "eval_rewards/rejected": 1.2254862785339355, "eval_runtime": 432.9341, "eval_samples_per_second": 4.62, "eval_steps_per_second": 1.155, "step": 1300 }, { "epoch": 0.34, "learning_rate": 4.155437703643182e-06, "logits/chosen": 0.14660152792930603, "logits/rejected": 0.1822119653224945, "logps/chosen": -2755.6181640625, "logps/rejected": -2556.6201171875, "loss": 0.6912, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.2852537631988525, "rewards/margins": 0.12867704033851624, "rewards/rejected": 1.1565768718719482, "step": 1310 }, { "epoch": 0.35, "learning_rate": 4.138250228029882e-06, "logits/chosen": 0.020845487713813782, "logits/rejected": 0.023300308734178543, "logps/chosen": -2807.335693359375, "logps/rejected": -2608.6796875, "loss": 0.6756, "rewards/accuracies": 0.5625, "rewards/chosen": 1.0622012615203857, "rewards/margins": 0.1612163484096527, "rewards/rejected": 0.9009848833084106, "step": 1320 }, { "epoch": 0.35, "eval_logits/chosen": 0.04559008404612541, "eval_logits/rejected": 0.1435910314321518, "eval_logps/chosen": -3098.37646484375, "eval_logps/rejected": -2632.772705078125, "eval_loss": 0.6619133353233337, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": 1.4655649662017822, "eval_rewards/margins": 0.28709885478019714, "eval_rewards/rejected": 1.1784662008285522, "eval_runtime": 432.8463, "eval_samples_per_second": 4.621, "eval_steps_per_second": 1.155, "step": 1320 }, { "epoch": 0.35, "learning_rate": 4.120925958993994e-06, "logits/chosen": 0.18598555028438568, "logits/rejected": 0.286693274974823, "logps/chosen": -2556.603515625, "logps/rejected": -2142.288330078125, "loss": 0.74, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.7354651093482971, "rewards/margins": 0.08770757913589478, "rewards/rejected": 0.6477575302124023, "step": 1330 }, { "epoch": 0.35, "learning_rate": 4.103466343106999e-06, "logits/chosen": 0.0688103586435318, "logits/rejected": 0.24212606251239777, "logps/chosen": -3729.354736328125, "logps/rejected": -2956.0078125, "loss": 0.651, "rewards/accuracies": 0.625, "rewards/chosen": 0.9855343103408813, "rewards/margins": 0.2683238387107849, "rewards/rejected": 0.7172105312347412, "step": 1340 }, { "epoch": 0.35, "eval_logits/chosen": 0.1607738733291626, "eval_logits/rejected": 0.2575262486934662, "eval_logps/chosen": -3145.572998046875, "eval_logps/rejected": -2675.200927734375, "eval_loss": 0.6585801839828491, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": 0.9936038851737976, "eval_rewards/margins": 0.23942090570926666, "eval_rewards/rejected": 0.7541829943656921, "eval_runtime": 430.8781, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 1340 }, { "epoch": 0.35, "learning_rate": 4.085872838241797e-06, "logits/chosen": 0.18015019595623016, "logits/rejected": 0.17807723581790924, "logps/chosen": -2812.416259765625, "logps/rejected": -2816.8994140625, "loss": 0.708, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1321533918380737, "rewards/margins": -0.009967814199626446, "rewards/rejected": 1.1421211957931519, "step": 1350 }, { "epoch": 0.36, "learning_rate": 4.06814691345098e-06, "logits/chosen": 0.1508791148662567, "logits/rejected": 0.25893545150756836, "logps/chosen": -2712.971435546875, "logps/rejected": -2640.009765625, "loss": 0.6863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.2149766683578491, "rewards/margins": 0.17109015583992004, "rewards/rejected": 1.043886423110962, "step": 1360 }, { "epoch": 0.36, "eval_logits/chosen": 0.16238804161548615, "eval_logits/rejected": 0.26247698068618774, "eval_logps/chosen": -3138.90283203125, "eval_logps/rejected": -2672.00634765625, "eval_loss": 0.6592795252799988, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 1.0603034496307373, "eval_rewards/margins": 0.2741730809211731, "eval_rewards/rejected": 0.786130428314209, "eval_runtime": 434.5794, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.151, "step": 1360 }, { "epoch": 0.36, "learning_rate": 4.050290048844171e-06, "logits/chosen": 0.10025720298290253, "logits/rejected": 0.2371581345796585, "logps/chosen": -3697.56591796875, "logps/rejected": -2927.544921875, "loss": 0.6663, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.0616953372955322, "rewards/margins": 0.3179744482040405, "rewards/rejected": 0.7437208294868469, "step": 1370 }, { "epoch": 0.36, "learning_rate": 4.032303735464422e-06, "logits/chosen": 0.22232666611671448, "logits/rejected": 0.30685633420944214, "logps/chosen": -3399.1796875, "logps/rejected": -2910.68310546875, "loss": 0.6671, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9664727449417114, "rewards/margins": 0.2944018244743347, "rewards/rejected": 0.6720708608627319, "step": 1380 }, { "epoch": 0.36, "eval_logits/chosen": 0.1792251020669937, "eval_logits/rejected": 0.27691176533699036, "eval_logps/chosen": -3152.441162109375, "eval_logps/rejected": -2683.8271484375, "eval_loss": 0.6584669351577759, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": 0.9249168634414673, "eval_rewards/margins": 0.25699588656425476, "eval_rewards/rejected": 0.6679208874702454, "eval_runtime": 430.2528, "eval_samples_per_second": 4.648, "eval_steps_per_second": 1.162, "step": 1380 }, { "epoch": 0.36, "learning_rate": 4.014189475163727e-06, "logits/chosen": 0.250161349773407, "logits/rejected": 0.3024250864982605, "logps/chosen": -3578.50927734375, "logps/rejected": -3076.99072265625, "loss": 0.704, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3416260480880737, "rewards/margins": 0.31344103813171387, "rewards/rejected": 1.0281848907470703, "step": 1390 }, { "epoch": 0.37, "learning_rate": 3.995948780477605e-06, "logits/chosen": 0.24695098400115967, "logits/rejected": 0.2912483811378479, "logps/chosen": -3567.078857421875, "logps/rejected": -3018.37255859375, "loss": 0.6495, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4374496936798096, "rewards/margins": 0.37245994806289673, "rewards/rejected": 1.0649895668029785, "step": 1400 }, { "epoch": 0.37, "eval_logits/chosen": 0.1592485010623932, "eval_logits/rejected": 0.25626036524772644, "eval_logps/chosen": -3144.181884765625, "eval_logps/rejected": -2675.75341796875, "eval_loss": 0.6559209227561951, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 1.007513165473938, "eval_rewards/margins": 0.2588542401790619, "eval_rewards/rejected": 0.7486589550971985, "eval_runtime": 434.1441, "eval_samples_per_second": 4.607, "eval_steps_per_second": 1.152, "step": 1400 }, { "epoch": 0.37, "learning_rate": 3.977583174498816e-06, "logits/chosen": 0.1604035347700119, "logits/rejected": 0.25348392128944397, "logps/chosen": -3107.49462890625, "logps/rejected": -3145.966064453125, "loss": 0.7329, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8261555433273315, "rewards/margins": 0.09552982449531555, "rewards/rejected": 0.7306256890296936, "step": 1410 }, { "epoch": 0.37, "learning_rate": 3.959094190750172e-06, "logits/chosen": 0.12305520474910736, "logits/rejected": 0.29530590772628784, "logps/chosen": -3538.533935546875, "logps/rejected": -3034.084228515625, "loss": 0.6505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7359656095504761, "rewards/margins": 0.3393523693084717, "rewards/rejected": 0.39661329984664917, "step": 1420 }, { "epoch": 0.37, "eval_logits/chosen": 0.24186643958091736, "eval_logits/rejected": 0.33206459879875183, "eval_logps/chosen": -3194.786865234375, "eval_logps/rejected": -2719.096923828125, "eval_loss": 0.6665523052215576, "eval_rewards/accuracies": 0.6309999823570251, "eval_rewards/chosen": 0.5014644265174866, "eval_rewards/margins": 0.18624107539653778, "eval_rewards/rejected": 0.31522342562675476, "eval_runtime": 431.8292, "eval_samples_per_second": 4.631, "eval_steps_per_second": 1.158, "step": 1420 }, { "epoch": 0.37, "learning_rate": 3.9404833730564975e-06, "logits/chosen": 0.3466762900352478, "logits/rejected": 0.4390391409397125, "logps/chosen": -2962.7978515625, "logps/rejected": -2690.782470703125, "loss": 0.6707, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3717375099658966, "rewards/margins": 0.1719440221786499, "rewards/rejected": 0.1997934877872467, "step": 1430 }, { "epoch": 0.38, "learning_rate": 3.921752275415712e-06, "logits/chosen": 0.10341383516788483, "logits/rejected": 0.16401778161525726, "logps/chosen": -2966.730712890625, "logps/rejected": -2709.18896484375, "loss": 0.6855, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.45706772804260254, "rewards/margins": 0.08931884169578552, "rewards/rejected": 0.367748886346817, "step": 1440 }, { "epoch": 0.38, "eval_logits/chosen": 0.18593600392341614, "eval_logits/rejected": 0.2769761383533478, "eval_logps/chosen": -3160.43310546875, "eval_logps/rejected": -2689.621337890625, "eval_loss": 0.6567209959030151, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 0.8449998497962952, "eval_rewards/margins": 0.2350170612335205, "eval_rewards/rejected": 0.6099827885627747, "eval_runtime": 432.2383, "eval_samples_per_second": 4.627, "eval_steps_per_second": 1.157, "step": 1440 }, { "epoch": 0.38, "learning_rate": 3.902902461869079e-06, "logits/chosen": 0.2583538889884949, "logits/rejected": 0.3390609920024872, "logps/chosen": -2448.330810546875, "logps/rejected": -2278.47509765625, "loss": 0.6685, "rewards/accuracies": 0.6875, "rewards/chosen": 0.6906525492668152, "rewards/margins": 0.18777084350585938, "rewards/rejected": 0.5028817057609558, "step": 1450 }, { "epoch": 0.38, "learning_rate": 3.883935506370605e-06, "logits/chosen": 0.17118586599826813, "logits/rejected": 0.2784334719181061, "logps/chosen": -3388.67578125, "logps/rejected": -3014.18115234375, "loss": 0.6501, "rewards/accuracies": 0.625, "rewards/chosen": 0.8943119049072266, "rewards/margins": 0.2588035464286804, "rewards/rejected": 0.6355084180831909, "step": 1460 }, { "epoch": 0.38, "eval_logits/chosen": 0.1981024593114853, "eval_logits/rejected": 0.2910307049751282, "eval_logps/chosen": -3169.166259765625, "eval_logps/rejected": -2697.960693359375, "eval_loss": 0.6599109172821045, "eval_rewards/accuracies": 0.6389999985694885, "eval_rewards/chosen": 0.7576707005500793, "eval_rewards/margins": 0.2310859113931656, "eval_rewards/rejected": 0.5265849232673645, "eval_runtime": 433.7989, "eval_samples_per_second": 4.61, "eval_steps_per_second": 1.153, "step": 1460 }, { "epoch": 0.38, "learning_rate": 3.864852992655617e-06, "logits/chosen": 0.232582688331604, "logits/rejected": 0.31352290511131287, "logps/chosen": -2944.626708984375, "logps/rejected": -2444.094970703125, "loss": 0.628, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8596689105033875, "rewards/margins": 0.21491022408008575, "rewards/rejected": 0.6447586417198181, "step": 1470 }, { "epoch": 0.39, "learning_rate": 3.845656514108516e-06, "logits/chosen": 0.1640838384628296, "logits/rejected": 0.16049417853355408, "logps/chosen": -3403.094482421875, "logps/rejected": -2909.10400390625, "loss": 0.649, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2753345966339111, "rewards/margins": 0.41491857171058655, "rewards/rejected": 0.860416054725647, "step": 1480 }, { "epoch": 0.39, "eval_logits/chosen": 0.10520908981561661, "eval_logits/rejected": 0.20650361478328705, "eval_logps/chosen": -3118.7607421875, "eval_logps/rejected": -2655.2158203125, "eval_loss": 0.659864068031311, "eval_rewards/accuracies": 0.6420000195503235, "eval_rewards/chosen": 1.2617236375808716, "eval_rewards/margins": 0.30768993496894836, "eval_rewards/rejected": 0.9540336728096008, "eval_runtime": 430.3716, "eval_samples_per_second": 4.647, "eval_steps_per_second": 1.162, "step": 1480 }, { "epoch": 0.39, "learning_rate": 3.826347673629738e-06, "logits/chosen": 0.18303225934505463, "logits/rejected": 0.2529780864715576, "logps/chosen": -2996.833984375, "logps/rejected": -2728.722900390625, "loss": 0.7081, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.0881035327911377, "rewards/margins": 0.1533777415752411, "rewards/rejected": 0.9347257614135742, "step": 1490 }, { "epoch": 0.39, "learning_rate": 3.8069280835019062e-06, "logits/chosen": 0.0685875415802002, "logits/rejected": 0.180327370762825, "logps/chosen": -3125.45068359375, "logps/rejected": -2569.78173828125, "loss": 0.6554, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1173698902130127, "rewards/margins": 0.26348310708999634, "rewards/rejected": 0.8538867831230164, "step": 1500 }, { "epoch": 0.39, "eval_logits/chosen": 0.1329973042011261, "eval_logits/rejected": 0.22804613411426544, "eval_logps/chosen": -3139.9814453125, "eval_logps/rejected": -2672.230224609375, "eval_loss": 0.6582511067390442, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": 1.04951810836792, "eval_rewards/margins": 0.26562774181365967, "eval_rewards/rejected": 0.7838903665542603, "eval_runtime": 434.6267, "eval_samples_per_second": 4.602, "eval_steps_per_second": 1.15, "step": 1500 }, { "epoch": 0.4, "learning_rate": 3.7873993652552077e-06, "logits/chosen": 0.17722779512405396, "logits/rejected": 0.20878300070762634, "logps/chosen": -2914.510498046875, "logps/rejected": -2793.29296875, "loss": 0.6709, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8258069753646851, "rewards/margins": 0.21584340929985046, "rewards/rejected": 0.6099635362625122, "step": 1510 }, { "epoch": 0.4, "learning_rate": 3.7677631495319953e-06, "logits/chosen": 0.12299227714538574, "logits/rejected": 0.18356509506702423, "logps/chosen": -3398.839111328125, "logps/rejected": -3123.25146484375, "loss": 0.6749, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.897351086139679, "rewards/margins": 0.14092959463596344, "rewards/rejected": 0.7564215064048767, "step": 1520 }, { "epoch": 0.4, "eval_logits/chosen": 0.17667429149150848, "eval_logits/rejected": 0.26705875992774963, "eval_logps/chosen": -3162.768310546875, "eval_logps/rejected": -2692.017822265625, "eval_loss": 0.6606147289276123, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": 0.8216500878334045, "eval_rewards/margins": 0.2356354296207428, "eval_rewards/rejected": 0.5860146880149841, "eval_runtime": 431.0903, "eval_samples_per_second": 4.639, "eval_steps_per_second": 1.16, "step": 1520 }, { "epoch": 0.4, "learning_rate": 3.748021075950633e-06, "logits/chosen": 0.2903847098350525, "logits/rejected": 0.34681954979896545, "logps/chosen": -2865.524658203125, "logps/rejected": -2454.694091796875, "loss": 0.6297, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.7485159635543823, "rewards/margins": 0.22835659980773926, "rewards/rejected": 0.5201594233512878, "step": 1530 }, { "epoch": 0.4, "learning_rate": 3.7281747929685824e-06, "logits/chosen": 0.19285848736763, "logits/rejected": 0.3307565450668335, "logps/chosen": -3178.95166015625, "logps/rejected": -2850.491943359375, "loss": 0.6857, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7182660102844238, "rewards/margins": 0.23981337249279022, "rewards/rejected": 0.4784526228904724, "step": 1540 }, { "epoch": 0.4, "eval_logits/chosen": 0.21319906413555145, "eval_logits/rejected": 0.3069779574871063, "eval_logps/chosen": -3166.340576171875, "eval_logps/rejected": -2698.195068359375, "eval_loss": 0.6595433354377747, "eval_rewards/accuracies": 0.6460000276565552, "eval_rewards/chosen": 0.7859293222427368, "eval_rewards/margins": 0.26168978214263916, "eval_rewards/rejected": 0.5242394804954529, "eval_runtime": 433.221, "eval_samples_per_second": 4.617, "eval_steps_per_second": 1.154, "step": 1540 }, { "epoch": 0.41, "learning_rate": 3.7082259577447604e-06, "logits/chosen": 0.34329789876937866, "logits/rejected": 0.38762885332107544, "logps/chosen": -3185.33251953125, "logps/rejected": -2627.58837890625, "loss": 0.652, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8655205965042114, "rewards/margins": 0.2935423254966736, "rewards/rejected": 0.5719782114028931, "step": 1550 }, { "epoch": 0.41, "learning_rate": 3.6881762360011688e-06, "logits/chosen": 0.1408468782901764, "logits/rejected": 0.23060747981071472, "logps/chosen": -3133.78955078125, "logps/rejected": -2793.703857421875, "loss": 0.6507, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.0547510385513306, "rewards/margins": 0.41841545701026917, "rewards/rejected": 0.636335551738739, "step": 1560 }, { "epoch": 0.41, "eval_logits/chosen": 0.16865360736846924, "eval_logits/rejected": 0.2674937844276428, "eval_logps/chosen": -3145.1982421875, "eval_logps/rejected": -2681.724609375, "eval_loss": 0.6542052626609802, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 0.9973480701446533, "eval_rewards/margins": 0.3084021508693695, "eval_rewards/rejected": 0.6889458298683167, "eval_runtime": 432.5327, "eval_samples_per_second": 4.624, "eval_steps_per_second": 1.156, "step": 1560 }, { "epoch": 0.41, "learning_rate": 3.668027301883802e-06, "logits/chosen": 0.21688225865364075, "logits/rejected": 0.22685685753822327, "logps/chosen": -2999.636474609375, "logps/rejected": -2805.21337890625, "loss": 0.6262, "rewards/accuracies": 0.625, "rewards/chosen": 0.8248790502548218, "rewards/margins": 0.2846268117427826, "rewards/rejected": 0.5402522683143616, "step": 1570 }, { "epoch": 0.41, "learning_rate": 3.64778083782286e-06, "logits/chosen": 0.1460150182247162, "logits/rejected": 0.24566078186035156, "logps/chosen": -3206.595947265625, "logps/rejected": -2824.58740234375, "loss": 0.6126, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0156611204147339, "rewards/margins": 0.2733878493309021, "rewards/rejected": 0.7422732710838318, "step": 1580 }, { "epoch": 0.41, "eval_logits/chosen": 0.11675554513931274, "eval_logits/rejected": 0.21622370183467865, "eval_logps/chosen": -3115.064453125, "eval_logps/rejected": -2657.041015625, "eval_loss": 0.6575484275817871, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 1.2986865043640137, "eval_rewards/margins": 0.3629027307033539, "eval_rewards/rejected": 0.9357838034629822, "eval_runtime": 431.2773, "eval_samples_per_second": 4.637, "eval_steps_per_second": 1.159, "step": 1580 }, { "epoch": 0.42, "learning_rate": 3.627438534392268e-06, "logits/chosen": 0.09023705124855042, "logits/rejected": 0.139350026845932, "logps/chosen": -3186.497802734375, "logps/rejected": -2773.193359375, "loss": 0.6649, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.3556030988693237, "rewards/margins": 0.18923628330230713, "rewards/rejected": 1.1663668155670166, "step": 1590 }, { "epoch": 0.42, "learning_rate": 3.607002090168506e-06, "logits/chosen": 0.010949126444756985, "logits/rejected": 0.11314767599105835, "logps/chosen": -3330.94482421875, "logps/rejected": -2963.7783203125, "loss": 0.6109, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.5903584957122803, "rewards/margins": 0.48318687081336975, "rewards/rejected": 1.1071715354919434, "step": 1600 }, { "epoch": 0.42, "eval_logits/chosen": 0.07349586486816406, "eval_logits/rejected": 0.17740881443023682, "eval_logps/chosen": -3097.249267578125, "eval_logps/rejected": -2641.500732421875, "eval_loss": 0.6629951596260071, "eval_rewards/accuracies": 0.6349999904632568, "eval_rewards/chosen": 1.4768401384353638, "eval_rewards/margins": 0.3856562674045563, "eval_rewards/rejected": 1.0911836624145508, "eval_runtime": 434.523, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 1600 }, { "epoch": 0.42, "learning_rate": 3.586473211588787e-06, "logits/chosen": 0.16379007697105408, "logits/rejected": 0.19587299227714539, "logps/chosen": -3025.614501953125, "logps/rejected": -2583.198486328125, "loss": 0.6499, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.1499139070510864, "rewards/margins": 0.31314200162887573, "rewards/rejected": 0.8367718458175659, "step": 1610 }, { "epoch": 0.42, "learning_rate": 3.5658536128085623e-06, "logits/chosen": 0.1383196860551834, "logits/rejected": 0.1314079463481903, "logps/chosen": -3115.815185546875, "logps/rejected": -2825.74951171875, "loss": 0.6221, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.2445911169052124, "rewards/margins": 0.46700796484947205, "rewards/rejected": 0.7775831818580627, "step": 1620 }, { "epoch": 0.42, "eval_logits/chosen": 0.09215152263641357, "eval_logits/rejected": 0.19689638912677765, "eval_logps/chosen": -3116.356201171875, "eval_logps/rejected": -2656.922607421875, "eval_loss": 0.6608781814575195, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.2857699394226074, "eval_rewards/margins": 0.34880125522613525, "eval_rewards/rejected": 0.9369685649871826, "eval_runtime": 430.3525, "eval_samples_per_second": 4.647, "eval_steps_per_second": 1.162, "step": 1620 }, { "epoch": 0.43, "learning_rate": 3.545145015558399e-06, "logits/chosen": 0.1150917261838913, "logits/rejected": 0.14601179957389832, "logps/chosen": -3228.016357421875, "logps/rejected": -2927.2353515625, "loss": 0.6829, "rewards/accuracies": 0.625, "rewards/chosen": 1.278878092765808, "rewards/margins": 0.1917877197265625, "rewards/rejected": 1.0870906114578247, "step": 1630 }, { "epoch": 0.43, "learning_rate": 3.5243491490002056e-06, "logits/chosen": 0.12332086265087128, "logits/rejected": 0.14177069067955017, "logps/chosen": -3019.46484375, "logps/rejected": -3053.4462890625, "loss": 0.6565, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.9789087176322937, "rewards/margins": 0.35014471411705017, "rewards/rejected": 0.6287640929222107, "step": 1640 }, { "epoch": 0.43, "eval_logits/chosen": 0.1893828809261322, "eval_logits/rejected": 0.2898103892803192, "eval_logps/chosen": -3173.423828125, "eval_logps/rejected": -2706.029296875, "eval_loss": 0.6651201844215393, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": 0.7150956988334656, "eval_rewards/margins": 0.2691948115825653, "eval_rewards/rejected": 0.4459008276462555, "eval_runtime": 434.1438, "eval_samples_per_second": 4.607, "eval_steps_per_second": 1.152, "step": 1640 }, { "epoch": 0.43, "learning_rate": 3.503467749582857e-06, "logits/chosen": 0.17114904522895813, "logits/rejected": 0.29283085465431213, "logps/chosen": -3181.461669921875, "logps/rejected": -2416.08349609375, "loss": 0.6378, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7707377672195435, "rewards/margins": 0.19535033404827118, "rewards/rejected": 0.5753874778747559, "step": 1650 }, { "epoch": 0.43, "learning_rate": 3.4825025608971947e-06, "logits/chosen": 0.17305757105350494, "logits/rejected": 0.21983642876148224, "logps/chosen": -2992.6962890625, "logps/rejected": -2542.326171875, "loss": 0.5982, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.6217515468597412, "rewards/margins": 0.5241011381149292, "rewards/rejected": 1.0976502895355225, "step": 1660 }, { "epoch": 0.43, "eval_logits/chosen": 0.08046545088291168, "eval_logits/rejected": 0.183335080742836, "eval_logps/chosen": -3098.037353515625, "eval_logps/rejected": -2641.568603515625, "eval_loss": 0.6570764183998108, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 1.4689598083496094, "eval_rewards/margins": 0.37845250964164734, "eval_rewards/rejected": 1.0905072689056396, "eval_runtime": 431.3985, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 1660 }, { "epoch": 0.44, "learning_rate": 3.4614553335304407e-06, "logits/chosen": 0.16976502537727356, "logits/rejected": 0.23575982451438904, "logps/chosen": -2611.44189453125, "logps/rejected": -2583.77197265625, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": 1.2433974742889404, "rewards/margins": 0.24245846271514893, "rewards/rejected": 1.0009390115737915, "step": 1670 }, { "epoch": 0.44, "learning_rate": 3.4403278249200222e-06, "logits/chosen": 0.08754734694957733, "logits/rejected": 0.20148825645446777, "logps/chosen": -3223.266845703125, "logps/rejected": -2784.00439453125, "loss": 0.6986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.2149263620376587, "rewards/margins": 0.2647256851196289, "rewards/rejected": 0.9502006769180298, "step": 1680 }, { "epoch": 0.44, "eval_logits/chosen": 0.15460717678070068, "eval_logits/rejected": 0.2532672882080078, "eval_logps/chosen": -3133.4736328125, "eval_logps/rejected": -2672.806396484375, "eval_loss": 0.6550268530845642, "eval_rewards/accuracies": 0.6480000019073486, "eval_rewards/chosen": 1.1145951747894287, "eval_rewards/margins": 0.33646807074546814, "eval_rewards/rejected": 0.778127133846283, "eval_runtime": 432.5269, "eval_samples_per_second": 4.624, "eval_steps_per_second": 1.156, "step": 1680 }, { "epoch": 0.44, "learning_rate": 3.4191217992068293e-06, "logits/chosen": 0.15261730551719666, "logits/rejected": 0.26762574911117554, "logps/chosen": -3334.293701171875, "logps/rejected": -2830.876708984375, "loss": 0.6341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.1861635446548462, "rewards/margins": 0.38032567501068115, "rewards/rejected": 0.8058378100395203, "step": 1690 }, { "epoch": 0.44, "learning_rate": 3.3978390270879056e-06, "logits/chosen": 0.0648256242275238, "logits/rejected": 0.18302997946739197, "logps/chosen": -3300.06884765625, "logps/rejected": -2768.169677734375, "loss": 0.6316, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.6177408695220947, "rewards/margins": 0.4874458312988281, "rewards/rejected": 1.1302950382232666, "step": 1700 }, { "epoch": 0.44, "eval_logits/chosen": 0.04745986685156822, "eval_logits/rejected": 0.1493695080280304, "eval_logps/chosen": -3081.18115234375, "eval_logps/rejected": -2625.31787109375, "eval_loss": 0.6606111526489258, "eval_rewards/accuracies": 0.6359999775886536, "eval_rewards/chosen": 1.63752281665802, "eval_rewards/margins": 0.3845072388648987, "eval_rewards/rejected": 1.2530157566070557, "eval_runtime": 433.4603, "eval_samples_per_second": 4.614, "eval_steps_per_second": 1.154, "step": 1700 }, { "epoch": 0.45, "learning_rate": 3.3764812856685995e-06, "logits/chosen": 0.06260480731725693, "logits/rejected": 0.13203497231006622, "logps/chosen": -2822.783203125, "logps/rejected": -2620.430908203125, "loss": 0.6986, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.446090817451477, "rewards/margins": 0.25067800283432007, "rewards/rejected": 1.1954126358032227, "step": 1710 }, { "epoch": 0.45, "learning_rate": 3.3550503583141726e-06, "logits/chosen": 0.12656763195991516, "logits/rejected": 0.1688462197780609, "logps/chosen": -3060.4189453125, "logps/rejected": -2634.305419921875, "loss": 0.6618, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1088200807571411, "rewards/margins": 0.2684691548347473, "rewards/rejected": 0.8403509259223938, "step": 1720 }, { "epoch": 0.45, "eval_logits/chosen": 0.13087493181228638, "eval_logits/rejected": 0.22966016829013824, "eval_logps/chosen": -3136.467529296875, "eval_logps/rejected": -2671.847900390625, "eval_loss": 0.6570676565170288, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 1.084654688835144, "eval_rewards/margins": 0.2969398498535156, "eval_rewards/rejected": 0.7877148389816284, "eval_runtime": 430.6235, "eval_samples_per_second": 4.644, "eval_steps_per_second": 1.161, "step": 1720 }, { "epoch": 0.45, "learning_rate": 3.3335480345008907e-06, "logits/chosen": 0.15931150317192078, "logits/rejected": 0.21486827731132507, "logps/chosen": -2855.713623046875, "logps/rejected": -2365.821533203125, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": 1.1800167560577393, "rewards/margins": 0.4255780577659607, "rewards/rejected": 0.7544385194778442, "step": 1730 }, { "epoch": 0.46, "learning_rate": 3.3119761096666055e-06, "logits/chosen": 0.022145554423332214, "logits/rejected": 0.1381361037492752, "logps/chosen": -3409.54833984375, "logps/rejected": -2628.103515625, "loss": 0.7146, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.2122939825057983, "rewards/margins": 0.24860568344593048, "rewards/rejected": 0.9636882543563843, "step": 1740 }, { "epoch": 0.46, "eval_logits/chosen": 0.09440691024065018, "eval_logits/rejected": 0.1949949860572815, "eval_logps/chosen": -3104.23876953125, "eval_logps/rejected": -2643.846435546875, "eval_loss": 0.6608638167381287, "eval_rewards/accuracies": 0.6420000195503235, "eval_rewards/chosen": 1.4069457054138184, "eval_rewards/margins": 0.33921706676483154, "eval_rewards/rejected": 1.0677286386489868, "eval_runtime": 434.7361, "eval_samples_per_second": 4.6, "eval_steps_per_second": 1.15, "step": 1740 }, { "epoch": 0.46, "learning_rate": 3.290336385060832e-06, "logits/chosen": 0.07489897310733795, "logits/rejected": 0.15724709630012512, "logps/chosen": -3379.081298828125, "logps/rejected": -2890.026123046875, "loss": 0.6803, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.6116670370101929, "rewards/margins": 0.2879468500614166, "rewards/rejected": 1.3237203359603882, "step": 1750 }, { "epoch": 0.46, "learning_rate": 3.268630667594348e-06, "logits/chosen": 0.20419076085090637, "logits/rejected": 0.24949023127555847, "logps/chosen": -2709.8046875, "logps/rejected": -2523.84619140625, "loss": 0.7156, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.2472082376480103, "rewards/margins": 0.19915586709976196, "rewards/rejected": 1.0480523109436035, "step": 1760 }, { "epoch": 0.46, "eval_logits/chosen": 0.15789051353931427, "eval_logits/rejected": 0.25551679730415344, "eval_logps/chosen": -3137.118408203125, "eval_logps/rejected": -2671.9775390625, "eval_loss": 0.6546176671981812, "eval_rewards/accuracies": 0.652999997138977, "eval_rewards/chosen": 1.078142523765564, "eval_rewards/margins": 0.29172515869140625, "eval_rewards/rejected": 0.7864173650741577, "eval_runtime": 430.3703, "eval_samples_per_second": 4.647, "eval_steps_per_second": 1.162, "step": 1760 }, { "epoch": 0.46, "learning_rate": 3.2468607696883147e-06, "logits/chosen": 0.19224001467227936, "logits/rejected": 0.18688587844371796, "logps/chosen": -3139.707275390625, "logps/rejected": -3253.13818359375, "loss": 0.6831, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.7220229506492615, "rewards/margins": 0.06010418012738228, "rewards/rejected": 0.6619186997413635, "step": 1770 }, { "epoch": 0.47, "learning_rate": 3.225028509122944e-06, "logits/chosen": 0.2888290584087372, "logits/rejected": 0.31027376651763916, "logps/chosen": -2878.21533203125, "logps/rejected": -3013.267578125, "loss": 0.6817, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5044113993644714, "rewards/margins": 0.11019613593816757, "rewards/rejected": 0.39421528577804565, "step": 1780 }, { "epoch": 0.47, "eval_logits/chosen": 0.22068993747234344, "eval_logits/rejected": 0.31624090671539307, "eval_logps/chosen": -3190.676513671875, "eval_logps/rejected": -2715.246337890625, "eval_loss": 0.6729059815406799, "eval_rewards/accuracies": 0.6190000176429749, "eval_rewards/chosen": 0.5425652265548706, "eval_rewards/margins": 0.188832625746727, "eval_rewards/rejected": 0.3537325859069824, "eval_runtime": 434.0946, "eval_samples_per_second": 4.607, "eval_steps_per_second": 1.152, "step": 1780 }, { "epoch": 0.47, "learning_rate": 3.2031357088857083e-06, "logits/chosen": 0.2256816327571869, "logits/rejected": 0.2832657992839813, "logps/chosen": -2951.27490234375, "logps/rejected": -2546.7626953125, "loss": 0.6653, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7938987016677856, "rewards/margins": 0.15549656748771667, "rewards/rejected": 0.6384021043777466, "step": 1790 }, { "epoch": 0.47, "learning_rate": 3.181184197019127e-06, "logits/chosen": 0.08977369964122772, "logits/rejected": 0.19705714285373688, "logps/chosen": -3129.514892578125, "logps/rejected": -2756.933837890625, "loss": 0.6277, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1325801610946655, "rewards/margins": 0.308296799659729, "rewards/rejected": 0.8242834210395813, "step": 1800 }, { "epoch": 0.47, "eval_logits/chosen": 0.061959899961948395, "eval_logits/rejected": 0.16662432253360748, "eval_logps/chosen": -3096.299560546875, "eval_logps/rejected": -2634.9365234375, "eval_loss": 0.6605405807495117, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": 1.486335039138794, "eval_rewards/margins": 0.3295076787471771, "eval_rewards/rejected": 1.1568275690078735, "eval_runtime": 431.6026, "eval_samples_per_second": 4.634, "eval_steps_per_second": 1.158, "step": 1800 }, { "epoch": 0.47, "learning_rate": 3.159175806468126e-06, "logits/chosen": -0.0012862607836723328, "logits/rejected": 0.1033133864402771, "logps/chosen": -3602.08251953125, "logps/rejected": -3050.93017578125, "loss": 0.6127, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.0919625759124756, "rewards/margins": 0.5325493216514587, "rewards/rejected": 1.559413194656372, "step": 1810 }, { "epoch": 0.48, "learning_rate": 3.1371123749269804e-06, "logits/chosen": 0.05335702747106552, "logits/rejected": 0.13902577757835388, "logps/chosen": -3311.86328125, "logps/rejected": -2839.94775390625, "loss": 0.6093, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.4549100399017334, "rewards/margins": 0.3658619225025177, "rewards/rejected": 1.089048147201538, "step": 1820 }, { "epoch": 0.48, "eval_logits/chosen": 0.10223958641290665, "eval_logits/rejected": 0.2064126580953598, "eval_logps/chosen": -3110.324462890625, "eval_logps/rejected": -2649.488525390625, "eval_loss": 0.655569314956665, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": 1.3460863828659058, "eval_rewards/margins": 0.3347780108451843, "eval_rewards/rejected": 1.0113083124160767, "eval_runtime": 432.0404, "eval_samples_per_second": 4.629, "eval_steps_per_second": 1.157, "step": 1820 }, { "epoch": 0.48, "learning_rate": 3.114995744685877e-06, "logits/chosen": 0.15783575177192688, "logits/rejected": 0.19609513878822327, "logps/chosen": -2503.150390625, "logps/rejected": -2212.72216796875, "loss": 0.6902, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.9345712661743164, "rewards/margins": 0.13790163397789001, "rewards/rejected": 0.796669602394104, "step": 1830 }, { "epoch": 0.48, "learning_rate": 3.0928277624770743e-06, "logits/chosen": 0.19189710915088654, "logits/rejected": 0.29529860615730286, "logps/chosen": -3351.09814453125, "logps/rejected": -2615.06396484375, "loss": 0.6416, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9789069294929504, "rewards/margins": 0.2561236619949341, "rewards/rejected": 0.7227832674980164, "step": 1840 }, { "epoch": 0.48, "eval_logits/chosen": 0.16018134355545044, "eval_logits/rejected": 0.26175692677497864, "eval_logps/chosen": -3142.752197265625, "eval_logps/rejected": -2677.513427734375, "eval_loss": 0.6525106430053711, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": 1.021809697151184, "eval_rewards/margins": 0.2907525897026062, "eval_rewards/rejected": 0.7310571670532227, "eval_runtime": 433.6059, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 1840 }, { "epoch": 0.48, "learning_rate": 3.070610279320708e-06, "logits/chosen": 0.12904778122901917, "logits/rejected": 0.1936662793159485, "logps/chosen": -3133.924560546875, "logps/rejected": -2668.89697265625, "loss": 0.6236, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2473167181015015, "rewards/margins": 0.33506640791893005, "rewards/rejected": 0.9122503995895386, "step": 1850 }, { "epoch": 0.49, "learning_rate": 3.0483451503702264e-06, "logits/chosen": 0.026821259409189224, "logits/rejected": 0.04171115905046463, "logps/chosen": -3426.274169921875, "logps/rejected": -3339.563232421875, "loss": 0.647, "rewards/accuracies": 0.625, "rewards/chosen": 1.8743276596069336, "rewards/margins": 0.4552678167819977, "rewards/rejected": 1.4190597534179688, "step": 1860 }, { "epoch": 0.49, "eval_logits/chosen": 0.10068066418170929, "eval_logits/rejected": 0.20392417907714844, "eval_logps/chosen": -3114.912353515625, "eval_logps/rejected": -2654.193603515625, "eval_loss": 0.6554283499717712, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 1.3002110719680786, "eval_rewards/margins": 0.3359551727771759, "eval_rewards/rejected": 0.9642559289932251, "eval_runtime": 430.4257, "eval_samples_per_second": 4.647, "eval_steps_per_second": 1.162, "step": 1860 }, { "epoch": 0.49, "learning_rate": 3.0260342347574916e-06, "logits/chosen": 0.24853697419166565, "logits/rejected": 0.24362778663635254, "logps/chosen": -3091.21923828125, "logps/rejected": -3040.197998046875, "loss": 0.6707, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.960342288017273, "rewards/margins": 0.25902003049850464, "rewards/rejected": 0.7013221979141235, "step": 1870 }, { "epoch": 0.49, "learning_rate": 3.0036793954375358e-06, "logits/chosen": 0.3089984059333801, "logits/rejected": 0.45125776529312134, "logps/chosen": -3289.33154296875, "logps/rejected": -2269.986328125, "loss": 0.6269, "rewards/accuracies": 0.625, "rewards/chosen": 0.5914864540100098, "rewards/margins": 0.32909318804740906, "rewards/rejected": 0.2623932361602783, "step": 1880 }, { "epoch": 0.49, "eval_logits/chosen": 0.16607628762722015, "eval_logits/rejected": 0.26885902881622314, "eval_logps/chosen": -3165.389892578125, "eval_logps/rejected": -2698.312744140625, "eval_loss": 0.6585498452186584, "eval_rewards/accuracies": 0.6349999904632568, "eval_rewards/chosen": 0.7954334020614624, "eval_rewards/margins": 0.27236810326576233, "eval_rewards/rejected": 0.5230653882026672, "eval_runtime": 434.8387, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 1880 }, { "epoch": 0.49, "learning_rate": 2.981282499033009e-06, "logits/chosen": 0.24196061491966248, "logits/rejected": 0.2414587438106537, "logps/chosen": -2860.64404296875, "logps/rejected": -2925.143798828125, "loss": 0.7161, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.6814879775047302, "rewards/margins": 0.03675490617752075, "rewards/rejected": 0.6447331309318542, "step": 1890 }, { "epoch": 0.5, "learning_rate": 2.9588454156783163e-06, "logits/chosen": 0.28820252418518066, "logits/rejected": 0.26390987634658813, "logps/chosen": -2964.46142578125, "logps/rejected": -3050.644775390625, "loss": 0.7114, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.7384127378463745, "rewards/margins": 0.028867293149232864, "rewards/rejected": 0.7095454335212708, "step": 1900 }, { "epoch": 0.5, "eval_logits/chosen": 0.1904171258211136, "eval_logits/rejected": 0.2911287844181061, "eval_logps/chosen": -3183.388671875, "eval_logps/rejected": -2712.958740234375, "eval_loss": 0.6588751077651978, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": 0.6154425740242004, "eval_rewards/margins": 0.2388375699520111, "eval_rewards/rejected": 0.37660494446754456, "eval_runtime": 431.0136, "eval_samples_per_second": 4.64, "eval_steps_per_second": 1.16, "step": 1900 }, { "epoch": 0.5, "learning_rate": 2.9363700188634597e-06, "logits/chosen": 0.20760202407836914, "logits/rejected": 0.239425927400589, "logps/chosen": -3186.408203125, "logps/rejected": -2889.694580078125, "loss": 0.6982, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.501329779624939, "rewards/margins": 0.09633903205394745, "rewards/rejected": 0.4049907624721527, "step": 1910 }, { "epoch": 0.5, "learning_rate": 2.9138581852776053e-06, "logits/chosen": 0.17429453134536743, "logits/rejected": 0.2935001850128174, "logps/chosen": -3419.54248046875, "logps/rejected": -2819.908203125, "loss": 0.6789, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.7137317657470703, "rewards/margins": 0.2887819707393646, "rewards/rejected": 0.4249497950077057, "step": 1920 }, { "epoch": 0.5, "eval_logits/chosen": 0.1702168881893158, "eval_logits/rejected": 0.2714051902294159, "eval_logps/chosen": -3174.8984375, "eval_logps/rejected": -2704.5810546875, "eval_loss": 0.6562850475311279, "eval_rewards/accuracies": 0.6349999904632568, "eval_rewards/chosen": 0.7003446817398071, "eval_rewards/margins": 0.23995986580848694, "eval_rewards/rejected": 0.46038469672203064, "eval_runtime": 433.6868, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 1920 }, { "epoch": 0.51, "learning_rate": 2.8913117946523805e-06, "logits/chosen": 0.27158123254776, "logits/rejected": 0.36910751461982727, "logps/chosen": -2765.224609375, "logps/rejected": -2285.499755859375, "loss": 0.6356, "rewards/accuracies": 0.5625, "rewards/chosen": 0.7056854963302612, "rewards/margins": 0.26931947469711304, "rewards/rejected": 0.4363659918308258, "step": 1930 }, { "epoch": 0.51, "learning_rate": 2.8687327296049126e-06, "logits/chosen": 0.07269696891307831, "logits/rejected": 0.09847090393304825, "logps/chosen": -2940.708740234375, "logps/rejected": -2835.687255859375, "loss": 0.6729, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.14520263671875, "rewards/margins": 0.3733542561531067, "rewards/rejected": 0.7718484401702881, "step": 1940 }, { "epoch": 0.51, "eval_logits/chosen": 0.07341960072517395, "eval_logits/rejected": 0.1794813871383667, "eval_logps/chosen": -3118.243408203125, "eval_logps/rejected": -2655.864990234375, "eval_loss": 0.6573635935783386, "eval_rewards/accuracies": 0.6420000195503235, "eval_rewards/chosen": 1.2668969631195068, "eval_rewards/margins": 0.31935393810272217, "eval_rewards/rejected": 0.9475430250167847, "eval_runtime": 432.5011, "eval_samples_per_second": 4.624, "eval_steps_per_second": 1.156, "step": 1940 }, { "epoch": 0.51, "learning_rate": 2.8461228754806376e-06, "logits/chosen": 0.0056190164759755135, "logits/rejected": 0.06592054665088654, "logps/chosen": -3281.85205078125, "logps/rejected": -2750.1416015625, "loss": 0.6339, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.4659066200256348, "rewards/margins": 0.2704998552799225, "rewards/rejected": 1.1954066753387451, "step": 1950 }, { "epoch": 0.51, "learning_rate": 2.823484120195865e-06, "logits/chosen": -0.04551716893911362, "logits/rejected": 0.12663479149341583, "logps/chosen": -3310.137939453125, "logps/rejected": -2611.508544921875, "loss": 0.6502, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.6863304376602173, "rewards/margins": 0.47188156843185425, "rewards/rejected": 1.2144488096237183, "step": 1960 }, { "epoch": 0.51, "eval_logits/chosen": 0.050778865814208984, "eval_logits/rejected": 0.1571788340806961, "eval_logps/chosen": -3103.32861328125, "eval_logps/rejected": -2642.912841796875, "eval_loss": 0.6606889367103577, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": 1.4160468578338623, "eval_rewards/margins": 0.33898091316223145, "eval_rewards/rejected": 1.0770660638809204, "eval_runtime": 431.4155, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 1960 }, { "epoch": 0.52, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -0.04949677735567093, "logits/rejected": 0.09687992930412292, "logps/chosen": -3834.75341796875, "logps/rejected": -3136.56103515625, "loss": 0.6522, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 1.7121565341949463, "rewards/margins": 0.5151535272598267, "rewards/rejected": 1.19700288772583, "step": 1970 }, { "epoch": 0.52, "learning_rate": 2.7781274697184353e-06, "logits/chosen": 0.2392493039369583, "logits/rejected": 0.21708495914936066, "logps/chosen": -2951.28955078125, "logps/rejected": -2551.0732421875, "loss": 0.6567, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8871490359306335, "rewards/margins": 0.226845383644104, "rewards/rejected": 0.6603037118911743, "step": 1980 }, { "epoch": 0.52, "eval_logits/chosen": 0.12331610172986984, "eval_logits/rejected": 0.22633220255374908, "eval_logps/chosen": -3145.6884765625, "eval_logps/rejected": -2678.528564453125, "eval_loss": 0.6547273993492126, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 0.9924439787864685, "eval_rewards/margins": 0.2715378999710083, "eval_rewards/rejected": 0.7209060788154602, "eval_runtime": 434.4521, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 1980 }, { "epoch": 0.52, "learning_rate": 2.7554133617930397e-06, "logits/chosen": 0.11571061611175537, "logits/rejected": 0.1605503112077713, "logps/chosen": -3174.21240234375, "logps/rejected": -2851.3349609375, "loss": 0.6379, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.1113097667694092, "rewards/margins": 0.31989890336990356, "rewards/rejected": 0.7914108633995056, "step": 1990 }, { "epoch": 0.52, "learning_rate": 2.7326779269254363e-06, "logits/chosen": 0.04783995822072029, "logits/rejected": 0.09715013951063156, "logps/chosen": -3597.832763671875, "logps/rejected": -3122.93359375, "loss": 0.66, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3304338455200195, "rewards/margins": 0.3057953715324402, "rewards/rejected": 1.0246384143829346, "step": 2000 }, { "epoch": 0.52, "eval_logits/chosen": 0.1300792545080185, "eval_logits/rejected": 0.23234666883945465, "eval_logps/chosen": -3150.986328125, "eval_logps/rejected": -2682.588134765625, "eval_loss": 0.6564373970031738, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 0.9394680261611938, "eval_rewards/margins": 0.25916051864624023, "eval_rewards/rejected": 0.6803075075149536, "eval_runtime": 430.4364, "eval_samples_per_second": 4.646, "eval_steps_per_second": 1.162, "step": 2000 }, { "epoch": 0.53, "learning_rate": 2.7099230635178954e-06, "logits/chosen": 0.25560516119003296, "logits/rejected": 0.19861075282096863, "logps/chosen": -3247.532470703125, "logps/rejected": -3260.64697265625, "loss": 0.6362, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.071368932723999, "rewards/margins": 0.374894917011261, "rewards/rejected": 0.6964740753173828, "step": 2010 }, { "epoch": 0.53, "learning_rate": 2.6871506715949608e-06, "logits/chosen": 0.1226113811135292, "logits/rejected": 0.15147504210472107, "logps/chosen": -3066.229736328125, "logps/rejected": -2288.5263671875, "loss": 0.6165, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7889875769615173, "rewards/margins": 0.3118061423301697, "rewards/rejected": 0.4771813750267029, "step": 2020 }, { "epoch": 0.53, "eval_logits/chosen": 0.10944931209087372, "eval_logits/rejected": 0.21173043549060822, "eval_logps/chosen": -3132.904541015625, "eval_logps/rejected": -2668.576904296875, "eval_loss": 0.6539210081100464, "eval_rewards/accuracies": 0.6420000195503235, "eval_rewards/chosen": 1.1202855110168457, "eval_rewards/margins": 0.2998623847961426, "eval_rewards/rejected": 0.8204231858253479, "eval_runtime": 434.6769, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 2020 }, { "epoch": 0.53, "learning_rate": 2.6643626526448063e-06, "logits/chosen": 0.17268984019756317, "logits/rejected": 0.15303292870521545, "logps/chosen": -2976.93115234375, "logps/rejected": -2504.17138671875, "loss": 0.6438, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1162221431732178, "rewards/margins": 0.23188626766204834, "rewards/rejected": 0.8843358755111694, "step": 2030 }, { "epoch": 0.53, "learning_rate": 2.6415609094604562e-06, "logits/chosen": 0.13868165016174316, "logits/rejected": 0.14400923252105713, "logps/chosen": -3186.37841796875, "logps/rejected": -3131.861328125, "loss": 0.7214, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.3973057270050049, "rewards/margins": 0.18191012740135193, "rewards/rejected": 1.2153955698013306, "step": 2040 }, { "epoch": 0.53, "eval_logits/chosen": 0.0900619626045227, "eval_logits/rejected": 0.19336962699890137, "eval_logps/chosen": -3111.621337890625, "eval_logps/rejected": -2651.482421875, "eval_loss": 0.655492901802063, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 1.3331220149993896, "eval_rewards/margins": 0.3417540192604065, "eval_rewards/rejected": 0.9913679361343384, "eval_runtime": 431.2121, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.16, "step": 2040 }, { "epoch": 0.54, "learning_rate": 2.618747345980904e-06, "logits/chosen": 0.0412694588303566, "logits/rejected": 0.1378905028104782, "logps/chosen": -2789.478515625, "logps/rejected": -2542.04541015625, "loss": 0.6626, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.223069190979004, "rewards/margins": 0.3255647122859955, "rewards/rejected": 0.897504448890686, "step": 2050 }, { "epoch": 0.54, "learning_rate": 2.595923867132136e-06, "logits/chosen": 0.18346819281578064, "logits/rejected": 0.2186085730791092, "logps/chosen": -3023.18896484375, "logps/rejected": -2618.40771484375, "loss": 0.6622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.0684258937835693, "rewards/margins": 0.18317675590515137, "rewards/rejected": 0.885249137878418, "step": 2060 }, { "epoch": 0.54, "eval_logits/chosen": 0.08649124950170517, "eval_logits/rejected": 0.18998154997825623, "eval_logps/chosen": -3120.61474609375, "eval_logps/rejected": -2657.939453125, "eval_loss": 0.6509246826171875, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": 1.2431837320327759, "eval_rewards/margins": 0.3163864016532898, "eval_rewards/rejected": 0.9267972707748413, "eval_runtime": 432.9706, "eval_samples_per_second": 4.619, "eval_steps_per_second": 1.155, "step": 2060 }, { "epoch": 0.54, "learning_rate": 2.5730923786680672e-06, "logits/chosen": 0.03658928722143173, "logits/rejected": 0.22147834300994873, "logps/chosen": -2770.685302734375, "logps/rejected": -2214.77685546875, "loss": 0.7004, "rewards/accuracies": 0.625, "rewards/chosen": 0.9670537710189819, "rewards/margins": 0.3021814227104187, "rewards/rejected": 0.6648725271224976, "step": 2070 }, { "epoch": 0.54, "learning_rate": 2.5502547870114137e-06, "logits/chosen": 0.15138494968414307, "logits/rejected": 0.23306353390216827, "logps/chosen": -2879.3095703125, "logps/rejected": -2561.2421875, "loss": 0.6141, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9628521203994751, "rewards/margins": 0.2760319113731384, "rewards/rejected": 0.6868202090263367, "step": 2080 }, { "epoch": 0.54, "eval_logits/chosen": 0.10413829982280731, "eval_logits/rejected": 0.20671820640563965, "eval_logps/chosen": -3134.596435546875, "eval_logps/rejected": -2669.467529296875, "eval_loss": 0.6503845453262329, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": 1.103366732597351, "eval_rewards/margins": 0.29185014963150024, "eval_rewards/rejected": 0.8115164637565613, "eval_runtime": 433.3406, "eval_samples_per_second": 4.615, "eval_steps_per_second": 1.154, "step": 2080 }, { "epoch": 0.55, "learning_rate": 2.527412999094507e-06, "logits/chosen": 0.04813632741570473, "logits/rejected": 0.13193252682685852, "logps/chosen": -3592.100341796875, "logps/rejected": -3212.26904296875, "loss": 0.6131, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.608589768409729, "rewards/margins": 0.3390997350215912, "rewards/rejected": 1.2694900035858154, "step": 2090 }, { "epoch": 0.55, "learning_rate": 2.504568922200064e-06, "logits/chosen": 0.13469159603118896, "logits/rejected": 0.1818404495716095, "logps/chosen": -2952.713623046875, "logps/rejected": -2406.609130859375, "loss": 0.6511, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.1789261102676392, "rewards/margins": 0.286815345287323, "rewards/rejected": 0.8921108245849609, "step": 2100 }, { "epoch": 0.55, "eval_logits/chosen": 0.05289905518293381, "eval_logits/rejected": 0.1577528864145279, "eval_logps/chosen": -3111.312255859375, "eval_logps/rejected": -2648.950927734375, "eval_loss": 0.6494833827018738, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.3362112045288086, "eval_rewards/margins": 0.319529265165329, "eval_rewards/rejected": 1.0166819095611572, "eval_runtime": 430.6242, "eval_samples_per_second": 4.644, "eval_steps_per_second": 1.161, "step": 2100 }, { "epoch": 0.55, "learning_rate": 2.4817244638019333e-06, "logits/chosen": 0.07212768495082855, "logits/rejected": 0.13466773927211761, "logps/chosen": -3026.56201171875, "logps/rejected": -2747.56298828125, "loss": 0.6421, "rewards/accuracies": 0.625, "rewards/chosen": 1.2377313375473022, "rewards/margins": 0.2553738057613373, "rewards/rejected": 0.9823576807975769, "step": 2110 }, { "epoch": 0.55, "learning_rate": 2.4588815314058155e-06, "logits/chosen": 0.1251976042985916, "logits/rejected": 0.22235138714313507, "logps/chosen": -2942.7861328125, "logps/rejected": -2567.19482421875, "loss": 0.6457, "rewards/accuracies": 0.5625, "rewards/chosen": 1.2061235904693604, "rewards/margins": 0.3019629120826721, "rewards/rejected": 0.9041604995727539, "step": 2120 }, { "epoch": 0.55, "eval_logits/chosen": 0.024196593090891838, "eval_logits/rejected": 0.12969060242176056, "eval_logps/chosen": -3104.77490234375, "eval_logps/rejected": -2642.482666015625, "eval_loss": 0.650678813457489, "eval_rewards/accuracies": 0.6299999952316284, "eval_rewards/chosen": 1.401583194732666, "eval_rewards/margins": 0.3202188014984131, "eval_rewards/rejected": 1.081364393234253, "eval_runtime": 434.8482, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 2120 }, { "epoch": 0.56, "learning_rate": 2.4360420323899922e-06, "logits/chosen": 0.08713003247976303, "logits/rejected": 0.08628582954406738, "logps/chosen": -3339.127685546875, "logps/rejected": -3283.711669921875, "loss": 0.6413, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.5193580389022827, "rewards/margins": 0.27459102869033813, "rewards/rejected": 1.2447670698165894, "step": 2130 }, { "epoch": 0.56, "learning_rate": 2.4132078738460585e-06, "logits/chosen": 0.14204150438308716, "logits/rejected": 0.12824349105358124, "logps/chosen": -2885.69970703125, "logps/rejected": -2757.631591796875, "loss": 0.6444, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.0270668268203735, "rewards/margins": 0.19756512343883514, "rewards/rejected": 0.8295016288757324, "step": 2140 }, { "epoch": 0.56, "eval_logits/chosen": 0.08376435935497284, "eval_logits/rejected": 0.18691790103912354, "eval_logps/chosen": -3145.85107421875, "eval_logps/rejected": -2678.1279296875, "eval_loss": 0.6481114625930786, "eval_rewards/accuracies": 0.6460000276565552, "eval_rewards/chosen": 0.9908216595649719, "eval_rewards/margins": 0.26590749621391296, "eval_rewards/rejected": 0.7249141931533813, "eval_runtime": 430.8001, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 2140 }, { "epoch": 0.56, "learning_rate": 2.3903809624196826e-06, "logits/chosen": 0.07288821041584015, "logits/rejected": 0.15483590960502625, "logps/chosen": -3299.91015625, "logps/rejected": -2999.813720703125, "loss": 0.6343, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.0453672409057617, "rewards/margins": 0.1928001195192337, "rewards/rejected": 0.8525670170783997, "step": 2150 }, { "epoch": 0.57, "learning_rate": 2.3675632041513978e-06, "logits/chosen": 0.0943610742688179, "logits/rejected": 0.07643449306488037, "logps/chosen": -3471.84619140625, "logps/rejected": -3265.229248046875, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": 1.3719606399536133, "rewards/margins": 0.273048460483551, "rewards/rejected": 1.098912239074707, "step": 2160 }, { "epoch": 0.57, "eval_logits/chosen": 0.04631989821791649, "eval_logits/rejected": 0.15207552909851074, "eval_logps/chosen": -3127.82861328125, "eval_logps/rejected": -2662.7958984375, "eval_loss": 0.6468932032585144, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.1710470914840698, "eval_rewards/margins": 0.29281410574913025, "eval_rewards/rejected": 0.878233015537262, "eval_runtime": 433.8016, "eval_samples_per_second": 4.61, "eval_steps_per_second": 1.153, "step": 2160 }, { "epoch": 0.57, "learning_rate": 2.3447565043174533e-06, "logits/chosen": 0.12128611654043198, "logits/rejected": 0.11309567838907242, "logps/chosen": -3288.78076171875, "logps/rejected": -2761.48291015625, "loss": 0.6052, "rewards/accuracies": 0.5625, "rewards/chosen": 1.1160128116607666, "rewards/margins": 0.19696560502052307, "rewards/rejected": 0.9190472364425659, "step": 2170 }, { "epoch": 0.57, "learning_rate": 2.321962767270724e-06, "logits/chosen": 0.14467069506645203, "logits/rejected": 0.18251073360443115, "logps/chosen": -2764.28564453125, "logps/rejected": -2455.669189453125, "loss": 0.7217, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.8048734664916992, "rewards/margins": 0.03892925754189491, "rewards/rejected": 0.7659441828727722, "step": 2180 }, { "epoch": 0.57, "eval_logits/chosen": 0.08977630734443665, "eval_logits/rejected": 0.19278757274150848, "eval_logps/chosen": -3157.906494140625, "eval_logps/rejected": -2688.28076171875, "eval_loss": 0.6496227979660034, "eval_rewards/accuracies": 0.640999972820282, "eval_rewards/chosen": 0.8702697157859802, "eval_rewards/margins": 0.24688567221164703, "eval_rewards/rejected": 0.6233841180801392, "eval_runtime": 432.5785, "eval_samples_per_second": 4.623, "eval_steps_per_second": 1.156, "step": 2180 }, { "epoch": 0.57, "learning_rate": 2.299183896281692e-06, "logits/chosen": 0.11981002241373062, "logits/rejected": 0.20698182284832, "logps/chosen": -3219.846435546875, "logps/rejected": -2736.53125, "loss": 0.6208, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.8445932269096375, "rewards/margins": 0.2988007366657257, "rewards/rejected": 0.5457924008369446, "step": 2190 }, { "epoch": 0.58, "learning_rate": 2.2764217933795297e-06, "logits/chosen": 0.05899732559919357, "logits/rejected": 0.13044658303260803, "logps/chosen": -3448.79345703125, "logps/rejected": -2737.903076171875, "loss": 0.7032, "rewards/accuracies": 0.5, "rewards/chosen": 1.1732314825057983, "rewards/margins": 0.16456131637096405, "rewards/rejected": 1.0086700916290283, "step": 2200 }, { "epoch": 0.58, "eval_logits/chosen": 0.014218594878911972, "eval_logits/rejected": 0.12114160507917404, "eval_logps/chosen": -3115.688720703125, "eval_logps/rejected": -2652.31591796875, "eval_loss": 0.6462064385414124, "eval_rewards/accuracies": 0.6349999904632568, "eval_rewards/chosen": 1.292444109916687, "eval_rewards/margins": 0.3094101846218109, "eval_rewards/rejected": 0.9830338954925537, "eval_runtime": 431.4384, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 2200 }, { "epoch": 0.58, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -0.037935029715299606, "logits/rejected": 0.10359905660152435, "logps/chosen": -3100.48583984375, "logps/rejected": -2533.025390625, "loss": 0.6103, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.2553845643997192, "rewards/margins": 0.30527639389038086, "rewards/rejected": 0.9501080513000488, "step": 2210 }, { "epoch": 0.58, "learning_rate": 2.230955492793149e-06, "logits/chosen": -0.011008324101567268, "logits/rejected": 0.08892561495304108, "logps/chosen": -2913.0478515625, "logps/rejected": -2683.133544921875, "loss": 0.729, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.5283232927322388, "rewards/margins": 0.16127941012382507, "rewards/rejected": 1.3670438528060913, "step": 2220 }, { "epoch": 0.58, "eval_logits/chosen": -0.04715050011873245, "eval_logits/rejected": 0.060948487371206284, "eval_logps/chosen": -3073.691162109375, "eval_logps/rejected": -2616.137939453125, "eval_loss": 0.66034996509552, "eval_rewards/accuracies": 0.6340000033378601, "eval_rewards/chosen": 1.7124191522598267, "eval_rewards/margins": 0.3676074743270874, "eval_rewards/rejected": 1.3448117971420288, "eval_runtime": 434.7271, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 2220 }, { "epoch": 0.58, "learning_rate": 2.208255091531947e-06, "logits/chosen": -0.10482142865657806, "logits/rejected": 0.008805966936051846, "logps/chosen": -3216.06689453125, "logps/rejected": -2836.590087890625, "loss": 0.5958, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.4277985095977783, "rewards/margins": 0.41961798071861267, "rewards/rejected": 1.0081806182861328, "step": 2230 }, { "epoch": 0.59, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -0.004101318307220936, "logits/rejected": 0.11256043612957001, "logps/chosen": -3272.341552734375, "logps/rejected": -2390.12353515625, "loss": 0.6496, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3086886405944824, "rewards/margins": 0.30239319801330566, "rewards/rejected": 1.0062954425811768, "step": 2240 }, { "epoch": 0.59, "eval_logits/chosen": 0.034921858459711075, "eval_logits/rejected": 0.14047324657440186, "eval_logps/chosen": -3115.1220703125, "eval_logps/rejected": -2652.55810546875, "eval_loss": 0.6474931836128235, "eval_rewards/accuracies": 0.6439999938011169, "eval_rewards/chosen": 1.2981125116348267, "eval_rewards/margins": 0.3174985349178314, "eval_rewards/rejected": 0.9806140661239624, "eval_runtime": 430.5768, "eval_samples_per_second": 4.645, "eval_steps_per_second": 1.161, "step": 2240 }, { "epoch": 0.59, "learning_rate": 2.162929264300107e-06, "logits/chosen": 0.06483012437820435, "logits/rejected": 0.10482664406299591, "logps/chosen": -3012.14990234375, "logps/rejected": -2522.513427734375, "loss": 0.6019, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2253971099853516, "rewards/margins": 0.38450688123703003, "rewards/rejected": 0.840890109539032, "step": 2250 }, { "epoch": 0.59, "learning_rate": 2.1403076230230006e-06, "logits/chosen": 0.021324966102838516, "logits/rejected": 0.0925346165895462, "logps/chosen": -3163.22412109375, "logps/rejected": -2812.01708984375, "loss": 0.6615, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.4888947010040283, "rewards/margins": 0.399888277053833, "rewards/rejected": 1.0890061855316162, "step": 2260 }, { "epoch": 0.59, "eval_logits/chosen": 0.046376362442970276, "eval_logits/rejected": 0.15162834525108337, "eval_logps/chosen": -3111.0693359375, "eval_logps/rejected": -2649.958740234375, "eval_loss": 0.6475638747215271, "eval_rewards/accuracies": 0.6449999809265137, "eval_rewards/chosen": 1.3386390209197998, "eval_rewards/margins": 0.3320358097553253, "eval_rewards/rejected": 1.0066033601760864, "eval_runtime": 434.9713, "eval_samples_per_second": 4.598, "eval_steps_per_second": 1.15, "step": 2260 }, { "epoch": 0.59, "learning_rate": 2.11771601595586e-06, "logits/chosen": -0.033201463520526886, "logits/rejected": 0.0651741623878479, "logps/chosen": -3414.555419921875, "logps/rejected": -2966.548828125, "loss": 0.6636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.5030813217163086, "rewards/margins": 0.25987714529037476, "rewards/rejected": 1.2432042360305786, "step": 2270 }, { "epoch": 0.6, "learning_rate": 2.0951563294913737e-06, "logits/chosen": 0.05749652907252312, "logits/rejected": 0.15865135192871094, "logps/chosen": -2950.91748046875, "logps/rejected": -2696.44970703125, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 1.2632497549057007, "rewards/margins": 0.3450451195240021, "rewards/rejected": 0.9182047843933105, "step": 2280 }, { "epoch": 0.6, "eval_logits/chosen": 0.10828562080860138, "eval_logits/rejected": 0.21008984744548798, "eval_logps/chosen": -3144.54736328125, "eval_logps/rejected": -2678.962646484375, "eval_loss": 0.6457942128181458, "eval_rewards/accuracies": 0.6520000100135803, "eval_rewards/chosen": 1.0038599967956543, "eval_rewards/margins": 0.2872925400733948, "eval_rewards/rejected": 0.7165673971176147, "eval_runtime": 431.9546, "eval_samples_per_second": 4.63, "eval_steps_per_second": 1.158, "step": 2280 }, { "epoch": 0.6, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -0.0011093526845797896, "logits/rejected": 0.14498022198677063, "logps/chosen": -3514.94873046875, "logps/rejected": -2620.61474609375, "loss": 0.619, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 1.1506710052490234, "rewards/margins": 0.46578025817871094, "rewards/rejected": 0.6848907470703125, "step": 2290 }, { "epoch": 0.6, "learning_rate": 2.050140250457023e-06, "logits/chosen": 0.1284598410129547, "logits/rejected": 0.1739518791437149, "logps/chosen": -3194.052734375, "logps/rejected": -2728.483154296875, "loss": 0.6604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.9679597616195679, "rewards/margins": 0.332414448261261, "rewards/rejected": 0.6355453729629517, "step": 2300 }, { "epoch": 0.6, "eval_logits/chosen": 0.11187522858381271, "eval_logits/rejected": 0.21232187747955322, "eval_logps/chosen": -3147.330078125, "eval_logps/rejected": -2681.348388671875, "eval_loss": 0.6468117237091064, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": 0.9760299921035767, "eval_rewards/margins": 0.2833207845687866, "eval_rewards/rejected": 0.69270920753479, "eval_runtime": 432.4159, "eval_samples_per_second": 4.625, "eval_steps_per_second": 1.156, "step": 2300 }, { "epoch": 0.6, "learning_rate": 2.0276876167168042e-06, "logits/chosen": 0.22097060084342957, "logits/rejected": 0.25845959782600403, "logps/chosen": -2894.429931640625, "logps/rejected": -2625.072265625, "loss": 0.669, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.826924204826355, "rewards/margins": 0.10632209479808807, "rewards/rejected": 0.7206020355224609, "step": 2310 }, { "epoch": 0.61, "learning_rate": 2.0052744209246682e-06, "logits/chosen": 0.1568722426891327, "logits/rejected": 0.18195810914039612, "logps/chosen": -2841.515625, "logps/rejected": -2742.08154296875, "loss": 0.6762, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8409513235092163, "rewards/margins": 0.11866055428981781, "rewards/rejected": 0.7222907543182373, "step": 2320 }, { "epoch": 0.61, "eval_logits/chosen": 0.07506469637155533, "eval_logits/rejected": 0.17641988396644592, "eval_logps/chosen": -3122.62158203125, "eval_logps/rejected": -2660.251953125, "eval_loss": 0.6451495885848999, "eval_rewards/accuracies": 0.6539999842643738, "eval_rewards/chosen": 1.2231144905090332, "eval_rewards/margins": 0.3194419741630554, "eval_rewards/rejected": 0.9036726355552673, "eval_runtime": 434.3547, "eval_samples_per_second": 4.605, "eval_steps_per_second": 1.151, "step": 2320 }, { "epoch": 0.61, "learning_rate": 1.9829025345760127e-06, "logits/chosen": 0.1837071180343628, "logits/rejected": 0.21800890564918518, "logps/chosen": -2441.47802734375, "logps/rejected": -2437.36865234375, "loss": 0.6442, "rewards/accuracies": 0.625, "rewards/chosen": 0.95319002866745, "rewards/margins": 0.22535128891468048, "rewards/rejected": 0.7278387546539307, "step": 2330 }, { "epoch": 0.61, "learning_rate": 1.9605738257169115e-06, "logits/chosen": 0.02032051794230938, "logits/rejected": 0.16253602504730225, "logps/chosen": -2978.87744140625, "logps/rejected": -2492.47216796875, "loss": 0.6687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9788349866867065, "rewards/margins": 0.18644332885742188, "rewards/rejected": 0.7923917174339294, "step": 2340 }, { "epoch": 0.61, "eval_logits/chosen": 0.10600078850984573, "eval_logits/rejected": 0.2062830775976181, "eval_logps/chosen": -3140.226318359375, "eval_logps/rejected": -2675.71240234375, "eval_loss": 0.6448391675949097, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.0470685958862305, "eval_rewards/margins": 0.298001766204834, "eval_rewards/rejected": 0.7490668892860413, "eval_runtime": 430.7997, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 2340 }, { "epoch": 0.62, "learning_rate": 1.9382901587881275e-06, "logits/chosen": 0.004125489387661219, "logits/rejected": 0.06791798025369644, "logps/chosen": -3325.80615234375, "logps/rejected": -2948.88427734375, "loss": 0.6746, "rewards/accuracies": 0.625, "rewards/chosen": 0.9812939763069153, "rewards/margins": 0.24483951926231384, "rewards/rejected": 0.736454427242279, "step": 2350 }, { "epoch": 0.62, "learning_rate": 1.916053394469437e-06, "logits/chosen": 0.1363631933927536, "logits/rejected": 0.22492682933807373, "logps/chosen": -3089.32275390625, "logps/rejected": -2540.4052734375, "loss": 0.6154, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.1376001834869385, "rewards/margins": 0.31328344345092773, "rewards/rejected": 0.8243168592453003, "step": 2360 }, { "epoch": 0.62, "eval_logits/chosen": 0.050938159227371216, "eval_logits/rejected": 0.15186242759227753, "eval_logps/chosen": -3108.32568359375, "eval_logps/rejected": -2648.18310546875, "eval_loss": 0.6459988951683044, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": 1.366073489189148, "eval_rewards/margins": 0.34171316027641296, "eval_rewards/rejected": 1.0243604183197021, "eval_runtime": 435.3618, "eval_samples_per_second": 4.594, "eval_steps_per_second": 1.148, "step": 2360 }, { "epoch": 0.62, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -0.03477499634027481, "logits/rejected": 0.09535155445337296, "logps/chosen": -3485.15966796875, "logps/rejected": -2760.919921875, "loss": 0.6317, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.5493053197860718, "rewards/margins": 0.3483043313026428, "rewards/rejected": 1.2010010480880737, "step": 2370 }, { "epoch": 0.62, "learning_rate": 1.8717279966446267e-06, "logits/chosen": 0.036048371344804764, "logits/rejected": 0.07624435424804688, "logps/chosen": -3046.197998046875, "logps/rejected": -3003.85888671875, "loss": 0.712, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.561551809310913, "rewards/margins": 0.2907237112522125, "rewards/rejected": 1.2708282470703125, "step": 2380 }, { "epoch": 0.62, "eval_logits/chosen": 0.03968896344304085, "eval_logits/rejected": 0.14003460109233856, "eval_logps/chosen": -3095.83642578125, "eval_logps/rejected": -2637.656005859375, "eval_loss": 0.6491053700447083, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": 1.4909660816192627, "eval_rewards/margins": 0.36133280396461487, "eval_rewards/rejected": 1.1296333074569702, "eval_runtime": 431.9704, "eval_samples_per_second": 4.63, "eval_steps_per_second": 1.157, "step": 2380 }, { "epoch": 0.63, "learning_rate": 1.8496430642964698e-06, "logits/chosen": 0.11335577070713043, "logits/rejected": 0.15954935550689697, "logps/chosen": -3163.934326171875, "logps/rejected": -3170.837646484375, "loss": 0.6882, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.2582193613052368, "rewards/margins": 0.15143504738807678, "rewards/rejected": 1.1067842245101929, "step": 2390 }, { "epoch": 0.63, "learning_rate": 1.827612436565286e-06, "logits/chosen": 0.07212762534618378, "logits/rejected": 0.15457303822040558, "logps/chosen": -3069.61279296875, "logps/rejected": -2909.951171875, "loss": 0.675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.0234498977661133, "rewards/margins": 0.18220865726470947, "rewards/rejected": 0.8412412405014038, "step": 2400 }, { "epoch": 0.63, "eval_logits/chosen": 0.1330627202987671, "eval_logits/rejected": 0.23178908228874207, "eval_logps/chosen": -3155.9833984375, "eval_logps/rejected": -2689.152099609375, "eval_loss": 0.6466850638389587, "eval_rewards/accuracies": 0.6510000228881836, "eval_rewards/chosen": 0.8894966244697571, "eval_rewards/margins": 0.27482500672340393, "eval_rewards/rejected": 0.6146717667579651, "eval_runtime": 433.4189, "eval_samples_per_second": 4.614, "eval_steps_per_second": 1.154, "step": 2400 }, { "epoch": 0.63, "learning_rate": 1.8056379530021492e-06, "logits/chosen": 0.05798906087875366, "logits/rejected": 0.2562600076198578, "logps/chosen": -3251.07861328125, "logps/rejected": -2334.857421875, "loss": 0.6297, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.9605962038040161, "rewards/margins": 0.40911731123924255, "rewards/rejected": 0.551478922367096, "step": 2410 }, { "epoch": 0.63, "learning_rate": 1.7837214484701154e-06, "logits/chosen": 0.10648622363805771, "logits/rejected": 0.15394584834575653, "logps/chosen": -2991.80322265625, "logps/rejected": -3095.5185546875, "loss": 0.6251, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.775627076625824, "rewards/margins": 0.1460181474685669, "rewards/rejected": 0.6296090483665466, "step": 2420 }, { "epoch": 0.63, "eval_logits/chosen": 0.14036604762077332, "eval_logits/rejected": 0.23774415254592896, "eval_logps/chosen": -3152.841552734375, "eval_logps/rejected": -2686.547119140625, "eval_loss": 0.6457948088645935, "eval_rewards/accuracies": 0.6539999842643738, "eval_rewards/chosen": 0.920920729637146, "eval_rewards/margins": 0.28019729256629944, "eval_rewards/rejected": 0.6407234072685242, "eval_runtime": 433.6972, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 2420 }, { "epoch": 0.64, "learning_rate": 1.7618647529910043e-06, "logits/chosen": 0.20348486304283142, "logits/rejected": 0.1962747573852539, "logps/chosen": -2948.943359375, "logps/rejected": -2643.252685546875, "loss": 0.6545, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.8393770456314087, "rewards/margins": 0.23704111576080322, "rewards/rejected": 0.6023359298706055, "step": 2430 }, { "epoch": 0.64, "learning_rate": 1.7400696915925996e-06, "logits/chosen": 0.04657715559005737, "logits/rejected": 0.12928518652915955, "logps/chosen": -3437.40185546875, "logps/rejected": -2935.17529296875, "loss": 0.58, "rewards/accuracies": 0.625, "rewards/chosen": 0.9104098081588745, "rewards/margins": 0.26738661527633667, "rewards/rejected": 0.6430231928825378, "step": 2440 }, { "epoch": 0.64, "eval_logits/chosen": 0.11617037653923035, "eval_logits/rejected": 0.21401867270469666, "eval_logps/chosen": -3141.86962890625, "eval_logps/rejected": -2676.988525390625, "eval_loss": 0.6450735926628113, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.0306344032287598, "eval_rewards/margins": 0.29432639479637146, "eval_rewards/rejected": 0.7363079786300659, "eval_runtime": 430.841, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.161, "step": 2440 }, { "epoch": 0.64, "learning_rate": 1.718338084156254e-06, "logits/chosen": 0.1005939468741417, "logits/rejected": 0.21812936663627625, "logps/chosen": -3698.647216796875, "logps/rejected": -2997.23828125, "loss": 0.6451, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.4306066036224365, "rewards/margins": 0.37357065081596375, "rewards/rejected": 1.0570359230041504, "step": 2450 }, { "epoch": 0.64, "learning_rate": 1.6966717452649372e-06, "logits/chosen": 0.1400797814130783, "logits/rejected": 0.2531244456768036, "logps/chosen": -2717.71484375, "logps/rejected": -2200.76416015625, "loss": 0.6538, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.023131012916565, "rewards/margins": 0.26219961047172546, "rewards/rejected": 0.7609313726425171, "step": 2460 }, { "epoch": 0.64, "eval_logits/chosen": 0.043561920523643494, "eval_logits/rejected": 0.14315102994441986, "eval_logps/chosen": -3093.6923828125, "eval_logps/rejected": -2636.25390625, "eval_loss": 0.6477198004722595, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 1.5124059915542603, "eval_rewards/margins": 0.3687518537044525, "eval_rewards/rejected": 1.143654227256775, "eval_runtime": 435.3433, "eval_samples_per_second": 4.594, "eval_steps_per_second": 1.149, "step": 2460 }, { "epoch": 0.65, "learning_rate": 1.6750724840517103e-06, "logits/chosen": 0.07987643778324127, "logits/rejected": 0.11500252783298492, "logps/chosen": -3045.87255859375, "logps/rejected": -2765.31005859375, "loss": 0.6392, "rewards/accuracies": 0.6875, "rewards/chosen": 1.5031098127365112, "rewards/margins": 0.4152649939060211, "rewards/rejected": 1.087844967842102, "step": 2470 }, { "epoch": 0.65, "learning_rate": 1.6535421040486686e-06, "logits/chosen": 0.15257391333580017, "logits/rejected": 0.14436331391334534, "logps/chosen": -2876.944580078125, "logps/rejected": -2796.17919921875, "loss": 0.6741, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.3678780794143677, "rewards/margins": 0.3369705080986023, "rewards/rejected": 1.0309075117111206, "step": 2480 }, { "epoch": 0.65, "eval_logits/chosen": 0.08561350405216217, "eval_logits/rejected": 0.18462395668029785, "eval_logps/chosen": -3124.2099609375, "eval_logps/rejected": -2662.59716796875, "eval_loss": 0.6436476707458496, "eval_rewards/accuracies": 0.6460000276565552, "eval_rewards/chosen": 1.207233190536499, "eval_rewards/margins": 0.3270133137702942, "eval_rewards/rejected": 0.8802199363708496, "eval_runtime": 430.6828, "eval_samples_per_second": 4.644, "eval_steps_per_second": 1.161, "step": 2480 }, { "epoch": 0.65, "learning_rate": 1.6320824030363458e-06, "logits/chosen": 0.12491425126791, "logits/rejected": 0.11403782665729523, "logps/chosen": -2730.929443359375, "logps/rejected": -2779.236572265625, "loss": 0.6361, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.059452772140503, "rewards/margins": 0.22805866599082947, "rewards/rejected": 0.8313941955566406, "step": 2490 }, { "epoch": 0.65, "learning_rate": 1.6106951728936028e-06, "logits/chosen": 0.17049428820610046, "logits/rejected": 0.2290641963481903, "logps/chosen": -3159.897705078125, "logps/rejected": -2696.354248046875, "loss": 0.6109, "rewards/accuracies": 0.6875, "rewards/chosen": 1.406048059463501, "rewards/margins": 0.47405433654785156, "rewards/rejected": 0.9319936633110046, "step": 2500 }, { "epoch": 0.65, "eval_logits/chosen": 0.09821795672178268, "eval_logits/rejected": 0.19686919450759888, "eval_logps/chosen": -3132.590087890625, "eval_logps/rejected": -2669.93994140625, "eval_loss": 0.6447327733039856, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 1.1234303712844849, "eval_rewards/margins": 0.31663763523101807, "eval_rewards/rejected": 0.806792676448822, "eval_runtime": 434.3767, "eval_samples_per_second": 4.604, "eval_steps_per_second": 1.151, "step": 2500 }, { "epoch": 0.66, "learning_rate": 1.5893821994479996e-06, "logits/chosen": 0.10634864866733551, "logits/rejected": 0.24396662414073944, "logps/chosen": -3185.33740234375, "logps/rejected": -2395.65087890625, "loss": 0.6823, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.1171600818634033, "rewards/margins": 0.1202349066734314, "rewards/rejected": 0.9969251751899719, "step": 2510 }, { "epoch": 0.66, "learning_rate": 1.5681452623266868e-06, "logits/chosen": 0.17572340369224548, "logits/rejected": 0.23831430077552795, "logps/chosen": -2995.54296875, "logps/rejected": -2792.13818359375, "loss": 0.6749, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3150266408920288, "rewards/margins": 0.30383336544036865, "rewards/rejected": 1.0111933946609497, "step": 2520 }, { "epoch": 0.66, "eval_logits/chosen": 0.09724757820367813, "eval_logits/rejected": 0.19246171414852142, "eval_logps/chosen": -3124.983642578125, "eval_logps/rejected": -2663.830810546875, "eval_loss": 0.6447495222091675, "eval_rewards/accuracies": 0.6470000147819519, "eval_rewards/chosen": 1.1994941234588623, "eval_rewards/margins": 0.3316074013710022, "eval_rewards/rejected": 0.8678867220878601, "eval_runtime": 433.3976, "eval_samples_per_second": 4.615, "eval_steps_per_second": 1.154, "step": 2520 }, { "epoch": 0.66, "learning_rate": 1.5469861348078014e-06, "logits/chosen": 0.04054888337850571, "logits/rejected": 0.13890059292316437, "logps/chosen": -3716.65771484375, "logps/rejected": -3132.71923828125, "loss": 0.6366, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.3155286312103271, "rewards/margins": 0.30679869651794434, "rewards/rejected": 1.0087300539016724, "step": 2530 }, { "epoch": 0.66, "learning_rate": 1.5259065836724035e-06, "logits/chosen": 0.058126628398895264, "logits/rejected": 0.10460863262414932, "logps/chosen": -3058.0087890625, "logps/rejected": -2867.950439453125, "loss": 0.6524, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8837096095085144, "rewards/margins": 0.24623966217041016, "rewards/rejected": 0.6374698877334595, "step": 2540 }, { "epoch": 0.66, "eval_logits/chosen": 0.11339578032493591, "eval_logits/rejected": 0.20642559230327606, "eval_logps/chosen": -3132.639404296875, "eval_logps/rejected": -2670.28662109375, "eval_loss": 0.6448596119880676, "eval_rewards/accuracies": 0.6430000066757202, "eval_rewards/chosen": 1.122936487197876, "eval_rewards/margins": 0.319610059261322, "eval_rewards/rejected": 0.803326427936554, "eval_runtime": 432.4732, "eval_samples_per_second": 4.625, "eval_steps_per_second": 1.156, "step": 2540 }, { "epoch": 0.67, "learning_rate": 1.5049083690569456e-06, "logits/chosen": 0.19669227302074432, "logits/rejected": 0.17935971915721893, "logps/chosen": -2936.17578125, "logps/rejected": -2770.346435546875, "loss": 0.6438, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0138237476348877, "rewards/margins": 0.31398168206214905, "rewards/rejected": 0.6998418569564819, "step": 2550 }, { "epoch": 0.67, "learning_rate": 1.4839932443063057e-06, "logits/chosen": 0.25771528482437134, "logits/rejected": 0.22266927361488342, "logps/chosen": -2713.4814453125, "logps/rejected": -2466.1953125, "loss": 0.6155, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8439112901687622, "rewards/margins": 0.2998080551624298, "rewards/rejected": 0.5441033244132996, "step": 2560 }, { "epoch": 0.67, "eval_logits/chosen": 0.08197551965713501, "eval_logits/rejected": 0.176564022898674, "eval_logps/chosen": -3115.648681640625, "eval_logps/rejected": -2655.2099609375, "eval_loss": 0.6444838643074036, "eval_rewards/accuracies": 0.6489999890327454, "eval_rewards/chosen": 1.292845368385315, "eval_rewards/margins": 0.3387540280818939, "eval_rewards/rejected": 0.9540913701057434, "eval_runtime": 435.3319, "eval_samples_per_second": 4.594, "eval_steps_per_second": 1.149, "step": 2560 }, { "epoch": 0.67, "learning_rate": 1.4631629558273803e-06, "logits/chosen": 0.004955637268722057, "logits/rejected": 0.11759240925312042, "logps/chosen": -3666.436767578125, "logps/rejected": -3131.982421875, "loss": 0.6593, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.2341853380203247, "rewards/margins": 0.19638477265834808, "rewards/rejected": 1.0378005504608154, "step": 2570 }, { "epoch": 0.68, "learning_rate": 1.4424192429432657e-06, "logits/chosen": 0.15930558741092682, "logits/rejected": 0.16263648867607117, "logps/chosen": -3107.844970703125, "logps/rejected": -2984.102783203125, "loss": 0.6498, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1205838918685913, "rewards/margins": 0.18836435675621033, "rewards/rejected": 0.9322196841239929, "step": 2580 }, { "epoch": 0.68, "eval_logits/chosen": 0.057882338762283325, "eval_logits/rejected": 0.15365931391716003, "eval_logps/chosen": -3104.314208984375, "eval_logps/rejected": -2645.69580078125, "eval_loss": 0.6460149884223938, "eval_rewards/accuracies": 0.652999997138977, "eval_rewards/chosen": 1.40618896484375, "eval_rewards/margins": 0.35695451498031616, "eval_rewards/rejected": 1.0492345094680786, "eval_runtime": 430.9531, "eval_samples_per_second": 4.641, "eval_steps_per_second": 1.16, "step": 2580 }, { "epoch": 0.68, "learning_rate": 1.421763837748016e-06, "logits/chosen": -0.0022935927845537663, "logits/rejected": 0.09144644439220428, "logps/chosen": -3264.98486328125, "logps/rejected": -2706.81787109375, "loss": 0.5925, "rewards/accuracies": 0.6875, "rewards/chosen": 1.3181378841400146, "rewards/margins": 0.3723452091217041, "rewards/rejected": 0.9457927942276001, "step": 2590 }, { "epoch": 0.68, "learning_rate": 1.401198464962021e-06, "logits/chosen": -0.030201178044080734, "logits/rejected": 0.022038141265511513, "logps/chosen": -3062.179443359375, "logps/rejected": -2931.1298828125, "loss": 0.6205, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.4214913845062256, "rewards/margins": 0.28339818120002747, "rewards/rejected": 1.13809335231781, "step": 2600 }, { "epoch": 0.68, "eval_logits/chosen": 0.04554048180580139, "eval_logits/rejected": 0.14262239634990692, "eval_logps/chosen": -3103.1826171875, "eval_logps/rejected": -2644.53515625, "eval_loss": 0.6453396081924438, "eval_rewards/accuracies": 0.6499999761581421, "eval_rewards/chosen": 1.4175071716308594, "eval_rewards/margins": 0.3566676676273346, "eval_rewards/rejected": 1.0608397722244263, "eval_runtime": 435.573, "eval_samples_per_second": 4.592, "eval_steps_per_second": 1.148, "step": 2600 }, { "epoch": 0.68, "learning_rate": 1.3807248417879896e-06, "logits/chosen": 0.0720294862985611, "logits/rejected": 0.1391828954219818, "logps/chosen": -2709.7734375, "logps/rejected": -2427.530517578125, "loss": 0.6634, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.1152108907699585, "rewards/margins": 0.18765218555927277, "rewards/rejected": 0.9275587201118469, "step": 2610 }, { "epoch": 0.69, "learning_rate": 1.3603446777675665e-06, "logits/chosen": 0.035583239048719406, "logits/rejected": 0.13367043435573578, "logps/chosen": -2965.286865234375, "logps/rejected": -2568.93017578125, "loss": 0.6644, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.0433275699615479, "rewards/margins": 0.19460991024971008, "rewards/rejected": 0.8487176895141602, "step": 2620 }, { "epoch": 0.69, "eval_logits/chosen": 0.0735672190785408, "eval_logits/rejected": 0.16903966665267944, "eval_logps/chosen": -3118.31103515625, "eval_logps/rejected": -2657.253662109375, "eval_loss": 0.643825352191925, "eval_rewards/accuracies": 0.6520000100135803, "eval_rewards/chosen": 1.2662217617034912, "eval_rewards/margins": 0.33256688714027405, "eval_rewards/rejected": 0.9336549043655396, "eval_runtime": 432.3432, "eval_samples_per_second": 4.626, "eval_steps_per_second": 1.156, "step": 2620 }, { "epoch": 0.69, "learning_rate": 1.3400596746385817e-06, "logits/chosen": 0.14778029918670654, "logits/rejected": 0.2360314428806305, "logps/chosen": -3020.750732421875, "logps/rejected": -2715.0810546875, "loss": 0.6417, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.135032057762146, "rewards/margins": 0.3614581525325775, "rewards/rejected": 0.7735738158226013, "step": 2630 }, { "epoch": 0.69, "learning_rate": 1.3198715261929587e-06, "logits/chosen": 0.1426887959241867, "logits/rejected": 0.20884127914905548, "logps/chosen": -2941.52294921875, "logps/rejected": -2602.294189453125, "loss": 0.6403, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8650339841842651, "rewards/margins": 0.17887739837169647, "rewards/rejected": 0.6861566305160522, "step": 2640 }, { "epoch": 0.69, "eval_logits/chosen": 0.15266598761081696, "eval_logits/rejected": 0.24643154442310333, "eval_logps/chosen": -3161.305908203125, "eval_logps/rejected": -2694.70849609375, "eval_loss": 0.6467121839523315, "eval_rewards/accuracies": 0.652999997138977, "eval_rewards/chosen": 0.8362707495689392, "eval_rewards/margins": 0.27716130018234253, "eval_rewards/rejected": 0.5591094493865967, "eval_runtime": 433.8659, "eval_samples_per_second": 4.61, "eval_steps_per_second": 1.152, "step": 2640 }, { "epoch": 0.69, "learning_rate": 1.2997819181352823e-06, "logits/chosen": 0.27842575311660767, "logits/rejected": 0.3141182065010071, "logps/chosen": -2817.89111328125, "logps/rejected": -2384.989501953125, "loss": 0.6099, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.715628981590271, "rewards/margins": 0.35152608156204224, "rewards/rejected": 0.364102840423584, "step": 2650 }, { "epoch": 0.7, "learning_rate": 1.2797925279420454e-06, "logits/chosen": 0.22730746865272522, "logits/rejected": 0.3021041750907898, "logps/chosen": -2992.47216796875, "logps/rejected": -2587.72802734375, "loss": 0.6697, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6642982363700867, "rewards/margins": 0.11359486728906631, "rewards/rejected": 0.5507034063339233, "step": 2660 }, { "epoch": 0.7, "eval_logits/chosen": 0.17614012956619263, "eval_logits/rejected": 0.2697617709636688, "eval_logps/chosen": -3172.23095703125, "eval_logps/rejected": -2704.8720703125, "eval_loss": 0.6504831910133362, "eval_rewards/accuracies": 0.6480000019073486, "eval_rewards/chosen": 0.7270216345787048, "eval_rewards/margins": 0.26955118775367737, "eval_rewards/rejected": 0.4574703872203827, "eval_runtime": 434.0219, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 2660 }, { "epoch": 0.7, "learning_rate": 1.2599050247215764e-06, "logits/chosen": 0.18532994389533997, "logits/rejected": 0.22819623351097107, "logps/chosen": -3182.412841796875, "logps/rejected": -2424.33154296875, "loss": 0.692, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.578414261341095, "rewards/margins": 0.08939043432474136, "rewards/rejected": 0.4890238642692566, "step": 2670 }, { "epoch": 0.7, "learning_rate": 1.2401210690746705e-06, "logits/chosen": 0.1580999791622162, "logits/rejected": 0.25759226083755493, "logps/chosen": -3189.779296875, "logps/rejected": -2857.88525390625, "loss": 0.586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8573214411735535, "rewards/margins": 0.4872228503227234, "rewards/rejected": 0.37009862065315247, "step": 2680 }, { "epoch": 0.7, "eval_logits/chosen": 0.14406052231788635, "eval_logits/rejected": 0.24052554368972778, "eval_logps/chosen": -3153.736083984375, "eval_logps/rejected": -2689.15673828125, "eval_loss": 0.6468480229377747, "eval_rewards/accuracies": 0.652999997138977, "eval_rewards/chosen": 0.9119713306427002, "eval_rewards/margins": 0.2973487079143524, "eval_rewards/rejected": 0.6146225929260254, "eval_runtime": 431.3601, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 2680 }, { "epoch": 0.7, "learning_rate": 1.2204423129559306e-06, "logits/chosen": 0.13271962106227875, "logits/rejected": 0.15295439958572388, "logps/chosen": -2891.67919921875, "logps/rejected": -2917.93017578125, "loss": 0.635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9208828806877136, "rewards/margins": 0.2186553180217743, "rewards/rejected": 0.7022275924682617, "step": 2690 }, { "epoch": 0.71, "learning_rate": 1.20087039953583e-06, "logits/chosen": 0.058627206832170486, "logits/rejected": 0.1339290887117386, "logps/chosen": -3518.30078125, "logps/rejected": -3098.26123046875, "loss": 0.7133, "rewards/accuracies": 0.625, "rewards/chosen": 1.1940460205078125, "rewards/margins": 0.32826152443885803, "rewards/rejected": 0.8657845258712769, "step": 2700 }, { "epoch": 0.71, "eval_logits/chosen": 0.1499045342206955, "eval_logits/rejected": 0.24650132656097412, "eval_logps/chosen": -3154.766845703125, "eval_logps/rejected": -2690.427490234375, "eval_loss": 0.6477073431015015, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 0.9016657471656799, "eval_rewards/margins": 0.299747496843338, "eval_rewards/rejected": 0.6019183397293091, "eval_runtime": 435.4961, "eval_samples_per_second": 4.592, "eval_steps_per_second": 1.148, "step": 2700 }, { "epoch": 0.71, "learning_rate": 1.181406963063507e-06, "logits/chosen": 0.15638697147369385, "logits/rejected": 0.15356355905532837, "logps/chosen": -2875.93603515625, "logps/rejected": -2597.21142578125, "loss": 0.6257, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.7270675897598267, "rewards/margins": 0.2222127616405487, "rewards/rejected": 0.5048547983169556, "step": 2710 }, { "epoch": 0.71, "learning_rate": 1.1620536287303052e-06, "logits/chosen": 0.0035457522608339787, "logits/rejected": 0.10971565544605255, "logps/chosen": -3293.54931640625, "logps/rejected": -2725.47412109375, "loss": 0.6203, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0205591917037964, "rewards/margins": 0.3048398196697235, "rewards/rejected": 0.71571946144104, "step": 2720 }, { "epoch": 0.71, "eval_logits/chosen": 0.10649571567773819, "eval_logits/rejected": 0.2042938470840454, "eval_logps/chosen": -3130.583251953125, "eval_logps/rejected": -2669.188720703125, "eval_loss": 0.6452856063842773, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": 1.1434990167617798, "eval_rewards/margins": 0.32919323444366455, "eval_rewards/rejected": 0.8143056631088257, "eval_runtime": 431.0718, "eval_samples_per_second": 4.64, "eval_steps_per_second": 1.16, "step": 2720 }, { "epoch": 0.71, "learning_rate": 1.1428120125340717e-06, "logits/chosen": 0.08702695369720459, "logits/rejected": 0.15740936994552612, "logps/chosen": -3139.21923828125, "logps/rejected": -2695.92138671875, "loss": 0.6478, "rewards/accuracies": 0.6875, "rewards/chosen": 1.1724882125854492, "rewards/margins": 0.39453238248825073, "rewards/rejected": 0.7779557108879089, "step": 2730 }, { "epoch": 0.72, "learning_rate": 1.123683721144223e-06, "logits/chosen": 0.0618269257247448, "logits/rejected": 0.19205117225646973, "logps/chosen": -3309.751953125, "logps/rejected": -2718.497314453125, "loss": 0.6403, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2841042280197144, "rewards/margins": 0.42666906118392944, "rewards/rejected": 0.8574352264404297, "step": 2740 }, { "epoch": 0.72, "eval_logits/chosen": 0.10141293704509735, "eval_logits/rejected": 0.19877731800079346, "eval_logps/chosen": -3128.739013671875, "eval_logps/rejected": -2667.4482421875, "eval_loss": 0.6447215676307678, "eval_rewards/accuracies": 0.6600000262260437, "eval_rewards/chosen": 1.1619406938552856, "eval_rewards/margins": 0.3302310109138489, "eval_rewards/rejected": 0.8317096829414368, "eval_runtime": 434.204, "eval_samples_per_second": 4.606, "eval_steps_per_second": 1.152, "step": 2740 }, { "epoch": 0.72, "learning_rate": 1.1046703517675848e-06, "logits/chosen": 0.11646068096160889, "logits/rejected": 0.1773780733346939, "logps/chosen": -2655.95751953125, "logps/rejected": -2377.68017578125, "loss": 0.6858, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.9130075573921204, "rewards/margins": 0.1356573849916458, "rewards/rejected": 0.7773502469062805, "step": 2750 }, { "epoch": 0.72, "learning_rate": 1.085773492015028e-06, "logits/chosen": 0.025050807744264603, "logits/rejected": 0.09588897973299026, "logps/chosen": -3217.234375, "logps/rejected": -2751.618408203125, "loss": 0.6562, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3348236083984375, "rewards/margins": 0.20918186008930206, "rewards/rejected": 1.1256418228149414, "step": 2760 }, { "epoch": 0.72, "eval_logits/chosen": 0.07714089751243591, "eval_logits/rejected": 0.17499929666519165, "eval_logps/chosen": -3117.67724609375, "eval_logps/rejected": -2657.104736328125, "eval_loss": 0.6439529061317444, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 1.2725580930709839, "eval_rewards/margins": 0.3374113142490387, "eval_rewards/rejected": 0.9351468086242676, "eval_runtime": 433.1111, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.154, "step": 2760 }, { "epoch": 0.72, "learning_rate": 1.0669947197689034e-06, "logits/chosen": 0.09191405773162842, "logits/rejected": 0.07995611429214478, "logps/chosen": -2935.451904296875, "logps/rejected": -2722.47998046875, "loss": 0.7016, "rewards/accuracies": 0.625, "rewards/chosen": 1.0145564079284668, "rewards/margins": 0.216527059674263, "rewards/rejected": 0.7980293035507202, "step": 2770 }, { "epoch": 0.73, "learning_rate": 1.048335603051291e-06, "logits/chosen": 0.11697208881378174, "logits/rejected": 0.2727583944797516, "logps/chosen": -3261.263916015625, "logps/rejected": -2374.05322265625, "loss": 0.6216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.1328293085098267, "rewards/margins": 0.4422016143798828, "rewards/rejected": 0.6906276941299438, "step": 2780 }, { "epoch": 0.73, "eval_logits/chosen": 0.10101441293954849, "eval_logits/rejected": 0.19839319586753845, "eval_logps/chosen": -3130.215087890625, "eval_logps/rejected": -2667.90966796875, "eval_loss": 0.6433055400848389, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": 1.1471830606460571, "eval_rewards/margins": 0.32008832693099976, "eval_rewards/rejected": 0.8270947933197021, "eval_runtime": 433.033, "eval_samples_per_second": 4.619, "eval_steps_per_second": 1.155, "step": 2780 }, { "epoch": 0.73, "learning_rate": 1.0297976998930665e-06, "logits/chosen": 0.0809980034828186, "logits/rejected": 0.12898331880569458, "logps/chosen": -3059.52197265625, "logps/rejected": -2614.6015625, "loss": 0.6372, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0522754192352295, "rewards/margins": 0.2133600264787674, "rewards/rejected": 0.8389154672622681, "step": 2790 }, { "epoch": 0.73, "learning_rate": 1.0113825582038078e-06, "logits/chosen": 0.14957372844219208, "logits/rejected": 0.2786753177642822, "logps/chosen": -2827.08349609375, "logps/rejected": -2211.76416015625, "loss": 0.6439, "rewards/accuracies": 0.625, "rewards/chosen": 0.9180396199226379, "rewards/margins": 0.31394582986831665, "rewards/rejected": 0.6040938496589661, "step": 2800 }, { "epoch": 0.73, "eval_logits/chosen": 0.10539400577545166, "eval_logits/rejected": 0.20312856137752533, "eval_logps/chosen": -3129.9345703125, "eval_logps/rejected": -2667.8798828125, "eval_loss": 0.6433684825897217, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.1499861478805542, "eval_rewards/margins": 0.32259252667427063, "eval_rewards/rejected": 0.8273937106132507, "eval_runtime": 434.7462, "eval_samples_per_second": 4.6, "eval_steps_per_second": 1.15, "step": 2800 }, { "epoch": 0.74, "learning_rate": 9.930917156425477e-07, "logits/chosen": 0.07070942968130112, "logits/rejected": 0.17799636721611023, "logps/chosen": -3306.61328125, "logps/rejected": -2676.35205078125, "loss": 0.6319, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.192214012145996, "rewards/margins": 0.3723980784416199, "rewards/rejected": 0.8198158144950867, "step": 2810 }, { "epoch": 0.74, "learning_rate": 9.749266994893756e-07, "logits/chosen": 0.15602955222129822, "logits/rejected": 0.1746487021446228, "logps/chosen": -3154.74755859375, "logps/rejected": -2695.767578125, "loss": 0.6545, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.1762325763702393, "rewards/margins": 0.20391640067100525, "rewards/rejected": 0.9723161458969116, "step": 2820 }, { "epoch": 0.74, "eval_logits/chosen": 0.08535545319318771, "eval_logits/rejected": 0.18399646878242493, "eval_logps/chosen": -3117.564453125, "eval_logps/rejected": -2657.365966796875, "eval_loss": 0.6443636417388916, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": 1.2736886739730835, "eval_rewards/margins": 0.3411538898944855, "eval_rewards/rejected": 0.93253493309021, "eval_runtime": 430.782, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 2820 }, { "epoch": 0.74, "learning_rate": 9.56889026517913e-07, "logits/chosen": 0.13604894280433655, "logits/rejected": 0.1604180783033371, "logps/chosen": -3224.18212890625, "logps/rejected": -2598.81201171875, "loss": 0.6266, "rewards/accuracies": 0.625, "rewards/chosen": 1.242482304573059, "rewards/margins": 0.2726319432258606, "rewards/rejected": 0.9698503613471985, "step": 2830 }, { "epoch": 0.74, "learning_rate": 9.389802028686617e-07, "logits/chosen": 0.167219340801239, "logits/rejected": 0.29541537165641785, "logps/chosen": -3496.32177734375, "logps/rejected": -2661.04150390625, "loss": 0.5712, "rewards/accuracies": 0.6875, "rewards/chosen": 1.5568902492523193, "rewards/margins": 0.5824890732765198, "rewards/rejected": 0.9744011759757996, "step": 2840 }, { "epoch": 0.74, "eval_logits/chosen": 0.07399701327085495, "eval_logits/rejected": 0.17377108335494995, "eval_logps/chosen": -3113.695068359375, "eval_logps/rejected": -2653.9677734375, "eval_loss": 0.644152820110321, "eval_rewards/accuracies": 0.6589999794960022, "eval_rewards/chosen": 1.3123818635940552, "eval_rewards/margins": 0.3458673059940338, "eval_rewards/rejected": 0.966514527797699, "eval_runtime": 434.757, "eval_samples_per_second": 4.6, "eval_steps_per_second": 1.15, "step": 2840 }, { "epoch": 0.75, "learning_rate": 9.212017239232427e-07, "logits/chosen": 0.06422942131757736, "logits/rejected": 0.12784716486930847, "logps/chosen": -3029.268798828125, "logps/rejected": -2473.47509765625, "loss": 0.6018, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1396443843841553, "rewards/margins": 0.338540256023407, "rewards/rejected": 0.8011040687561035, "step": 2850 }, { "epoch": 0.75, "learning_rate": 9.03555074179533e-07, "logits/chosen": 0.09111222624778748, "logits/rejected": 0.11049652099609375, "logps/chosen": -3043.90234375, "logps/rejected": -2983.20556640625, "loss": 0.6623, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 1.3758527040481567, "rewards/margins": 0.2255462110042572, "rewards/rejected": 1.1503065824508667, "step": 2860 }, { "epoch": 0.75, "eval_logits/chosen": 0.07583687454462051, "eval_logits/rejected": 0.1759239137172699, "eval_logps/chosen": -3116.11181640625, "eval_logps/rejected": -2656.0341796875, "eval_loss": 0.6435379385948181, "eval_rewards/accuracies": 0.6589999794960022, "eval_rewards/chosen": 1.2882167100906372, "eval_rewards/margins": 0.3423665165901184, "eval_rewards/rejected": 0.945850133895874, "eval_runtime": 431.7893, "eval_samples_per_second": 4.632, "eval_steps_per_second": 1.158, "step": 2860 }, { "epoch": 0.75, "learning_rate": 8.860417271277067e-07, "logits/chosen": 0.14327311515808105, "logits/rejected": 0.2375190556049347, "logps/chosen": -2888.80615234375, "logps/rejected": -2453.49560546875, "loss": 0.6552, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0584075450897217, "rewards/margins": 0.33762770891189575, "rewards/rejected": 0.7207797765731812, "step": 2870 }, { "epoch": 0.75, "learning_rate": 8.686631451272029e-07, "logits/chosen": 0.17280824482440948, "logits/rejected": 0.28266793489456177, "logps/chosen": -3064.078857421875, "logps/rejected": -2512.33056640625, "loss": 0.6491, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.3165113925933838, "rewards/margins": 0.34719833731651306, "rewards/rejected": 0.9693130254745483, "step": 2880 }, { "epoch": 0.75, "eval_logits/chosen": 0.10873594880104065, "eval_logits/rejected": 0.20860151946544647, "eval_logps/chosen": -3138.173583984375, "eval_logps/rejected": -2675.222412109375, "eval_loss": 0.6429147720336914, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.067596673965454, "eval_rewards/margins": 0.31363165378570557, "eval_rewards/rejected": 0.7539650201797485, "eval_runtime": 433.5824, "eval_samples_per_second": 4.613, "eval_steps_per_second": 1.153, "step": 2880 }, { "epoch": 0.76, "learning_rate": 8.514207792846168e-07, "logits/chosen": 0.10971547663211823, "logits/rejected": 0.1789051592350006, "logps/chosen": -2838.19384765625, "logps/rejected": -2569.947265625, "loss": 0.6298, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.0143368244171143, "rewards/margins": 0.22484783828258514, "rewards/rejected": 0.7894889712333679, "step": 2890 }, { "epoch": 0.76, "learning_rate": 8.343160693325356e-07, "logits/chosen": 0.1836162805557251, "logits/rejected": 0.2654489576816559, "logps/chosen": -2718.9228515625, "logps/rejected": -2594.938720703125, "loss": 0.6316, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8563119769096375, "rewards/margins": 0.402261346578598, "rewards/rejected": 0.4540507197380066, "step": 2900 }, { "epoch": 0.76, "eval_logits/chosen": 0.13328304886817932, "eval_logits/rejected": 0.23238782584667206, "eval_logps/chosen": -3153.5068359375, "eval_logps/rejected": -2688.78271484375, "eval_loss": 0.6444342136383057, "eval_rewards/accuracies": 0.656000018119812, "eval_rewards/chosen": 0.9142661094665527, "eval_rewards/margins": 0.29590094089508057, "eval_rewards/rejected": 0.6183652281761169, "eval_runtime": 434.883, "eval_samples_per_second": 4.599, "eval_steps_per_second": 1.15, "step": 2900 }, { "epoch": 0.76, "learning_rate": 8.173504435093174e-07, "logits/chosen": 0.17675557732582092, "logits/rejected": 0.24916550517082214, "logps/chosen": -2998.46826171875, "logps/rejected": -2320.257568359375, "loss": 0.5963, "rewards/accuracies": 0.75, "rewards/chosen": 1.0272307395935059, "rewards/margins": 0.5231108069419861, "rewards/rejected": 0.5041199326515198, "step": 2910 }, { "epoch": 0.76, "learning_rate": 8.00525318439836e-07, "logits/chosen": 0.15026502311229706, "logits/rejected": 0.19382700324058533, "logps/chosen": -2837.82568359375, "logps/rejected": -2428.755615234375, "loss": 0.6851, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8253390192985535, "rewards/margins": 0.17960809171199799, "rewards/rejected": 0.6457308530807495, "step": 2920 }, { "epoch": 0.76, "eval_logits/chosen": 0.12329066544771194, "eval_logits/rejected": 0.22322072088718414, "eval_logps/chosen": -3146.34912109375, "eval_logps/rejected": -2683.052978515625, "eval_loss": 0.6433141827583313, "eval_rewards/accuracies": 0.6549999713897705, "eval_rewards/chosen": 0.9858420491218567, "eval_rewards/margins": 0.31017976999282837, "eval_rewards/rejected": 0.6756622195243835, "eval_runtime": 431.7202, "eval_samples_per_second": 4.633, "eval_steps_per_second": 1.158, "step": 2920 }, { "epoch": 0.77, "learning_rate": 7.838420990171927e-07, "logits/chosen": 0.14905816316604614, "logits/rejected": 0.28530722856521606, "logps/chosen": -3198.9931640625, "logps/rejected": -2699.28662109375, "loss": 0.6377, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.105789303779602, "rewards/margins": 0.33509576320648193, "rewards/rejected": 0.7706934809684753, "step": 2930 }, { "epoch": 0.77, "learning_rate": 7.673021782854084e-07, "logits/chosen": 0.04495352879166603, "logits/rejected": 0.13894930481910706, "logps/chosen": -3583.732421875, "logps/rejected": -2903.57861328125, "loss": 0.6261, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.2249748706817627, "rewards/margins": 0.36022740602493286, "rewards/rejected": 0.8647474050521851, "step": 2940 }, { "epoch": 0.77, "eval_logits/chosen": 0.1108195036649704, "eval_logits/rejected": 0.21030418574810028, "eval_logps/chosen": -3135.825927734375, "eval_logps/rejected": -2673.878173828125, "eval_loss": 0.6436037421226501, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 1.091071605682373, "eval_rewards/margins": 0.3236630856990814, "eval_rewards/rejected": 0.7674086093902588, "eval_runtime": 435.4437, "eval_samples_per_second": 4.593, "eval_steps_per_second": 1.148, "step": 2940 }, { "epoch": 0.77, "learning_rate": 7.509069373231039e-07, "logits/chosen": 0.17480526864528656, "logits/rejected": 0.08554727584123611, "logps/chosen": -3018.408935546875, "logps/rejected": -3066.44775390625, "loss": 0.6474, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.075202226638794, "rewards/margins": 0.2256067544221878, "rewards/rejected": 0.8495955467224121, "step": 2950 }, { "epoch": 0.77, "learning_rate": 7.346577451281822e-07, "logits/chosen": 0.139469712972641, "logits/rejected": 0.25643855333328247, "logps/chosen": -3390.76611328125, "logps/rejected": -2477.589111328125, "loss": 0.591, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.2087128162384033, "rewards/margins": 0.394287645816803, "rewards/rejected": 0.8144251108169556, "step": 2960 }, { "epoch": 0.77, "eval_logits/chosen": 0.11245152354240417, "eval_logits/rejected": 0.21180951595306396, "eval_logps/chosen": -3136.499267578125, "eval_logps/rejected": -2674.64501953125, "eval_loss": 0.6434337496757507, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 1.084335446357727, "eval_rewards/margins": 0.3245924115180969, "eval_rewards/rejected": 0.7597430348396301, "eval_runtime": 431.3709, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 2960 }, { "epoch": 0.78, "learning_rate": 7.185559585035138e-07, "logits/chosen": 0.15489263832569122, "logits/rejected": 0.16186505556106567, "logps/chosen": -2839.524658203125, "logps/rejected": -2662.75146484375, "loss": 0.6112, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.20577073097229, "rewards/margins": 0.45446911454200745, "rewards/rejected": 0.7513017654418945, "step": 2970 }, { "epoch": 0.78, "learning_rate": 7.026029219436504e-07, "logits/chosen": 0.006139541510492563, "logits/rejected": 0.12576696276664734, "logps/chosen": -3261.79052734375, "logps/rejected": -2888.37939453125, "loss": 0.6719, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.0913257598876953, "rewards/margins": 0.19699665904045105, "rewards/rejected": 0.8943290710449219, "step": 2980 }, { "epoch": 0.78, "eval_logits/chosen": 0.11001601815223694, "eval_logits/rejected": 0.20900581777095795, "eval_logps/chosen": -3135.50537109375, "eval_logps/rejected": -2673.852783203125, "eval_loss": 0.6439700722694397, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.0942782163619995, "eval_rewards/margins": 0.3266145884990692, "eval_rewards/rejected": 0.7676635980606079, "eval_runtime": 434.1431, "eval_samples_per_second": 4.607, "eval_steps_per_second": 1.152, "step": 2980 }, { "epoch": 0.78, "learning_rate": 6.867999675225523e-07, "logits/chosen": 0.055107347667217255, "logits/rejected": 0.15146927535533905, "logps/chosen": -3175.4345703125, "logps/rejected": -2682.394287109375, "loss": 0.6542, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.1792373657226562, "rewards/margins": 0.3137392997741699, "rewards/rejected": 0.8654980659484863, "step": 2990 }, { "epoch": 0.79, "learning_rate": 6.711484147823663e-07, "logits/chosen": 0.013993637636303902, "logits/rejected": 0.06237439438700676, "logps/chosen": -3342.016357421875, "logps/rejected": -3156.628173828125, "loss": 0.6609, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.243975281715393, "rewards/margins": 0.20992612838745117, "rewards/rejected": 1.0340490341186523, "step": 3000 }, { "epoch": 0.79, "eval_logits/chosen": 0.11470787227153778, "eval_logits/rejected": 0.21279804408550262, "eval_logps/chosen": -3137.02294921875, "eval_logps/rejected": -2675.142333984375, "eval_loss": 0.6441584229469299, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.0791016817092896, "eval_rewards/margins": 0.3243294656276703, "eval_rewards/rejected": 0.7547722458839417, "eval_runtime": 433.2196, "eval_samples_per_second": 4.617, "eval_steps_per_second": 1.154, "step": 3000 }, { "epoch": 0.79, "learning_rate": 6.556495706232413e-07, "logits/chosen": 0.033185649663209915, "logits/rejected": 0.13457655906677246, "logps/chosen": -3349.48681640625, "logps/rejected": -2719.23388671875, "loss": 0.6297, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2869220972061157, "rewards/margins": 0.45312124490737915, "rewards/rejected": 0.8338009119033813, "step": 3010 }, { "epoch": 0.79, "learning_rate": 6.403047291942057e-07, "logits/chosen": 0.10025575011968613, "logits/rejected": 0.214462012052536, "logps/chosen": -3174.3515625, "logps/rejected": -2716.052001953125, "loss": 0.6365, "rewards/accuracies": 0.625, "rewards/chosen": 1.1797490119934082, "rewards/margins": 0.26942500472068787, "rewards/rejected": 0.9103239178657532, "step": 3020 }, { "epoch": 0.79, "eval_logits/chosen": 0.0969366729259491, "eval_logits/rejected": 0.1954101324081421, "eval_logps/chosen": -3125.75439453125, "eval_logps/rejected": -2665.18115234375, "eval_loss": 0.644593358039856, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": 1.191789150238037, "eval_rewards/margins": 0.33740758895874023, "eval_rewards/rejected": 0.8543814420700073, "eval_runtime": 431.4659, "eval_samples_per_second": 4.635, "eval_steps_per_second": 1.159, "step": 3020 }, { "epoch": 0.79, "learning_rate": 6.251151717851023e-07, "logits/chosen": 0.10123336315155029, "logits/rejected": 0.19451609253883362, "logps/chosen": -3180.0263671875, "logps/rejected": -3087.802490234375, "loss": 0.7029, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 1.2668492794036865, "rewards/margins": 0.11371113359928131, "rewards/rejected": 1.1531381607055664, "step": 3030 }, { "epoch": 0.8, "learning_rate": 6.100821667196041e-07, "logits/chosen": 0.11681978404521942, "logits/rejected": 0.22388572990894318, "logps/chosen": -3313.73876953125, "logps/rejected": -2696.520263671875, "loss": 0.6146, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.238525629043579, "rewards/margins": 0.42913132905960083, "rewards/rejected": 0.809394359588623, "step": 3040 }, { "epoch": 0.8, "eval_logits/chosen": 0.1046353131532669, "eval_logits/rejected": 0.2032736837863922, "eval_logps/chosen": -3129.448974609375, "eval_logps/rejected": -2668.28857421875, "eval_loss": 0.6440542936325073, "eval_rewards/accuracies": 0.6600000262260437, "eval_rewards/chosen": 1.1548380851745605, "eval_rewards/margins": 0.3315299153327942, "eval_rewards/rejected": 0.8233082890510559, "eval_runtime": 435.3483, "eval_samples_per_second": 4.594, "eval_steps_per_second": 1.149, "step": 3040 }, { "epoch": 0.8, "learning_rate": 5.952069692493062e-07, "logits/chosen": 0.06525509059429169, "logits/rejected": 0.13589707016944885, "logps/chosen": -3388.70361328125, "logps/rejected": -3126.42529296875, "loss": 0.663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.245054006576538, "rewards/margins": 0.20850059390068054, "rewards/rejected": 1.0365536212921143, "step": 3050 }, { "epoch": 0.8, "learning_rate": 5.80490821448918e-07, "logits/chosen": 0.07924032211303711, "logits/rejected": 0.15562908351421356, "logps/chosen": -3187.68896484375, "logps/rejected": -3115.541259765625, "loss": 0.6289, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1435292959213257, "rewards/margins": 0.2427886426448822, "rewards/rejected": 0.9007408022880554, "step": 3060 }, { "epoch": 0.8, "eval_logits/chosen": 0.12074922770261765, "eval_logits/rejected": 0.21903274953365326, "eval_logps/chosen": -3140.2470703125, "eval_logps/rejected": -2677.65576171875, "eval_loss": 0.6435455083847046, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 1.0468610525131226, "eval_rewards/margins": 0.31722480058670044, "eval_rewards/rejected": 0.7296363711357117, "eval_runtime": 430.8523, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 3060 }, { "epoch": 0.8, "learning_rate": 5.659349521125459e-07, "logits/chosen": 0.16133460402488708, "logits/rejected": 0.16121089458465576, "logps/chosen": -3174.993408203125, "logps/rejected": -3014.8642578125, "loss": 0.678, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.8593528866767883, "rewards/margins": 0.1852940320968628, "rewards/rejected": 0.6740589141845703, "step": 3070 }, { "epoch": 0.81, "learning_rate": 5.5154057665109e-07, "logits/chosen": 0.20438699424266815, "logits/rejected": 0.23765972256660461, "logps/chosen": -2979.115234375, "logps/rejected": -2638.84765625, "loss": 0.6233, "rewards/accuracies": 0.625, "rewards/chosen": 0.8604068756103516, "rewards/margins": 0.23307761549949646, "rewards/rejected": 0.6273292899131775, "step": 3080 }, { "epoch": 0.81, "eval_logits/chosen": 0.13310247659683228, "eval_logits/rejected": 0.2311699092388153, "eval_logps/chosen": -3148.380859375, "eval_logps/rejected": -2684.7822265625, "eval_loss": 0.6443056464195251, "eval_rewards/accuracies": 0.6570000052452087, "eval_rewards/chosen": 0.9655249118804932, "eval_rewards/margins": 0.3071555197238922, "eval_rewards/rejected": 0.6583693027496338, "eval_runtime": 435.1978, "eval_samples_per_second": 4.596, "eval_steps_per_second": 1.149, "step": 3080 }, { "epoch": 0.81, "learning_rate": 5.373088969907586e-07, "logits/chosen": 0.20158644020557404, "logits/rejected": 0.24684882164001465, "logps/chosen": -2855.24560546875, "logps/rejected": -2698.558837890625, "loss": 0.6291, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.9169914126396179, "rewards/margins": 0.29850396513938904, "rewards/rejected": 0.6184874773025513, "step": 3090 }, { "epoch": 0.81, "learning_rate": 5.23241101472709e-07, "logits/chosen": 0.16126073896884918, "logits/rejected": 0.2546280026435852, "logps/chosen": -3180.9599609375, "logps/rejected": -2852.08154296875, "loss": 0.5942, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.0891501903533936, "rewards/margins": 0.4536283016204834, "rewards/rejected": 0.6355219483375549, "step": 3100 }, { "epoch": 0.81, "eval_logits/chosen": 0.12147192656993866, "eval_logits/rejected": 0.22079919278621674, "eval_logps/chosen": -3139.727783203125, "eval_logps/rejected": -2677.511962890625, "eval_loss": 0.6440848112106323, "eval_rewards/accuracies": 0.6620000004768372, "eval_rewards/chosen": 1.0520540475845337, "eval_rewards/margins": 0.3209841251373291, "eval_rewards/rejected": 0.7310699224472046, "eval_runtime": 432.5195, "eval_samples_per_second": 4.624, "eval_steps_per_second": 1.156, "step": 3100 }, { "epoch": 0.81, "learning_rate": 5.09338364753818e-07, "logits/chosen": 0.05312347412109375, "logits/rejected": 0.17926687002182007, "logps/chosen": -2952.879150390625, "logps/rejected": -2260.589111328125, "loss": 0.6808, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.0430628061294556, "rewards/margins": 0.3113624155521393, "rewards/rejected": 0.7317003607749939, "step": 3110 }, { "epoch": 0.82, "learning_rate": 4.956018477086005e-07, "logits/chosen": 0.08119906485080719, "logits/rejected": 0.11619164794683456, "logps/chosen": -3140.6591796875, "logps/rejected": -2938.876220703125, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": 0.9056208729743958, "rewards/margins": 0.20703260600566864, "rewards/rejected": 0.6985882520675659, "step": 3120 }, { "epoch": 0.82, "eval_logits/chosen": 0.12069052457809448, "eval_logits/rejected": 0.21995414793491364, "eval_logps/chosen": -3138.308349609375, "eval_logps/rejected": -2676.256591796875, "eval_loss": 0.6438711881637573, "eval_rewards/accuracies": 0.6589999794960022, "eval_rewards/chosen": 1.0662508010864258, "eval_rewards/margins": 0.32262590527534485, "eval_rewards/rejected": 0.7436249852180481, "eval_runtime": 433.6217, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 3120 }, { "epoch": 0.82, "learning_rate": 4.820326973322764e-07, "logits/chosen": 0.2029900848865509, "logits/rejected": 0.30564722418785095, "logps/chosen": -3236.748046875, "logps/rejected": -2794.49169921875, "loss": 0.6663, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 1.2101401090621948, "rewards/margins": 0.24165792763233185, "rewards/rejected": 0.9684821963310242, "step": 3130 }, { "epoch": 0.82, "learning_rate": 4.686320466449981e-07, "logits/chosen": 0.18596743047237396, "logits/rejected": 0.16868725419044495, "logps/chosen": -3142.15771484375, "logps/rejected": -2987.83544921875, "loss": 0.7201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 1.0711314678192139, "rewards/margins": 0.25457140803337097, "rewards/rejected": 0.8165599703788757, "step": 3140 }, { "epoch": 0.82, "eval_logits/chosen": 0.11725565791130066, "eval_logits/rejected": 0.21697619557380676, "eval_logps/chosen": -3138.20166015625, "eval_logps/rejected": -2675.9697265625, "eval_loss": 0.6430820226669312, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.0673128366470337, "eval_rewards/margins": 0.3208179175853729, "eval_rewards/rejected": 0.7464949488639832, "eval_runtime": 434.0694, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 3140 }, { "epoch": 0.82, "learning_rate": 4.554010145972418e-07, "logits/chosen": 0.08858512341976166, "logits/rejected": 0.21292802691459656, "logps/chosen": -3150.5732421875, "logps/rejected": -2674.91064453125, "loss": 0.6572, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 1.1328978538513184, "rewards/margins": 0.32505902647972107, "rewards/rejected": 0.8078387975692749, "step": 3150 }, { "epoch": 0.83, "learning_rate": 4.4234070597637455e-07, "logits/chosen": 0.25616246461868286, "logits/rejected": 0.2481415569782257, "logps/chosen": -2742.87744140625, "logps/rejected": -2460.419189453125, "loss": 0.684, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.8944629430770874, "rewards/margins": 0.06996430456638336, "rewards/rejected": 0.8244986534118652, "step": 3160 }, { "epoch": 0.83, "eval_logits/chosen": 0.11384333670139313, "eval_logits/rejected": 0.2137880176305771, "eval_logps/chosen": -3137.109619140625, "eval_logps/rejected": -2674.922119140625, "eval_loss": 0.6429367661476135, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 1.0782382488250732, "eval_rewards/margins": 0.3212665915489197, "eval_rewards/rejected": 0.7569717168807983, "eval_runtime": 430.7431, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 3160 }, { "epoch": 0.83, "learning_rate": 4.2945221131440783e-07, "logits/chosen": 0.2089655101299286, "logits/rejected": 0.3098098337650299, "logps/chosen": -3122.196044921875, "logps/rejected": -2862.058837890625, "loss": 0.6115, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.9570213556289673, "rewards/margins": 0.3452797830104828, "rewards/rejected": 0.6117415428161621, "step": 3170 }, { "epoch": 0.83, "learning_rate": 4.167366067969381e-07, "logits/chosen": 0.07694603502750397, "logits/rejected": 0.1465863734483719, "logps/chosen": -3364.65380859375, "logps/rejected": -3150.26318359375, "loss": 0.6372, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.1156318187713623, "rewards/margins": 0.29412493109703064, "rewards/rejected": 0.8215068578720093, "step": 3180 }, { "epoch": 0.83, "eval_logits/chosen": 0.11947512626647949, "eval_logits/rejected": 0.21990032494068146, "eval_logps/chosen": -3139.81640625, "eval_logps/rejected": -2677.553466796875, "eval_loss": 0.6424374580383301, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 1.051164150238037, "eval_rewards/margins": 0.32050707936286926, "eval_rewards/rejected": 0.7306571006774902, "eval_runtime": 435.2193, "eval_samples_per_second": 4.595, "eval_steps_per_second": 1.149, "step": 3180 }, { "epoch": 0.83, "learning_rate": 4.041949541732826e-07, "logits/chosen": 0.0780300498008728, "logits/rejected": 0.12249604612588882, "logps/chosen": -3217.436767578125, "logps/rejected": -2983.2734375, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 1.0621402263641357, "rewards/margins": 0.2845233976840973, "rewards/rejected": 0.7776168584823608, "step": 3190 }, { "epoch": 0.84, "learning_rate": 3.9182830066782614e-07, "logits/chosen": 0.20067648589611053, "logits/rejected": 0.23752641677856445, "logps/chosen": -3057.96044921875, "logps/rejected": -2844.114990234375, "loss": 0.6491, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.8515130281448364, "rewards/margins": 0.16000904142856598, "rewards/rejected": 0.6915039420127869, "step": 3200 }, { "epoch": 0.84, "eval_logits/chosen": 0.13129757344722748, "eval_logits/rejected": 0.2311340868473053, "eval_logps/chosen": -3146.293212890625, "eval_logps/rejected": -2683.253173828125, "eval_loss": 0.6428888440132141, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 0.986397385597229, "eval_rewards/margins": 0.3127368986606598, "eval_rewards/rejected": 0.6736605167388916, "eval_runtime": 430.7323, "eval_samples_per_second": 4.643, "eval_steps_per_second": 1.161, "step": 3200 }, { "epoch": 0.84, "learning_rate": 3.796376788925771e-07, "logits/chosen": 0.0898682028055191, "logits/rejected": 0.23483076691627502, "logps/chosen": -3323.624267578125, "logps/rejected": -2500.621337890625, "loss": 0.6195, "rewards/accuracies": 0.6875, "rewards/chosen": 1.231750726699829, "rewards/margins": 0.43932047486305237, "rewards/rejected": 0.7924304008483887, "step": 3210 }, { "epoch": 0.84, "learning_rate": 3.676241067609465e-07, "logits/chosen": 0.1804880052804947, "logits/rejected": 0.31166332960128784, "logps/chosen": -2948.858642578125, "logps/rejected": -2542.364501953125, "loss": 0.6321, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.93202143907547, "rewards/margins": 0.3906312584877014, "rewards/rejected": 0.5413901209831238, "step": 3220 }, { "epoch": 0.84, "eval_logits/chosen": 0.11839088797569275, "eval_logits/rejected": 0.21903064846992493, "eval_logps/chosen": -3139.008056640625, "eval_logps/rejected": -2676.87890625, "eval_loss": 0.6418802738189697, "eval_rewards/accuracies": 0.6639999747276306, "eval_rewards/chosen": 1.0592516660690308, "eval_rewards/margins": 0.32184797525405884, "eval_rewards/rejected": 0.7374037504196167, "eval_runtime": 434.5001, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 3220 }, { "epoch": 0.85, "learning_rate": 3.5578858740274976e-07, "logits/chosen": 0.16984917223453522, "logits/rejected": 0.21354170143604279, "logps/chosen": -3462.376953125, "logps/rejected": -3062.519287109375, "loss": 0.6389, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.356074571609497, "rewards/margins": 0.39215773344039917, "rewards/rejected": 0.9639169573783875, "step": 3230 }, { "epoch": 0.85, "learning_rate": 3.44132109080447e-07, "logits/chosen": 0.16598071157932281, "logits/rejected": 0.22008033096790314, "logps/chosen": -2946.054443359375, "logps/rejected": -2547.639404296875, "loss": 0.6858, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.8202168345451355, "rewards/margins": 0.016638051718473434, "rewards/rejected": 0.8035787343978882, "step": 3240 }, { "epoch": 0.85, "eval_logits/chosen": 0.10809030383825302, "eval_logits/rejected": 0.20932409167289734, "eval_logps/chosen": -3133.078369140625, "eval_logps/rejected": -2671.571044921875, "eval_loss": 0.6417847275733948, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 1.1185483932495117, "eval_rewards/margins": 0.328066885471344, "eval_rewards/rejected": 0.7904813885688782, "eval_runtime": 432.3904, "eval_samples_per_second": 4.625, "eval_steps_per_second": 1.156, "step": 3240 }, { "epoch": 0.85, "learning_rate": 3.3265564510662344e-07, "logits/chosen": 0.07405012845993042, "logits/rejected": 0.09166523814201355, "logps/chosen": -3139.692626953125, "logps/rejected": -3007.415283203125, "loss": 0.6917, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 1.1080840826034546, "rewards/margins": 0.1181420311331749, "rewards/rejected": 0.9899420738220215, "step": 3250 }, { "epoch": 0.85, "learning_rate": 3.213601537627195e-07, "logits/chosen": 0.07350599765777588, "logits/rejected": 0.24852828681468964, "logps/chosen": -3038.053955078125, "logps/rejected": -2664.875244140625, "loss": 0.6487, "rewards/accuracies": 0.625, "rewards/chosen": 1.0974736213684082, "rewards/margins": 0.255354106426239, "rewards/rejected": 0.8421195149421692, "step": 3260 }, { "epoch": 0.85, "eval_logits/chosen": 0.1091584637761116, "eval_logits/rejected": 0.21017196774482727, "eval_logps/chosen": -3134.90771484375, "eval_logps/rejected": -2673.0029296875, "eval_loss": 0.6413732767105103, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 1.1002529859542847, "eval_rewards/margins": 0.324088454246521, "eval_rewards/rejected": 0.7761646509170532, "eval_runtime": 432.4483, "eval_samples_per_second": 4.625, "eval_steps_per_second": 1.156, "step": 3260 }, { "epoch": 0.86, "learning_rate": 3.1024657821901063e-07, "logits/chosen": 0.14261241257190704, "logits/rejected": 0.2448866367340088, "logps/chosen": -2551.448974609375, "logps/rejected": -2258.469970703125, "loss": 0.6416, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.9900485873222351, "rewards/margins": 0.33649030327796936, "rewards/rejected": 0.6535583734512329, "step": 3270 }, { "epoch": 0.86, "learning_rate": 2.9931584645585654e-07, "logits/chosen": 0.1322273313999176, "logits/rejected": 0.22478432953357697, "logps/chosen": -2869.87353515625, "logps/rejected": -2353.48876953125, "loss": 0.6232, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8771117925643921, "rewards/margins": 0.3413812518119812, "rewards/rejected": 0.5357305407524109, "step": 3280 }, { "epoch": 0.86, "eval_logits/chosen": 0.11526377499103546, "eval_logits/rejected": 0.21548150479793549, "eval_logps/chosen": -3136.031494140625, "eval_logps/rejected": -2674.21044921875, "eval_loss": 0.6417762637138367, "eval_rewards/accuracies": 0.6650000214576721, "eval_rewards/chosen": 1.089017629623413, "eval_rewards/margins": 0.32493001222610474, "eval_rewards/rejected": 0.7640874981880188, "eval_runtime": 434.5229, "eval_samples_per_second": 4.603, "eval_steps_per_second": 1.151, "step": 3280 }, { "epoch": 0.86, "learning_rate": 2.885688711862136e-07, "logits/chosen": 0.16889066994190216, "logits/rejected": 0.22484536468982697, "logps/chosen": -2846.7021484375, "logps/rejected": -2759.17333984375, "loss": 0.6353, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8602210879325867, "rewards/margins": 0.23440483212471008, "rewards/rejected": 0.625816285610199, "step": 3290 }, { "epoch": 0.86, "learning_rate": 2.7800654977942486e-07, "logits/chosen": 0.24448397755622864, "logits/rejected": 0.22337858378887177, "logps/chosen": -2816.523193359375, "logps/rejected": -2483.67236328125, "loss": 0.6751, "rewards/accuracies": 0.6875, "rewards/chosen": 0.9299218058586121, "rewards/margins": 0.15793947875499725, "rewards/rejected": 0.7719823122024536, "step": 3300 }, { "epoch": 0.86, "eval_logits/chosen": 0.11126742511987686, "eval_logits/rejected": 0.2115521878004074, "eval_logps/chosen": -3132.7705078125, "eval_logps/rejected": -2671.365966796875, "eval_loss": 0.6422792673110962, "eval_rewards/accuracies": 0.6690000295639038, "eval_rewards/chosen": 1.121626377105713, "eval_rewards/margins": 0.32909339666366577, "eval_rewards/rejected": 0.7925330400466919, "eval_runtime": 430.6463, "eval_samples_per_second": 4.644, "eval_steps_per_second": 1.161, "step": 3300 }, { "epoch": 0.87, "learning_rate": 2.6762976418628797e-07, "logits/chosen": 0.10731194168329239, "logits/rejected": 0.1879521906375885, "logps/chosen": -3171.45654296875, "logps/rejected": -3107.197265625, "loss": 0.5988, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.2582088708877563, "rewards/margins": 0.4580596387386322, "rewards/rejected": 0.800149142742157, "step": 3310 }, { "epoch": 0.87, "learning_rate": 2.5743938086541354e-07, "logits/chosen": 0.11037665605545044, "logits/rejected": 0.16277505457401276, "logps/chosen": -3042.420654296875, "logps/rejected": -2585.08447265625, "loss": 0.6696, "rewards/accuracies": 0.625, "rewards/chosen": 1.0343624353408813, "rewards/margins": 0.23506657779216766, "rewards/rejected": 0.7992957830429077, "step": 3320 }, { "epoch": 0.87, "eval_logits/chosen": 0.11268280446529388, "eval_logits/rejected": 0.21244128048419952, "eval_logps/chosen": -3133.55126953125, "eval_logps/rejected": -2672.0673828125, "eval_loss": 0.6420189142227173, "eval_rewards/accuracies": 0.6650000214576721, "eval_rewards/chosen": 1.1138184070587158, "eval_rewards/margins": 0.32830312848091125, "eval_rewards/rejected": 0.7855152487754822, "eval_runtime": 435.0315, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 3320 }, { "epoch": 0.87, "learning_rate": 2.4743625071087574e-07, "logits/chosen": 0.15489795804023743, "logits/rejected": 0.20360830426216125, "logps/chosen": -3383.7421875, "logps/rejected": -2936.027587890625, "loss": 0.6473, "rewards/accuracies": 0.5625, "rewards/chosen": 1.198460340499878, "rewards/margins": 0.2576596736907959, "rewards/rejected": 0.9408007860183716, "step": 3330 }, { "epoch": 0.87, "learning_rate": 2.3762120898116498e-07, "logits/chosen": 0.1508510410785675, "logits/rejected": 0.15869677066802979, "logps/chosen": -3049.195068359375, "logps/rejected": -2959.291748046875, "loss": 0.6762, "rewards/accuracies": 0.625, "rewards/chosen": 1.0120429992675781, "rewards/margins": 0.12844504415988922, "rewards/rejected": 0.8835979700088501, "step": 3340 }, { "epoch": 0.87, "eval_logits/chosen": 0.12380759418010712, "eval_logits/rejected": 0.22342368960380554, "eval_logps/chosen": -3140.6455078125, "eval_logps/rejected": -2678.20263671875, "eval_loss": 0.6417787075042725, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 1.0428744554519653, "eval_rewards/margins": 0.3187069594860077, "eval_rewards/rejected": 0.7241674661636353, "eval_runtime": 431.1987, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.16, "step": 3340 }, { "epoch": 0.88, "learning_rate": 2.2799507522944048e-07, "logits/chosen": 0.09113749116659164, "logits/rejected": 0.14742733538150787, "logps/chosen": -3390.270263671875, "logps/rejected": -3176.9375, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": 1.251697301864624, "rewards/margins": 0.2717041075229645, "rewards/rejected": 0.9799932241439819, "step": 3350 }, { "epoch": 0.88, "learning_rate": 2.1855865323510056e-07, "logits/chosen": 0.1404508650302887, "logits/rejected": 0.08529405295848846, "logps/chosen": -2861.108154296875, "logps/rejected": -2932.998779296875, "loss": 0.6431, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 1.053432822227478, "rewards/margins": 0.2687282860279083, "rewards/rejected": 0.7847046852111816, "step": 3360 }, { "epoch": 0.88, "eval_logits/chosen": 0.1333971917629242, "eval_logits/rejected": 0.23242510855197906, "eval_logps/chosen": -3146.1572265625, "eval_logps/rejected": -2682.8466796875, "eval_loss": 0.6422930955886841, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": 0.9877604842185974, "eval_rewards/margins": 0.3100367784500122, "eval_rewards/rejected": 0.6777237057685852, "eval_runtime": 433.0552, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.155, "step": 3360 }, { "epoch": 0.88, "learning_rate": 2.0931273093666575e-07, "logits/chosen": 0.2043648660182953, "logits/rejected": 0.18084268271923065, "logps/chosen": -2757.97119140625, "logps/rejected": -2589.1455078125, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.7416321039199829, "rewards/margins": 0.12895885109901428, "rewards/rejected": 0.6126731634140015, "step": 3370 }, { "epoch": 0.88, "learning_rate": 2.002580803659873e-07, "logits/chosen": 0.14425447583198547, "logits/rejected": 0.24503755569458008, "logps/chosen": -3296.520263671875, "logps/rejected": -2861.49951171875, "loss": 0.6533, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0957818031311035, "rewards/margins": 0.29859644174575806, "rewards/rejected": 0.7971853017807007, "step": 3380 }, { "epoch": 0.88, "eval_logits/chosen": 0.13685256242752075, "eval_logits/rejected": 0.23569966852664948, "eval_logps/chosen": -3148.362548828125, "eval_logps/rejected": -2684.86962890625, "eval_loss": 0.6421592235565186, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9657065272331238, "eval_rewards/margins": 0.30820992588996887, "eval_rewards/rejected": 0.6574966311454773, "eval_runtime": 433.1383, "eval_samples_per_second": 4.617, "eval_steps_per_second": 1.154, "step": 3380 }, { "epoch": 0.89, "learning_rate": 1.913954575837826e-07, "logits/chosen": 0.1676422357559204, "logits/rejected": 0.26789090037345886, "logps/chosen": -3243.557373046875, "logps/rejected": -2936.014404296875, "loss": 0.6907, "rewards/accuracies": 0.5625, "rewards/chosen": 0.9341079592704773, "rewards/margins": 0.10601285845041275, "rewards/rejected": 0.8280950784683228, "step": 3390 }, { "epoch": 0.89, "learning_rate": 1.827256026165028e-07, "logits/chosen": 0.08100247383117676, "logits/rejected": 0.11943802982568741, "logps/chosen": -3502.637939453125, "logps/rejected": -3098.47607421875, "loss": 0.6517, "rewards/accuracies": 0.625, "rewards/chosen": 1.2382854223251343, "rewards/margins": 0.3256802558898926, "rewards/rejected": 0.9126052856445312, "step": 3400 }, { "epoch": 0.89, "eval_logits/chosen": 0.13286927342414856, "eval_logits/rejected": 0.23189303278923035, "eval_logps/chosen": -3144.69091796875, "eval_logps/rejected": -2681.69287109375, "eval_loss": 0.641541600227356, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 1.0024189949035645, "eval_rewards/margins": 0.3131525218486786, "eval_rewards/rejected": 0.6892665028572083, "eval_runtime": 430.8935, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 3400 }, { "epoch": 0.89, "learning_rate": 1.7424923939454274e-07, "logits/chosen": 0.22224673628807068, "logits/rejected": 0.21275004744529724, "logps/chosen": -2959.708251953125, "logps/rejected": -2885.37841796875, "loss": 0.6669, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.9747493863105774, "rewards/margins": 0.1536032259464264, "rewards/rejected": 0.8211461901664734, "step": 3410 }, { "epoch": 0.9, "learning_rate": 1.6596707569179304e-07, "logits/chosen": 0.08265651762485504, "logits/rejected": 0.13371232151985168, "logps/chosen": -3498.89111328125, "logps/rejected": -2985.024658203125, "loss": 0.7125, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 1.1055257320404053, "rewards/margins": 0.035826317965984344, "rewards/rejected": 1.0696992874145508, "step": 3420 }, { "epoch": 0.9, "eval_logits/chosen": 0.13406240940093994, "eval_logits/rejected": 0.23270981013774872, "eval_logps/chosen": -3146.035888671875, "eval_logps/rejected": -2682.671142578125, "eval_loss": 0.6420262455940247, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": 0.9889739155769348, "eval_rewards/margins": 0.30949535965919495, "eval_rewards/rejected": 0.6794785857200623, "eval_runtime": 434.6617, "eval_samples_per_second": 4.601, "eval_steps_per_second": 1.15, "step": 3420 }, { "epoch": 0.9, "learning_rate": 1.578798030665385e-07, "logits/chosen": 0.06310279667377472, "logits/rejected": 0.1371438205242157, "logps/chosen": -2948.4287109375, "logps/rejected": -2756.453369140625, "loss": 0.6437, "rewards/accuracies": 0.625, "rewards/chosen": 0.9179822206497192, "rewards/margins": 0.18609705567359924, "rewards/rejected": 0.7318851351737976, "step": 3430 }, { "epoch": 0.9, "learning_rate": 1.499880968037165e-07, "logits/chosen": 0.2662331461906433, "logits/rejected": 0.25772660970687866, "logps/chosen": -3215.6611328125, "logps/rejected": -2962.76416015625, "loss": 0.655, "rewards/accuracies": 0.5625, "rewards/chosen": 0.8480676412582397, "rewards/margins": 0.1061033234000206, "rewards/rejected": 0.7419644594192505, "step": 3440 }, { "epoch": 0.9, "eval_logits/chosen": 0.1352957934141159, "eval_logits/rejected": 0.23386944830417633, "eval_logps/chosen": -3146.521728515625, "eval_logps/rejected": -2683.09716796875, "eval_loss": 0.6418280601501465, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.984112560749054, "eval_rewards/margins": 0.30889230966567993, "eval_rewards/rejected": 0.6752201914787292, "eval_runtime": 430.3332, "eval_samples_per_second": 4.648, "eval_steps_per_second": 1.162, "step": 3440 }, { "epoch": 0.9, "learning_rate": 1.4229261585852805e-07, "logits/chosen": 0.1342567652463913, "logits/rejected": 0.24183864891529083, "logps/chosen": -2942.876220703125, "logps/rejected": -2744.811767578125, "loss": 0.606, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.8744937777519226, "rewards/margins": 0.32828474044799805, "rewards/rejected": 0.5462090969085693, "step": 3450 }, { "epoch": 0.91, "learning_rate": 1.3479400280141886e-07, "logits/chosen": 0.168122798204422, "logits/rejected": 0.12927907705307007, "logps/chosen": -2853.358154296875, "logps/rejected": -2873.123291015625, "loss": 0.6298, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.956896185874939, "rewards/margins": 0.1745554655790329, "rewards/rejected": 0.7823408246040344, "step": 3460 }, { "epoch": 0.91, "eval_logits/chosen": 0.13761194050312042, "eval_logits/rejected": 0.2362123727798462, "eval_logps/chosen": -3148.104736328125, "eval_logps/rejected": -2684.45166015625, "eval_loss": 0.6420778632164001, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9682847857475281, "eval_rewards/margins": 0.3066103458404541, "eval_rewards/rejected": 0.661674439907074, "eval_runtime": 434.035, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 3460 }, { "epoch": 0.91, "learning_rate": 1.2749288376442044e-07, "logits/chosen": 0.12254656851291656, "logits/rejected": 0.22399628162384033, "logps/chosen": -3280.57666015625, "logps/rejected": -2838.071044921875, "loss": 0.5903, "rewards/accuracies": 0.6875, "rewards/chosen": 1.0716785192489624, "rewards/margins": 0.4430391192436218, "rewards/rejected": 0.6286393404006958, "step": 3470 }, { "epoch": 0.91, "learning_rate": 1.203898683888713e-07, "logits/chosen": 0.21893087029457092, "logits/rejected": 0.19607816636562347, "logps/chosen": -2907.127685546875, "logps/rejected": -2707.402099609375, "loss": 0.634, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.8684245944023132, "rewards/margins": 0.3170326352119446, "rewards/rejected": 0.5513919591903687, "step": 3480 }, { "epoch": 0.91, "eval_logits/chosen": 0.1375933289527893, "eval_logits/rejected": 0.2363169640302658, "eval_logps/chosen": -3148.218994140625, "eval_logps/rejected": -2684.616943359375, "eval_loss": 0.6420032978057861, "eval_rewards/accuracies": 0.6639999747276306, "eval_rewards/chosen": 0.9671412706375122, "eval_rewards/margins": 0.3071180284023285, "eval_rewards/rejected": 0.6600232124328613, "eval_runtime": 431.6204, "eval_samples_per_second": 4.634, "eval_steps_per_second": 1.158, "step": 3480 }, { "epoch": 0.91, "learning_rate": 1.1348554977451132e-07, "logits/chosen": 0.14745394885540009, "logits/rejected": 0.21867947280406952, "logps/chosen": -3186.76416015625, "logps/rejected": -2969.57275390625, "loss": 0.6453, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9377028346061707, "rewards/margins": 0.2782467007637024, "rewards/rejected": 0.6594561338424683, "step": 3490 }, { "epoch": 0.92, "learning_rate": 1.0678050442995802e-07, "logits/chosen": 0.09887160360813141, "logits/rejected": 0.15829409658908844, "logps/chosen": -2869.447021484375, "logps/rejected": -2969.70458984375, "loss": 0.6325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.842033863067627, "rewards/margins": 0.29257142543792725, "rewards/rejected": 0.5494624376296997, "step": 3500 }, { "epoch": 0.92, "eval_logits/chosen": 0.14096896350383759, "eval_logits/rejected": 0.23980994522571564, "eval_logps/chosen": -3150.32080078125, "eval_logps/rejected": -2686.537353515625, "eval_loss": 0.6422155499458313, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9461269378662109, "eval_rewards/margins": 0.3053058087825775, "eval_rewards/rejected": 0.6408212184906006, "eval_runtime": 432.2983, "eval_samples_per_second": 4.626, "eval_steps_per_second": 1.157, "step": 3500 }, { "epoch": 0.92, "learning_rate": 1.0027529222456755e-07, "logits/chosen": 0.18164488673210144, "logits/rejected": 0.20625858008861542, "logps/chosen": -2912.694580078125, "logps/rejected": -2778.01708984375, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.8258099555969238, "rewards/margins": 0.08004599809646606, "rewards/rejected": 0.745763897895813, "step": 3510 }, { "epoch": 0.92, "learning_rate": 9.397045634168766e-08, "logits/chosen": 0.06146081164479256, "logits/rejected": 0.18860608339309692, "logps/chosen": -3066.725341796875, "logps/rejected": -2404.23583984375, "loss": 0.6207, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7969748377799988, "rewards/margins": 0.2295728474855423, "rewards/rejected": 0.5674020051956177, "step": 3520 }, { "epoch": 0.92, "eval_logits/chosen": 0.14320848882198334, "eval_logits/rejected": 0.2419550120830536, "eval_logps/chosen": -3151.443359375, "eval_logps/rejected": -2687.47021484375, "eval_loss": 0.6423044800758362, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9348993301391602, "eval_rewards/margins": 0.30340930819511414, "eval_rewards/rejected": 0.6314901113510132, "eval_runtime": 433.6572, "eval_samples_per_second": 4.612, "eval_steps_per_second": 1.153, "step": 3520 }, { "epoch": 0.92, "learning_rate": 8.78665232332998e-08, "logits/chosen": 0.20460374653339386, "logits/rejected": 0.25026071071624756, "logps/chosen": -3280.63330078125, "logps/rejected": -2944.979248046875, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.946318507194519, "rewards/margins": 0.2902964949607849, "rewards/rejected": 0.6560220122337341, "step": 3530 }, { "epoch": 0.93, "learning_rate": 8.196400257606208e-08, "logits/chosen": 0.19042614102363586, "logits/rejected": 0.1725964993238449, "logps/chosen": -2849.282958984375, "logps/rejected": -3012.115234375, "loss": 0.6435, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8099976778030396, "rewards/margins": 0.16670723259449005, "rewards/rejected": 0.6432903409004211, "step": 3540 }, { "epoch": 0.93, "eval_logits/chosen": 0.143966943025589, "eval_logits/rejected": 0.24250434339046478, "eval_logps/chosen": -3152.145263671875, "eval_logps/rejected": -2688.084228515625, "eval_loss": 0.6423359513282776, "eval_rewards/accuracies": 0.6629999876022339, "eval_rewards/chosen": 0.9278807044029236, "eval_rewards/margins": 0.30252939462661743, "eval_rewards/rejected": 0.6253513097763062, "eval_runtime": 430.8737, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 3540 }, { "epoch": 0.93, "learning_rate": 7.626338722875076e-08, "logits/chosen": 0.10929808765649796, "logits/rejected": 0.19140291213989258, "logps/chosen": -3540.82275390625, "logps/rejected": -3043.31103515625, "loss": 0.6958, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9103952646255493, "rewards/margins": 0.21534967422485352, "rewards/rejected": 0.695045530796051, "step": 3550 }, { "epoch": 0.93, "learning_rate": 7.076515319110688e-08, "logits/chosen": 0.17049703001976013, "logits/rejected": 0.24311724305152893, "logps/chosen": -2963.123779296875, "logps/rejected": -2558.143310546875, "loss": 0.6271, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.8535161018371582, "rewards/margins": 0.3562083840370178, "rewards/rejected": 0.4973076283931732, "step": 3560 }, { "epoch": 0.93, "eval_logits/chosen": 0.1455048769712448, "eval_logits/rejected": 0.24423882365226746, "eval_logps/chosen": -3153.5029296875, "eval_logps/rejected": -2689.1689453125, "eval_loss": 0.6427844166755676, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9143030047416687, "eval_rewards/margins": 0.299798846244812, "eval_rewards/rejected": 0.6145041584968567, "eval_runtime": 435.2491, "eval_samples_per_second": 4.595, "eval_steps_per_second": 1.149, "step": 3560 }, { "epoch": 0.93, "learning_rate": 6.54697595640899e-08, "logits/chosen": 0.23459525406360626, "logits/rejected": 0.29453641176223755, "logps/chosen": -2793.845458984375, "logps/rejected": -2490.65380859375, "loss": 0.6598, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.6831232309341431, "rewards/margins": 0.1600319892168045, "rewards/rejected": 0.523091197013855, "step": 3570 }, { "epoch": 0.94, "learning_rate": 6.037764851154426e-08, "logits/chosen": 0.12254978716373444, "logits/rejected": 0.24803802371025085, "logps/chosen": -3540.91455078125, "logps/rejected": -2858.28759765625, "loss": 0.6405, "rewards/accuracies": 0.625, "rewards/chosen": 1.1985795497894287, "rewards/margins": 0.438490629196167, "rewards/rejected": 0.7600890398025513, "step": 3580 }, { "epoch": 0.94, "eval_logits/chosen": 0.14586107432842255, "eval_logits/rejected": 0.24473732709884644, "eval_logps/chosen": -3154.44970703125, "eval_logps/rejected": -2690.07177734375, "eval_loss": 0.6426360607147217, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9048318862915039, "eval_rewards/margins": 0.29935455322265625, "eval_rewards/rejected": 0.6054772734642029, "eval_runtime": 431.2662, "eval_samples_per_second": 4.638, "eval_steps_per_second": 1.159, "step": 3580 }, { "epoch": 0.94, "learning_rate": 5.548924522327748e-08, "logits/chosen": 0.17424562573432922, "logits/rejected": 0.28861844539642334, "logps/chosen": -3053.926025390625, "logps/rejected": -2872.008056640625, "loss": 0.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.7467017769813538, "rewards/margins": 0.18287941813468933, "rewards/rejected": 0.5638222694396973, "step": 3590 }, { "epoch": 0.94, "learning_rate": 5.0804957879556915e-08, "logits/chosen": 0.14216585457324982, "logits/rejected": 0.1937674731016159, "logps/chosen": -3065.022705078125, "logps/rejected": -2476.083251953125, "loss": 0.6822, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.8878313302993774, "rewards/margins": 0.2785571813583374, "rewards/rejected": 0.6092742085456848, "step": 3600 }, { "epoch": 0.94, "eval_logits/chosen": 0.1442757099866867, "eval_logits/rejected": 0.24282436072826385, "eval_logps/chosen": -3153.019775390625, "eval_logps/rejected": -2688.75048828125, "eval_loss": 0.6423978209495544, "eval_rewards/accuracies": 0.6610000133514404, "eval_rewards/chosen": 0.9191364049911499, "eval_rewards/margins": 0.30045053362846375, "eval_rewards/rejected": 0.6186859011650085, "eval_runtime": 433.8321, "eval_samples_per_second": 4.61, "eval_steps_per_second": 1.153, "step": 3600 }, { "epoch": 0.94, "learning_rate": 4.632517761702815e-08, "logits/chosen": 0.027212318032979965, "logits/rejected": 0.21198460459709167, "logps/chosen": -3662.328857421875, "logps/rejected": -2687.39013671875, "loss": 0.6039, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.226254940032959, "rewards/margins": 0.5234248638153076, "rewards/rejected": 0.7028301954269409, "step": 3610 }, { "epoch": 0.95, "learning_rate": 4.205027849605359e-08, "logits/chosen": 0.16220639646053314, "logits/rejected": 0.2090630978345871, "logps/chosen": -2833.3662109375, "logps/rejected": -2673.255859375, "loss": 0.6431, "rewards/accuracies": 0.625, "rewards/chosen": 0.703280508518219, "rewards/margins": 0.2885347306728363, "rewards/rejected": 0.4147458076477051, "step": 3620 }, { "epoch": 0.95, "eval_logits/chosen": 0.1428971290588379, "eval_logits/rejected": 0.2416784018278122, "eval_logps/chosen": -3151.9921875, "eval_logps/rejected": -2687.9921875, "eval_loss": 0.6422638893127441, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9294078350067139, "eval_rewards/margins": 0.30313628911972046, "eval_rewards/rejected": 0.6262715458869934, "eval_runtime": 433.2732, "eval_samples_per_second": 4.616, "eval_steps_per_second": 1.154, "step": 3620 }, { "epoch": 0.95, "learning_rate": 3.798061746947995e-08, "logits/chosen": 0.10277267545461655, "logits/rejected": 0.19308963418006897, "logps/chosen": -3161.52001953125, "logps/rejected": -2675.018310546875, "loss": 0.6479, "rewards/accuracies": 0.625, "rewards/chosen": 1.0065993070602417, "rewards/margins": 0.29847452044487, "rewards/rejected": 0.7081248164176941, "step": 3630 }, { "epoch": 0.95, "learning_rate": 3.411653435283158e-08, "logits/chosen": 0.14962689578533173, "logits/rejected": 0.1731754094362259, "logps/chosen": -3634.749267578125, "logps/rejected": -3319.98486328125, "loss": 0.6189, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.2629544734954834, "rewards/margins": 0.4409838616847992, "rewards/rejected": 0.8219706416130066, "step": 3640 }, { "epoch": 0.95, "eval_logits/chosen": 0.14218606054782867, "eval_logits/rejected": 0.24096493422985077, "eval_logps/chosen": -3151.537841796875, "eval_logps/rejected": -2687.5673828125, "eval_loss": 0.6423946619033813, "eval_rewards/accuracies": 0.6690000295639038, "eval_rewards/chosen": 0.9339535236358643, "eval_rewards/margins": 0.3034360706806183, "eval_rewards/rejected": 0.6305176019668579, "eval_runtime": 431.4073, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 3640 }, { "epoch": 0.96, "learning_rate": 3.04583517959367e-08, "logits/chosen": 0.1383778601884842, "logits/rejected": 0.1905566155910492, "logps/chosen": -3331.52734375, "logps/rejected": -2788.855224609375, "loss": 0.6325, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.9437848925590515, "rewards/margins": 0.24613627791404724, "rewards/rejected": 0.6976486444473267, "step": 3650 }, { "epoch": 0.96, "learning_rate": 2.7006375255985984e-08, "logits/chosen": 0.31125739216804504, "logits/rejected": 0.2655700147151947, "logps/chosen": -2763.419189453125, "logps/rejected": -2680.633544921875, "loss": 0.6516, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.7674001455307007, "rewards/margins": 0.2442699372768402, "rewards/rejected": 0.5231302976608276, "step": 3660 }, { "epoch": 0.96, "eval_logits/chosen": 0.1409374177455902, "eval_logits/rejected": 0.2398187816143036, "eval_logps/chosen": -3150.634521484375, "eval_logps/rejected": -2686.77392578125, "eval_loss": 0.6423558592796326, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": 0.9429846405982971, "eval_rewards/margins": 0.3045307397842407, "eval_rewards/rejected": 0.638454020023346, "eval_runtime": 435.0996, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 3660 }, { "epoch": 0.96, "learning_rate": 2.3760892972027328e-08, "logits/chosen": 0.09737087786197662, "logits/rejected": 0.15384702384471893, "logps/chosen": -3067.33837890625, "logps/rejected": -3067.926513671875, "loss": 0.6072, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8167799115180969, "rewards/margins": 0.2765643000602722, "rewards/rejected": 0.5402156114578247, "step": 3670 }, { "epoch": 0.96, "learning_rate": 2.072217594089765e-08, "logits/chosen": 0.05576412007212639, "logits/rejected": 0.1261301338672638, "logps/chosen": -3411.518310546875, "logps/rejected": -3069.054443359375, "loss": 0.6229, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 1.0969626903533936, "rewards/margins": 0.3264775276184082, "rewards/rejected": 0.7704852819442749, "step": 3680 }, { "epoch": 0.96, "eval_logits/chosen": 0.14164119958877563, "eval_logits/rejected": 0.2402096539735794, "eval_logps/chosen": -3150.943115234375, "eval_logps/rejected": -2687.004150390625, "eval_loss": 0.6422104835510254, "eval_rewards/accuracies": 0.6679999828338623, "eval_rewards/chosen": 0.9399007558822632, "eval_rewards/margins": 0.30375197529792786, "eval_rewards/rejected": 0.6361488103866577, "eval_runtime": 430.8891, "eval_samples_per_second": 4.642, "eval_steps_per_second": 1.16, "step": 3680 }, { "epoch": 0.97, "learning_rate": 1.789047789459375e-08, "logits/chosen": 0.15312783420085907, "logits/rejected": 0.20862571895122528, "logps/chosen": -2919.60498046875, "logps/rejected": -2619.73193359375, "loss": 0.6412, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.8828672170639038, "rewards/margins": 0.3349805474281311, "rewards/rejected": 0.5478867888450623, "step": 3690 }, { "epoch": 0.97, "learning_rate": 1.5266035279088708e-08, "logits/chosen": 0.15150216221809387, "logits/rejected": 0.24923443794250488, "logps/chosen": -3298.520751953125, "logps/rejected": -3067.990478515625, "loss": 0.6209, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.9211770296096802, "rewards/margins": 0.33588749170303345, "rewards/rejected": 0.5852895379066467, "step": 3700 }, { "epoch": 0.97, "eval_logits/chosen": 0.14189252257347107, "eval_logits/rejected": 0.24056430160999298, "eval_logps/chosen": -3151.036865234375, "eval_logps/rejected": -2687.092529296875, "eval_loss": 0.6424371600151062, "eval_rewards/accuracies": 0.6690000295639038, "eval_rewards/chosen": 0.9389640092849731, "eval_rewards/margins": 0.3036960959434509, "eval_rewards/rejected": 0.6352678537368774, "eval_runtime": 435.0479, "eval_samples_per_second": 4.597, "eval_steps_per_second": 1.149, "step": 3700 }, { "epoch": 0.97, "learning_rate": 1.2849067234584623e-08, "logits/chosen": 0.26942816376686096, "logits/rejected": 0.25100386142730713, "logps/chosen": -2976.239501953125, "logps/rejected": -2849.65087890625, "loss": 0.6222, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.8999488949775696, "rewards/margins": 0.326371967792511, "rewards/rejected": 0.5735768675804138, "step": 3710 }, { "epoch": 0.97, "learning_rate": 1.0639775577218625e-08, "logits/chosen": 0.15196841955184937, "logits/rejected": 0.2014082968235016, "logps/chosen": -3189.337890625, "logps/rejected": -2805.33642578125, "loss": 0.5807, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.0105528831481934, "rewards/margins": 0.31935805082321167, "rewards/rejected": 0.6911949515342712, "step": 3720 }, { "epoch": 0.97, "eval_logits/chosen": 0.14208458364009857, "eval_logits/rejected": 0.24078558385372162, "eval_logps/chosen": -3151.357666015625, "eval_logps/rejected": -2687.388427734375, "eval_loss": 0.642538845539093, "eval_rewards/accuracies": 0.6700000166893005, "eval_rewards/chosen": 0.9357544779777527, "eval_rewards/margins": 0.30344799160957336, "eval_rewards/rejected": 0.6323065161705017, "eval_runtime": 431.9454, "eval_samples_per_second": 4.63, "eval_steps_per_second": 1.158, "step": 3720 }, { "epoch": 0.98, "learning_rate": 8.638344782207486e-09, "logits/chosen": 0.1329575479030609, "logits/rejected": 0.23533578217029572, "logps/chosen": -3605.190673828125, "logps/rejected": -3222.681396484375, "loss": 0.6171, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 1.3076648712158203, "rewards/margins": 0.28768327832221985, "rewards/rejected": 1.0199816226959229, "step": 3730 }, { "epoch": 0.98, "learning_rate": 6.84494196844715e-09, "logits/chosen": 0.12342377007007599, "logits/rejected": 0.21276125311851501, "logps/chosen": -3099.862548828125, "logps/rejected": -2531.071044921875, "loss": 0.6304, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.8624950647354126, "rewards/margins": 0.2630658447742462, "rewards/rejected": 0.5994292497634888, "step": 3740 }, { "epoch": 0.98, "eval_logits/chosen": 0.14190411567687988, "eval_logits/rejected": 0.2406345158815384, "eval_logps/chosen": -3150.5283203125, "eval_logps/rejected": -2686.679443359375, "eval_loss": 0.6422514915466309, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9440484642982483, "eval_rewards/margins": 0.3046514391899109, "eval_rewards/rejected": 0.6393970847129822, "eval_runtime": 433.1101, "eval_samples_per_second": 4.618, "eval_steps_per_second": 1.154, "step": 3740 }, { "epoch": 0.98, "learning_rate": 5.259716884556121e-09, "logits/chosen": 0.14595457911491394, "logits/rejected": 0.25075823068618774, "logps/chosen": -3109.19580078125, "logps/rejected": -2650.501220703125, "loss": 0.6356, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.6928092241287231, "rewards/margins": 0.20054228603839874, "rewards/rejected": 0.49226704239845276, "step": 3750 }, { "epoch": 0.98, "learning_rate": 3.882801896372967e-09, "logits/chosen": 0.004992401693016291, "logits/rejected": 0.06620874255895615, "logps/chosen": -3837.669189453125, "logps/rejected": -3101.035888671875, "loss": 0.6049, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.3325315713882446, "rewards/margins": 0.43494343757629395, "rewards/rejected": 0.8975881338119507, "step": 3760 }, { "epoch": 0.98, "eval_logits/chosen": 0.1403258889913559, "eval_logits/rejected": 0.23912887275218964, "eval_logps/chosen": -3150.423828125, "eval_logps/rejected": -2686.570556640625, "eval_loss": 0.6423506736755371, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 0.9450932741165161, "eval_rewards/margins": 0.3046078383922577, "eval_rewards/rejected": 0.640485405921936, "eval_runtime": 434.0586, "eval_samples_per_second": 4.608, "eval_steps_per_second": 1.152, "step": 3760 }, { "epoch": 0.99, "learning_rate": 2.7143119759026614e-09, "logits/chosen": 0.09692613780498505, "logits/rejected": 0.19789806008338928, "logps/chosen": -3384.86181640625, "logps/rejected": -3104.510986328125, "loss": 0.6084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 1.0735002756118774, "rewards/margins": 0.3706549406051636, "rewards/rejected": 0.7028452157974243, "step": 3770 }, { "epoch": 0.99, "learning_rate": 1.754344691717591e-09, "logits/chosen": 0.22347232699394226, "logits/rejected": 0.2464027851819992, "logps/chosen": -2992.58740234375, "logps/rejected": -2852.219970703125, "loss": 0.6624, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.9263873100280762, "rewards/margins": 0.31008180975914, "rewards/rejected": 0.6163055300712585, "step": 3780 }, { "epoch": 0.99, "eval_logits/chosen": 0.14074553549289703, "eval_logits/rejected": 0.23948989808559418, "eval_logps/chosen": -3150.441162109375, "eval_logps/rejected": -2686.549072265625, "eval_loss": 0.6424062252044678, "eval_rewards/accuracies": 0.6639999747276306, "eval_rewards/chosen": 0.9449189901351929, "eval_rewards/margins": 0.30421924591064453, "eval_rewards/rejected": 0.6406996846199036, "eval_runtime": 431.0896, "eval_samples_per_second": 4.639, "eval_steps_per_second": 1.16, "step": 3780 }, { "epoch": 0.99, "learning_rate": 1.0029802008096335e-09, "logits/chosen": 0.18332697451114655, "logits/rejected": 0.21350276470184326, "logps/chosen": -2667.836669921875, "logps/rejected": -2443.216796875, "loss": 0.6471, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.6844807863235474, "rewards/margins": 0.21864625811576843, "rewards/rejected": 0.46583452820777893, "step": 3790 }, { "epoch": 0.99, "learning_rate": 4.602812418974534e-10, "logits/chosen": 0.22342488169670105, "logits/rejected": 0.2208918035030365, "logps/chosen": -2860.394775390625, "logps/rejected": -2959.43994140625, "loss": 0.6649, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.9285785555839539, "rewards/margins": 0.17109037935733795, "rewards/rejected": 0.7574881911277771, "step": 3800 }, { "epoch": 0.99, "eval_logits/chosen": 0.14151111245155334, "eval_logits/rejected": 0.24028992652893066, "eval_logps/chosen": -3150.71337890625, "eval_logps/rejected": -2686.836181640625, "eval_loss": 0.6422630548477173, "eval_rewards/accuracies": 0.6660000085830688, "eval_rewards/chosen": 0.9421964883804321, "eval_rewards/margins": 0.3043671250343323, "eval_rewards/rejected": 0.6378294229507446, "eval_runtime": 435.3836, "eval_samples_per_second": 4.594, "eval_steps_per_second": 1.148, "step": 3800 }, { "epoch": 1.0, "learning_rate": 1.2629313018819312e-10, "logits/chosen": 0.21134226024150848, "logits/rejected": 0.2608878016471863, "logps/chosen": -2880.048095703125, "logps/rejected": -2482.66748046875, "loss": 0.6818, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.7530244588851929, "rewards/margins": 0.0916735827922821, "rewards/rejected": 0.6613508462905884, "step": 3810 }, { "epoch": 1.0, "learning_rate": 1.0437535929996855e-12, "logits/chosen": 0.2175508290529251, "logits/rejected": 0.18654941022396088, "logps/chosen": -3237.44677734375, "logps/rejected": -2925.17919921875, "loss": 0.638, "rewards/accuracies": 0.5625, "rewards/chosen": 0.9160250425338745, "rewards/margins": 0.291111558675766, "rewards/rejected": 0.6249134540557861, "step": 3820 }, { "epoch": 1.0, "eval_logits/chosen": 0.14104244112968445, "eval_logits/rejected": 0.23967020213603973, "eval_logps/chosen": -3150.4404296875, "eval_logps/rejected": -2686.593505859375, "eval_loss": 0.642267107963562, "eval_rewards/accuracies": 0.6669999957084656, "eval_rewards/chosen": 0.9449259042739868, "eval_rewards/margins": 0.3046669363975525, "eval_rewards/rejected": 0.6402589082717896, "eval_runtime": 431.3932, "eval_samples_per_second": 4.636, "eval_steps_per_second": 1.159, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6582325137556925, "train_runtime": 111789.0929, "train_samples_per_second": 0.547, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }