diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7646 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.6665765102161667, + "learning_rate": 1.3054830287206266e-09, + "logits/chosen": -2.7590973377227783, + "logits/rejected": -2.847461462020874, + "logps/chosen": -183.89276123046875, + "logps/rejected": -240.56399536132812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.8575756438551951, + "learning_rate": 1.3054830287206264e-08, + "logits/chosen": -2.8650596141815186, + "logits/rejected": -2.741213798522949, + "logps/chosen": -287.0735778808594, + "logps/rejected": -190.1938934326172, + "loss": 0.6932, + "rewards/accuracies": 0.0833333358168602, + "rewards/chosen": -0.00012776268704328686, + "rewards/margins": -0.0002777025511022657, + "rewards/margins_max": 0.0002956644748337567, + "rewards/margins_min": -0.0011607869528234005, + "rewards/margins_std": 0.0006434161914512515, + "rewards/rejected": 0.00014993984950706363, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 1.9415081772633282, + "learning_rate": 2.610966057441253e-08, + "logits/chosen": -2.903657913208008, + "logits/rejected": -2.836289882659912, + "logps/chosen": -350.173095703125, + "logps/rejected": -270.01080322265625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0004818691813852638, + "rewards/margins": 0.0004207765741739422, + "rewards/margins_max": 0.00469308253377676, + "rewards/margins_min": -0.003478027181699872, + "rewards/margins_std": 0.0036645419895648956, + "rewards/rejected": 6.109262903919443e-05, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 2.1991873107072264, + "learning_rate": 3.91644908616188e-08, + "logits/chosen": -2.8335931301116943, + "logits/rejected": -2.8483452796936035, + "logps/chosen": -251.32809448242188, + "logps/rejected": -251.9710235595703, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00031053851125761867, + "rewards/margins": 0.000631550035905093, + "rewards/margins_max": 0.0033597375731915236, + "rewards/margins_min": -0.0023628503549844027, + "rewards/margins_std": 0.0026020309887826443, + "rewards/rejected": -0.0003210115246474743, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 2.0605644623064605, + "learning_rate": 5.221932114882506e-08, + "logits/chosen": -2.8021039962768555, + "logits/rejected": -2.8041481971740723, + "logps/chosen": -225.26821899414062, + "logps/rejected": -243.1289520263672, + "loss": 0.6933, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00025035307044163346, + "rewards/margins": 5.1289807743160054e-05, + "rewards/margins_max": 0.001968192169442773, + "rewards/margins_min": -0.002292071934789419, + "rewards/margins_std": 0.0019450311083346605, + "rewards/rejected": -0.000301642925478518, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 2.093204028556121, + "learning_rate": 6.527415143603133e-08, + "logits/chosen": -2.9541964530944824, + "logits/rejected": -2.9155194759368896, + "logps/chosen": -341.29949951171875, + "logps/rejected": -307.144287109375, + "loss": 0.6931, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00034787936601787806, + "rewards/margins": 0.0003529744572006166, + "rewards/margins_max": 0.003089633770287037, + "rewards/margins_min": -0.0021128272637724876, + "rewards/margins_std": 0.0023237646091729403, + "rewards/rejected": -5.094980679132277e-06, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 1.9463503110658755, + "learning_rate": 7.83289817232376e-08, + "logits/chosen": -2.7380714416503906, + "logits/rejected": -2.6864514350891113, + "logps/chosen": -247.39389038085938, + "logps/rejected": -251.6951446533203, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00020935627981089056, + "rewards/margins": 0.0005321280332282186, + "rewards/margins_max": 0.003940979018807411, + "rewards/margins_min": -0.0022508639376610518, + "rewards/margins_std": 0.0028098116163164377, + "rewards/rejected": -0.000322771753417328, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 1.624425052753792, + "learning_rate": 9.138381201044386e-08, + "logits/chosen": -2.8518402576446533, + "logits/rejected": -2.8149445056915283, + "logps/chosen": -260.69287109375, + "logps/rejected": -244.67037963867188, + "loss": 0.6929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00042275115265510976, + "rewards/margins": 0.0011347316903993487, + "rewards/margins_max": 0.004627277608960867, + "rewards/margins_min": -0.0014453496551141143, + "rewards/margins_std": 0.0027524891775101423, + "rewards/rejected": -0.0007119806250557303, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 2.4924587951131536, + "learning_rate": 1.0443864229765012e-07, + "logits/chosen": -2.7476470470428467, + "logits/rejected": -2.777383327484131, + "logps/chosen": -297.1468505859375, + "logps/rejected": -234.67489624023438, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00037956429878249764, + "rewards/margins": 0.000978610711172223, + "rewards/margins_max": 0.0038528472650796175, + "rewards/margins_min": -0.0018364314455538988, + "rewards/margins_std": 0.0025692558847367764, + "rewards/rejected": -0.0005990464123897254, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 1.804530611691274, + "learning_rate": 1.174934725848564e-07, + "logits/chosen": -2.9625868797302246, + "logits/rejected": -2.96667742729187, + "logps/chosen": -356.4012756347656, + "logps/rejected": -324.1001281738281, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0005120973801240325, + "rewards/margins": 0.0006280826637521386, + "rewards/margins_max": 0.00523213529959321, + "rewards/margins_min": -0.003726007416844368, + "rewards/margins_std": 0.0037922263145446777, + "rewards/rejected": -0.0001159853782155551, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 2.121390137592622, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.704665422439575, + "logits/rejected": -2.6805214881896973, + "logps/chosen": -298.2516784667969, + "logps/rejected": -227.35568237304688, + "loss": 0.6929, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00021588514209724963, + "rewards/margins": 0.0008079448016360402, + "rewards/margins_max": 0.004152725450694561, + "rewards/margins_min": -0.0023785457015037537, + "rewards/margins_std": 0.002924787113443017, + "rewards/rejected": -0.0010238299146294594, + "step": 100 + }, + { + "epoch": 0.03, + "eval_logits/chosen": -2.820028305053711, + "eval_logits/rejected": -2.7819433212280273, + "eval_logps/chosen": -284.4885559082031, + "eval_logps/rejected": -262.1841125488281, + "eval_loss": 0.6930052638053894, + "eval_rewards/accuracies": 0.5376983880996704, + "eval_rewards/chosen": 6.330771429929882e-05, + "eval_rewards/margins": 0.0004110113950446248, + "eval_rewards/margins_max": 0.005395225249230862, + "eval_rewards/margins_min": -0.004149184096604586, + "eval_rewards/margins_std": 0.0031532698776572943, + "eval_rewards/rejected": -0.00034770366619341075, + "eval_runtime": 390.7172, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 0.161, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 2.1442806465740376, + "learning_rate": 1.4360313315926893e-07, + "logits/chosen": -2.827916145324707, + "logits/rejected": -2.814990282058716, + "logps/chosen": -266.7802734375, + "logps/rejected": -263.96221923828125, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0003445649635978043, + "rewards/margins": 0.0005720595945604146, + "rewards/margins_max": 0.003972175531089306, + "rewards/margins_min": -0.0030974920373409986, + "rewards/margins_std": 0.003098450368270278, + "rewards/rejected": -0.00022749471827410161, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 2.1173282590576274, + "learning_rate": 1.566579634464752e-07, + "logits/chosen": -2.818742275238037, + "logits/rejected": -2.7966997623443604, + "logps/chosen": -248.96817016601562, + "logps/rejected": -239.61398315429688, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0006901187589392066, + "rewards/margins": 0.0009271263843402267, + "rewards/margins_max": 0.004433986730873585, + "rewards/margins_min": -0.0023641528096050024, + "rewards/margins_std": 0.0029318395536392927, + "rewards/rejected": -0.0002370075962971896, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 8.259799570233511, + "learning_rate": 1.6971279373368143e-07, + "logits/chosen": -2.780726671218872, + "logits/rejected": -2.737283229827881, + "logps/chosen": -279.51470947265625, + "logps/rejected": -393.16912841796875, + "loss": 0.6927, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 6.787038000766188e-05, + "rewards/margins": 0.0010505912359803915, + "rewards/margins_max": 0.005779094062745571, + "rewards/margins_min": -0.003547274973243475, + "rewards/margins_std": 0.00423091696575284, + "rewards/rejected": -0.0009827208705246449, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 1.8242350869601038, + "learning_rate": 1.8276762402088773e-07, + "logits/chosen": -2.7619147300720215, + "logits/rejected": -2.7275538444519043, + "logps/chosen": -234.1826629638672, + "logps/rejected": -217.1774139404297, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0001320727460552007, + "rewards/margins": 0.00024431064957752824, + "rewards/margins_max": 0.003625961020588875, + "rewards/margins_min": -0.0029167174361646175, + "rewards/margins_std": 0.00299251195974648, + "rewards/rejected": -0.000112237932626158, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 1.843052230955893, + "learning_rate": 1.95822454308094e-07, + "logits/chosen": -2.777858257293701, + "logits/rejected": -2.7580411434173584, + "logps/chosen": -219.1260223388672, + "logps/rejected": -249.1741485595703, + "loss": 0.6929, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.00024328229483217, + "rewards/margins": 1.8283386452822015e-05, + "rewards/margins_max": 0.00345350569114089, + "rewards/margins_min": -0.003710265038534999, + "rewards/margins_std": 0.00322783924639225, + "rewards/rejected": -0.0002615656703710556, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 1.6568764995163816, + "learning_rate": 2.0887728459530023e-07, + "logits/chosen": -2.8361194133758545, + "logits/rejected": -2.827834367752075, + "logps/chosen": -264.36041259765625, + "logps/rejected": -228.36276245117188, + "loss": 0.6926, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0007215446676127613, + "rewards/margins": 0.0007531328010372818, + "rewards/margins_max": 0.0032718696165829897, + "rewards/margins_min": -0.0016916522290557623, + "rewards/margins_std": 0.0022232590708881617, + "rewards/rejected": -3.1588067940901965e-05, + "step": 160 + }, + { + "epoch": 0.04, + "grad_norm": 1.8405703266100064, + "learning_rate": 2.2193211488250652e-07, + "logits/chosen": -2.75876784324646, + "logits/rejected": -2.7373013496398926, + "logps/chosen": -245.30673217773438, + "logps/rejected": -219.68887329101562, + "loss": 0.6927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0006504602497443557, + "rewards/margins": 0.0008395075565204024, + "rewards/margins_max": 0.003791679162532091, + "rewards/margins_min": -0.002601263113319874, + "rewards/margins_std": 0.0027966652996838093, + "rewards/rejected": -0.0001890472776722163, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 1.9194899235128287, + "learning_rate": 2.349869451697128e-07, + "logits/chosen": -2.7823123931884766, + "logits/rejected": -2.7283520698547363, + "logps/chosen": -313.1502380371094, + "logps/rejected": -244.85079956054688, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00046452778042294085, + "rewards/margins": 0.0017079252284020185, + "rewards/margins_max": 0.006643104366958141, + "rewards/margins_min": -0.003109711455181241, + "rewards/margins_std": 0.004393307026475668, + "rewards/rejected": -0.0012433973606675863, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 2.122545170747038, + "learning_rate": 2.4804177545691903e-07, + "logits/chosen": -2.8908307552337646, + "logits/rejected": -2.833364486694336, + "logps/chosen": -282.91094970703125, + "logps/rejected": -212.580078125, + "loss": 0.6923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00046637243940494955, + "rewards/margins": 0.0011371548753231764, + "rewards/margins_max": 0.006242834962904453, + "rewards/margins_min": -0.005177261307835579, + "rewards/margins_std": 0.005070381797850132, + "rewards/rejected": -0.0006707825814373791, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 1.9905585117704374, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.8145833015441895, + "logits/rejected": -2.761857271194458, + "logps/chosen": -322.8946228027344, + "logps/rejected": -255.11563110351562, + "loss": 0.6922, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.001599178067408502, + "rewards/margins": 0.0023355488665401936, + "rewards/margins_max": 0.006906877271831036, + "rewards/margins_min": -0.0019176024943590164, + "rewards/margins_std": 0.0039895204827189445, + "rewards/rejected": -0.0007363707991316915, + "step": 200 + }, + { + "epoch": 0.05, + "eval_logits/chosen": -2.819481134414673, + "eval_logits/rejected": -2.781402587890625, + "eval_logps/chosen": -284.4120178222656, + "eval_logps/rejected": -262.2542724609375, + "eval_loss": 0.6922732591629028, + "eval_rewards/accuracies": 0.6626983880996704, + "eval_rewards/chosen": 0.0008288321550935507, + "eval_rewards/margins": 0.0018778499215841293, + "eval_rewards/margins_max": 0.009960982948541641, + "eval_rewards/margins_min": -0.005824839696288109, + "eval_rewards/margins_std": 0.005126262549310923, + "eval_rewards/rejected": -0.0010490177664905787, + "eval_runtime": 389.9883, + "eval_samples_per_second": 5.128, + "eval_steps_per_second": 0.162, + "step": 200 + }, + { + "epoch": 0.05, + "grad_norm": 2.483358115087511, + "learning_rate": 2.7415143603133156e-07, + "logits/chosen": -2.7985758781433105, + "logits/rejected": -2.7510921955108643, + "logps/chosen": -341.04254150390625, + "logps/rejected": -275.554931640625, + "loss": 0.6922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.000645408290438354, + "rewards/margins": 0.0015350284520536661, + "rewards/margins_max": 0.007925329729914665, + "rewards/margins_min": -0.003419017419219017, + "rewards/margins_std": 0.005072223022580147, + "rewards/rejected": -0.0008896199869923294, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 1.7959485526713073, + "learning_rate": 2.8720626631853785e-07, + "logits/chosen": -2.739232063293457, + "logits/rejected": -2.682781219482422, + "logps/chosen": -265.99359130859375, + "logps/rejected": -296.9486389160156, + "loss": 0.6922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0007849860121496022, + "rewards/margins": 0.002244026865810156, + "rewards/margins_max": 0.007413020823150873, + "rewards/margins_min": -0.004091161303222179, + "rewards/margins_std": 0.0051065245643258095, + "rewards/rejected": -0.001459040679037571, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 1.894287345366052, + "learning_rate": 3.002610966057441e-07, + "logits/chosen": -2.855494976043701, + "logits/rejected": -2.8278939723968506, + "logps/chosen": -307.49322509765625, + "logps/rejected": -254.54275512695312, + "loss": 0.692, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0009105976787395775, + "rewards/margins": 0.001909045153297484, + "rewards/margins_max": 0.010586127638816833, + "rewards/margins_min": -0.007014777511358261, + "rewards/margins_std": 0.007901398465037346, + "rewards/rejected": -0.0009984474163502455, + "step": 230 + }, + { + "epoch": 0.06, + "grad_norm": 2.1418885448818967, + "learning_rate": 3.133159268929504e-07, + "logits/chosen": -2.7284979820251465, + "logits/rejected": -2.6077353954315186, + "logps/chosen": -275.98480224609375, + "logps/rejected": -225.40377807617188, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0025788077618926764, + "rewards/margins": 0.0034575846511870623, + "rewards/margins_max": 0.01471433974802494, + "rewards/margins_min": -0.004577940795570612, + "rewards/margins_std": 0.008655351586639881, + "rewards/rejected": -0.0008787767728790641, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 1.677114672002745, + "learning_rate": 3.263707571801567e-07, + "logits/chosen": -2.7854180335998535, + "logits/rejected": -2.8031558990478516, + "logps/chosen": -285.1125793457031, + "logps/rejected": -253.5863800048828, + "loss": 0.6916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.003366816323250532, + "rewards/margins": 0.0031893576961010695, + "rewards/margins_max": 0.010487152263522148, + "rewards/margins_min": -0.002120216377079487, + "rewards/margins_std": 0.00551719032227993, + "rewards/rejected": 0.00017745881632436067, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 3.289012192728762, + "learning_rate": 3.3942558746736286e-07, + "logits/chosen": -2.824139356613159, + "logits/rejected": -2.7897555828094482, + "logps/chosen": -350.8598327636719, + "logps/rejected": -325.8442687988281, + "loss": 0.6909, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0026700394228100777, + "rewards/margins": 0.006464863661676645, + "rewards/margins_max": 0.015432248823344707, + "rewards/margins_min": -0.0023544426076114178, + "rewards/margins_std": 0.00776649359613657, + "rewards/rejected": -0.003794824704527855, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 1.7982929108642927, + "learning_rate": 3.5248041775456916e-07, + "logits/chosen": -2.8094305992126465, + "logits/rejected": -2.7534804344177246, + "logps/chosen": -325.5042724609375, + "logps/rejected": -300.2442932128906, + "loss": 0.6905, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.002700726268813014, + "rewards/margins": 0.004537572618573904, + "rewards/margins_max": 0.0177843626588583, + "rewards/margins_min": -0.006224422715604305, + "rewards/margins_std": 0.01057769451290369, + "rewards/rejected": -0.0018368462333455682, + "step": 270 + }, + { + "epoch": 0.07, + "grad_norm": 1.7442643996315879, + "learning_rate": 3.6553524804177545e-07, + "logits/chosen": -2.9020028114318848, + "logits/rejected": -2.859393835067749, + "logps/chosen": -277.94287109375, + "logps/rejected": -251.18673706054688, + "loss": 0.6914, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.002671810332685709, + "rewards/margins": 0.0014847529819235206, + "rewards/margins_max": 0.01451245229691267, + "rewards/margins_min": -0.01505077164620161, + "rewards/margins_std": 0.013318007811903954, + "rewards/rejected": 0.0011870566522702575, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 1.9467189867139072, + "learning_rate": 3.785900783289817e-07, + "logits/chosen": -2.8301501274108887, + "logits/rejected": -2.784475088119507, + "logps/chosen": -300.81072998046875, + "logps/rejected": -240.54299926757812, + "loss": 0.6904, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0028155259788036346, + "rewards/margins": 0.006524374242872, + "rewards/margins_max": 0.017730673775076866, + "rewards/margins_min": -0.0032137357629835606, + "rewards/margins_std": 0.00946731399744749, + "rewards/rejected": -0.0037088487297296524, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 1.8517327854061378, + "learning_rate": 3.91644908616188e-07, + "logits/chosen": -2.896501064300537, + "logits/rejected": -2.8582100868225098, + "logps/chosen": -296.66583251953125, + "logps/rejected": -240.6426239013672, + "loss": 0.6908, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0033859715331345797, + "rewards/margins": 0.005925352685153484, + "rewards/margins_max": 0.02057039365172386, + "rewards/margins_min": -0.003685288829728961, + "rewards/margins_std": 0.010821264237165451, + "rewards/rejected": -0.0025393813848495483, + "step": 300 + }, + { + "epoch": 0.08, + "eval_logits/chosen": -2.8185482025146484, + "eval_logits/rejected": -2.7806403636932373, + "eval_logps/chosen": -284.08837890625, + "eval_logps/rejected": -262.3995056152344, + "eval_loss": 0.6903285980224609, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": 0.004065065179020166, + "eval_rewards/margins": 0.006566147785633802, + "eval_rewards/margins_max": 0.028133919462561607, + "eval_rewards/margins_min": -0.014066058211028576, + "eval_rewards/margins_std": 0.01369909942150116, + "eval_rewards/rejected": -0.002501083305105567, + "eval_runtime": 390.0525, + "eval_samples_per_second": 5.128, + "eval_steps_per_second": 0.162, + "step": 300 + }, + { + "epoch": 0.08, + "grad_norm": 2.2431798991672682, + "learning_rate": 4.046997389033943e-07, + "logits/chosen": -2.8644967079162598, + "logits/rejected": -2.821427583694458, + "logps/chosen": -287.7940368652344, + "logps/rejected": -306.9446716308594, + "loss": 0.6903, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.003703732741996646, + "rewards/margins": 0.004778963979333639, + "rewards/margins_max": 0.0203932486474514, + "rewards/margins_min": -0.007310159504413605, + "rewards/margins_std": 0.01216651126742363, + "rewards/rejected": -0.0010752308880910277, + "step": 310 + }, + { + "epoch": 0.08, + "grad_norm": 1.8937973646025752, + "learning_rate": 4.1775456919060046e-07, + "logits/chosen": -2.8774189949035645, + "logits/rejected": -2.851226806640625, + "logps/chosen": -309.1805419921875, + "logps/rejected": -266.7749938964844, + "loss": 0.6901, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.006021854467689991, + "rewards/margins": 0.00795650202780962, + "rewards/margins_max": 0.030785446986556053, + "rewards/margins_min": -0.010963315144181252, + "rewards/margins_std": 0.0191182903945446, + "rewards/rejected": -0.0019346469780430198, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 2.28689701908976, + "learning_rate": 4.3080939947780675e-07, + "logits/chosen": -2.832188606262207, + "logits/rejected": -2.77230167388916, + "logps/chosen": -286.9901428222656, + "logps/rejected": -292.3201904296875, + "loss": 0.6896, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0058955540880560875, + "rewards/margins": 0.008626717142760754, + "rewards/margins_max": 0.030670801177620888, + "rewards/margins_min": -0.007502097636461258, + "rewards/margins_std": 0.01697888970375061, + "rewards/rejected": -0.0027311635203659534, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 2.1729689662804597, + "learning_rate": 4.4386422976501305e-07, + "logits/chosen": -2.7142205238342285, + "logits/rejected": -2.766373872756958, + "logps/chosen": -323.22161865234375, + "logps/rejected": -298.8636779785156, + "loss": 0.689, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007713875267654657, + "rewards/margins": 0.012516962364315987, + "rewards/margins_max": 0.036870528012514114, + "rewards/margins_min": -0.004132881294935942, + "rewards/margins_std": 0.018929392099380493, + "rewards/rejected": -0.004803087096661329, + "step": 340 + }, + { + "epoch": 0.09, + "grad_norm": 3.6870094878258546, + "learning_rate": 4.569190600522193e-07, + "logits/chosen": -2.755967617034912, + "logits/rejected": -2.7246787548065186, + "logps/chosen": -308.46197509765625, + "logps/rejected": -234.56497192382812, + "loss": 0.6889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006695881485939026, + "rewards/margins": 0.01046693418174982, + "rewards/margins_max": 0.03676037862896919, + "rewards/margins_min": -0.012542182579636574, + "rewards/margins_std": 0.02209511585533619, + "rewards/rejected": -0.00377105176448822, + "step": 350 + }, + { + "epoch": 0.09, + "grad_norm": 2.1428634189332154, + "learning_rate": 4.699738903394256e-07, + "logits/chosen": -2.9467151165008545, + "logits/rejected": -2.923065185546875, + "logps/chosen": -376.23114013671875, + "logps/rejected": -287.07366943359375, + "loss": 0.6884, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009115653112530708, + "rewards/margins": 0.013875307515263557, + "rewards/margins_max": 0.03865843266248703, + "rewards/margins_min": -0.008211077190935612, + "rewards/margins_std": 0.02091318741440773, + "rewards/rejected": -0.004759654402732849, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 1.843026617353686, + "learning_rate": 4.830287206266319e-07, + "logits/chosen": -2.6760706901550293, + "logits/rejected": -2.7056565284729004, + "logps/chosen": -213.94680786132812, + "logps/rejected": -252.49667358398438, + "loss": 0.6889, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.002887092065066099, + "rewards/margins": 0.0059961299411952496, + "rewards/margins_max": 0.022742409259080887, + "rewards/margins_min": -0.010617375373840332, + "rewards/margins_std": 0.014525306411087513, + "rewards/rejected": -0.0031090383417904377, + "step": 370 + }, + { + "epoch": 0.1, + "grad_norm": 1.8667599495785703, + "learning_rate": 4.960835509138381e-07, + "logits/chosen": -2.7873382568359375, + "logits/rejected": -2.667421817779541, + "logps/chosen": -281.27203369140625, + "logps/rejected": -286.5188293457031, + "loss": 0.689, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.005140557885169983, + "rewards/margins": 0.010293196886777878, + "rewards/margins_max": 0.04405423626303673, + "rewards/margins_min": -0.01714705117046833, + "rewards/margins_std": 0.026976879686117172, + "rewards/rejected": -0.0051526399329304695, + "step": 380 + }, + { + "epoch": 0.1, + "grad_norm": 2.065873594806032, + "learning_rate": 4.999948856244767e-07, + "logits/chosen": -2.9125492572784424, + "logits/rejected": -2.8705246448516846, + "logps/chosen": -287.80206298828125, + "logps/rejected": -260.21356201171875, + "loss": 0.6868, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.014310446567833424, + "rewards/margins": 0.018354644998908043, + "rewards/margins_max": 0.043558765202760696, + "rewards/margins_min": -0.0034627572167664766, + "rewards/margins_std": 0.020905693992972374, + "rewards/rejected": -0.004044197034090757, + "step": 390 + }, + { + "epoch": 0.1, + "grad_norm": 1.6976705579510238, + "learning_rate": 4.999698361256577e-07, + "logits/chosen": -2.7299952507019043, + "logits/rejected": -2.7269272804260254, + "logps/chosen": -254.76498413085938, + "logps/rejected": -235.59017944335938, + "loss": 0.689, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0034802258014678955, + "rewards/margins": 0.005200815852731466, + "rewards/margins_max": 0.04349198192358017, + "rewards/margins_min": -0.02966468594968319, + "rewards/margins_std": 0.0328243263065815, + "rewards/rejected": -0.0017205901676788926, + "step": 400 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.816160202026367, + "eval_logits/rejected": -2.7783429622650146, + "eval_logps/chosen": -283.56207275390625, + "eval_logps/rejected": -262.612548828125, + "eval_loss": 0.6870259046554565, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": 0.009328610263764858, + "eval_rewards/margins": 0.013960286043584347, + "eval_rewards/margins_max": 0.058580923825502396, + "eval_rewards/margins_min": -0.02823115698993206, + "eval_rewards/margins_std": 0.028497325256466866, + "eval_rewards/rejected": -0.0046316757798194885, + "eval_runtime": 389.918, + "eval_samples_per_second": 5.129, + "eval_steps_per_second": 0.162, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 1.8548169085276736, + "learning_rate": 4.99923914217458e-07, + "logits/chosen": -2.786663770675659, + "logits/rejected": -2.678673505783081, + "logps/chosen": -318.3971252441406, + "logps/rejected": -294.6222839355469, + "loss": 0.6872, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.00912786740809679, + "rewards/margins": 0.014760250225663185, + "rewards/margins_max": 0.03773067519068718, + "rewards/margins_min": -0.007255108095705509, + "rewards/margins_std": 0.020068898797035217, + "rewards/rejected": -0.005632384214550257, + "step": 410 + }, + { + "epoch": 0.11, + "grad_norm": 1.623237681276458, + "learning_rate": 4.99857123734344e-07, + "logits/chosen": -2.7617125511169434, + "logits/rejected": -2.7409169673919678, + "logps/chosen": -270.8794250488281, + "logps/rejected": -234.03955078125, + "loss": 0.6841, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0072760209441185, + "rewards/margins": 0.015793126076459885, + "rewards/margins_max": 0.04422985762357712, + "rewards/margins_min": -0.007469795644283295, + "rewards/margins_std": 0.023148344829678535, + "rewards/rejected": -0.008517105132341385, + "step": 420 + }, + { + "epoch": 0.11, + "grad_norm": 2.1875999427852597, + "learning_rate": 4.997694702533016e-07, + "logits/chosen": -2.789379119873047, + "logits/rejected": -2.6866648197174072, + "logps/chosen": -298.89141845703125, + "logps/rejected": -222.01318359375, + "loss": 0.6824, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02044040709733963, + "rewards/margins": 0.024051448330283165, + "rewards/margins_max": 0.0603443868458271, + "rewards/margins_min": -0.014315268024802208, + "rewards/margins_std": 0.03251716122031212, + "rewards/rejected": -0.0036110393702983856, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 1.8849223491753326, + "learning_rate": 4.996609610933712e-07, + "logits/chosen": -2.847097873687744, + "logits/rejected": -2.842127799987793, + "logps/chosen": -283.3700866699219, + "logps/rejected": -261.63629150390625, + "loss": 0.6819, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01466794777661562, + "rewards/margins": 0.024427276104688644, + "rewards/margins_max": 0.06727512180805206, + "rewards/margins_min": -0.005743957124650478, + "rewards/margins_std": 0.03240900859236717, + "rewards/rejected": -0.009759325534105301, + "step": 440 + }, + { + "epoch": 0.12, + "grad_norm": 2.0591923171014925, + "learning_rate": 4.995316053150366e-07, + "logits/chosen": -2.8860111236572266, + "logits/rejected": -2.8293492794036865, + "logps/chosen": -285.3565673828125, + "logps/rejected": -234.31381225585938, + "loss": 0.6841, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.008421518839895725, + "rewards/margins": 0.010583983734250069, + "rewards/margins_max": 0.05998050421476364, + "rewards/margins_min": -0.03539139777421951, + "rewards/margins_std": 0.04224396497011185, + "rewards/rejected": -0.0021624676883220673, + "step": 450 + }, + { + "epoch": 0.12, + "grad_norm": 2.0676253243670892, + "learning_rate": 4.99381413719468e-07, + "logits/chosen": -2.7188210487365723, + "logits/rejected": -2.7170615196228027, + "logps/chosen": -250.4535369873047, + "logps/rejected": -265.24041748046875, + "loss": 0.6796, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.010399861261248589, + "rewards/margins": 0.01576301082968712, + "rewards/margins_max": 0.06852856278419495, + "rewards/margins_min": -0.025561099871993065, + "rewards/margins_std": 0.04107708856463432, + "rewards/rejected": -0.005363152362406254, + "step": 460 + }, + { + "epoch": 0.12, + "grad_norm": 1.4198008069840118, + "learning_rate": 4.992103988476205e-07, + "logits/chosen": -2.7635912895202637, + "logits/rejected": -2.829055070877075, + "logps/chosen": -285.4428405761719, + "logps/rejected": -276.16107177734375, + "loss": 0.6851, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.020185593515634537, + "rewards/margins": 0.02256001904606819, + "rewards/margins_max": 0.06617563962936401, + "rewards/margins_min": -0.02061784267425537, + "rewards/margins_std": 0.039386551827192307, + "rewards/rejected": -0.002374425530433655, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 1.5042782013912142, + "learning_rate": 4.990185749791864e-07, + "logits/chosen": -2.864286184310913, + "logits/rejected": -2.80326509475708, + "logps/chosen": -273.052001953125, + "logps/rejected": -241.15426635742188, + "loss": 0.6818, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02452642284333706, + "rewards/margins": 0.023531539365649223, + "rewards/margins_max": 0.08403842151165009, + "rewards/margins_min": -0.04219301789999008, + "rewards/margins_std": 0.05506383255124092, + "rewards/rejected": 0.0009948821971192956, + "step": 480 + }, + { + "epoch": 0.13, + "grad_norm": 1.7149327283293978, + "learning_rate": 4.988059581314039e-07, + "logits/chosen": -2.8230090141296387, + "logits/rejected": -2.8302505016326904, + "logps/chosen": -320.6971740722656, + "logps/rejected": -317.40228271484375, + "loss": 0.684, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.027586495503783226, + "rewards/margins": 0.03036860190331936, + "rewards/margins_max": 0.09689000993967056, + "rewards/margins_min": -0.027619604021310806, + "rewards/margins_std": 0.055679332464933395, + "rewards/rejected": -0.0027821047697216272, + "step": 490 + }, + { + "epoch": 0.13, + "grad_norm": 1.8419426286939293, + "learning_rate": 4.985725660577184e-07, + "logits/chosen": -2.7903518676757812, + "logits/rejected": -2.7788021564483643, + "logps/chosen": -297.08685302734375, + "logps/rejected": -247.04733276367188, + "loss": 0.6813, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0243827011436224, + "rewards/margins": 0.0315212681889534, + "rewards/margins_max": 0.10152582824230194, + "rewards/margins_min": -0.02060362510383129, + "rewards/margins_std": 0.05341911315917969, + "rewards/rejected": -0.007138565182685852, + "step": 500 + }, + { + "epoch": 0.13, + "eval_logits/chosen": -2.813218832015991, + "eval_logits/rejected": -2.775815963745117, + "eval_logps/chosen": -282.1426086425781, + "eval_logps/rejected": -262.54498291015625, + "eval_loss": 0.6812968850135803, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.023522978648543358, + "eval_rewards/margins": 0.027478953823447227, + "eval_rewards/margins_max": 0.1137406975030899, + "eval_rewards/margins_min": -0.053425367921590805, + "eval_rewards/margins_std": 0.055097389966249466, + "eval_rewards/rejected": -0.003955978900194168, + "eval_runtime": 422.0487, + "eval_samples_per_second": 4.739, + "eval_steps_per_second": 0.149, + "step": 500 + }, + { + "epoch": 0.13, + "grad_norm": 1.8186607889535364, + "learning_rate": 4.983184182463008e-07, + "logits/chosen": -2.675616502761841, + "logits/rejected": -2.704348564147949, + "logps/chosen": -289.99566650390625, + "logps/rejected": -253.94094848632812, + "loss": 0.6768, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.024030497297644615, + "rewards/margins": 0.028584983199834824, + "rewards/margins_max": 0.10755284130573273, + "rewards/margins_min": -0.03850904852151871, + "rewards/margins_std": 0.06555557996034622, + "rewards/rejected": -0.00455448916181922, + "step": 510 + }, + { + "epoch": 0.14, + "grad_norm": 1.7613794376414036, + "learning_rate": 4.980435359184203e-07, + "logits/chosen": -2.7527947425842285, + "logits/rejected": -2.665276527404785, + "logps/chosen": -272.4964294433594, + "logps/rejected": -231.20956420898438, + "loss": 0.6814, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01815834641456604, + "rewards/margins": 0.02934591844677925, + "rewards/margins_max": 0.08893508464097977, + "rewards/margins_min": -0.017313817515969276, + "rewards/margins_std": 0.04701102524995804, + "rewards/rejected": -0.011187572963535786, + "step": 520 + }, + { + "epoch": 0.14, + "grad_norm": 2.075796328408375, + "learning_rate": 4.977479420266723e-07, + "logits/chosen": -2.864997625350952, + "logits/rejected": -2.8428683280944824, + "logps/chosen": -323.49578857421875, + "logps/rejected": -292.41558837890625, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03332526609301567, + "rewards/margins": 0.03451292961835861, + "rewards/margins_max": 0.11126528680324554, + "rewards/margins_min": -0.02738284505903721, + "rewards/margins_std": 0.06441030651330948, + "rewards/rejected": -0.0011876634089276195, + "step": 530 + }, + { + "epoch": 0.14, + "grad_norm": 2.071716408391973, + "learning_rate": 4.974316612530614e-07, + "logits/chosen": -2.7868294715881348, + "logits/rejected": -2.779574394226074, + "logps/chosen": -287.04852294921875, + "logps/rejected": -250.73941040039062, + "loss": 0.6779, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.019049355760216713, + "rewards/margins": 0.014977745711803436, + "rewards/margins_max": 0.0998459979891777, + "rewards/margins_min": -0.07045114785432816, + "rewards/margins_std": 0.0745791643857956, + "rewards/rejected": 0.004071609117090702, + "step": 540 + }, + { + "epoch": 0.14, + "grad_norm": 1.8136389212374102, + "learning_rate": 4.970947200069415e-07, + "logits/chosen": -2.8890540599823, + "logits/rejected": -2.8480703830718994, + "logps/chosen": -294.9941101074219, + "logps/rejected": -267.368896484375, + "loss": 0.6788, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03156139701604843, + "rewards/margins": 0.041848182678222656, + "rewards/margins_max": 0.11787474155426025, + "rewards/margins_min": -0.03936387598514557, + "rewards/margins_std": 0.06998318433761597, + "rewards/rejected": -0.010286782868206501, + "step": 550 + }, + { + "epoch": 0.15, + "grad_norm": 1.9199011365948238, + "learning_rate": 4.967371464228095e-07, + "logits/chosen": -2.8221750259399414, + "logits/rejected": -2.8718185424804688, + "logps/chosen": -297.29278564453125, + "logps/rejected": -264.77740478515625, + "loss": 0.6787, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02653132937848568, + "rewards/margins": 0.0242681335657835, + "rewards/margins_max": 0.10515744984149933, + "rewards/margins_min": -0.0664416179060936, + "rewards/margins_std": 0.07764483988285065, + "rewards/rejected": 0.0022631962783634663, + "step": 560 + }, + { + "epoch": 0.15, + "grad_norm": 1.8023497397427022, + "learning_rate": 4.963589703579569e-07, + "logits/chosen": -2.797438144683838, + "logits/rejected": -2.757880449295044, + "logps/chosen": -270.39324951171875, + "logps/rejected": -230.986083984375, + "loss": 0.6735, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.026806339621543884, + "rewards/margins": 0.05177195742726326, + "rewards/margins_max": 0.12073322385549545, + "rewards/margins_min": -0.0232029240578413, + "rewards/margins_std": 0.06589220464229584, + "rewards/rejected": -0.024965617805719376, + "step": 570 + }, + { + "epoch": 0.15, + "grad_norm": 2.086887329101702, + "learning_rate": 4.959602233899761e-07, + "logits/chosen": -2.928542375564575, + "logits/rejected": -2.8668572902679443, + "logps/chosen": -364.71795654296875, + "logps/rejected": -268.4106750488281, + "loss": 0.673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05592746287584305, + "rewards/margins": 0.07521191984415054, + "rewards/margins_max": 0.15813851356506348, + "rewards/margins_min": -0.0054847379215061665, + "rewards/margins_std": 0.0745110958814621, + "rewards/rejected": -0.019284451380372047, + "step": 580 + }, + { + "epoch": 0.15, + "grad_norm": 1.6888933434065136, + "learning_rate": 4.955409388141243e-07, + "logits/chosen": -2.8233585357666016, + "logits/rejected": -2.790508508682251, + "logps/chosen": -257.5228576660156, + "logps/rejected": -240.89035034179688, + "loss": 0.6683, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.014462867751717567, + "rewards/margins": 0.028144676238298416, + "rewards/margins_max": 0.10829715430736542, + "rewards/margins_min": -0.05848151445388794, + "rewards/margins_std": 0.07187186181545258, + "rewards/rejected": -0.013681808486580849, + "step": 590 + }, + { + "epoch": 0.16, + "grad_norm": 1.8095757144161815, + "learning_rate": 4.951011516405429e-07, + "logits/chosen": -2.892937183380127, + "logits/rejected": -2.8614859580993652, + "logps/chosen": -377.584228515625, + "logps/rejected": -303.1300964355469, + "loss": 0.6712, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03588090091943741, + "rewards/margins": 0.03732754662632942, + "rewards/margins_max": 0.16343256831169128, + "rewards/margins_min": -0.07301119714975357, + "rewards/margins_std": 0.10263297706842422, + "rewards/rejected": -0.0014466438442468643, + "step": 600 + }, + { + "epoch": 0.16, + "eval_logits/chosen": -2.801452398300171, + "eval_logits/rejected": -2.7638142108917236, + "eval_logps/chosen": -282.4901123046875, + "eval_logps/rejected": -264.6150817871094, + "eval_loss": 0.6741740703582764, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": 0.0200479906052351, + "eval_rewards/margins": 0.04470517113804817, + "eval_rewards/margins_max": 0.18139547109603882, + "eval_rewards/margins_min": -0.0859316885471344, + "eval_rewards/margins_std": 0.08839290589094162, + "eval_rewards/rejected": -0.02465718239545822, + "eval_runtime": 390.9152, + "eval_samples_per_second": 5.116, + "eval_steps_per_second": 0.161, + "step": 600 + }, + { + "epoch": 0.16, + "grad_norm": 2.2428448607305453, + "learning_rate": 4.946408985913344e-07, + "logits/chosen": -2.8098888397216797, + "logits/rejected": -2.7577571868896484, + "logps/chosen": -341.11248779296875, + "logps/rejected": -273.4836120605469, + "loss": 0.6653, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02543538250029087, + "rewards/margins": 0.050846438854932785, + "rewards/margins_max": 0.17087194323539734, + "rewards/margins_min": -0.050321198999881744, + "rewards/margins_std": 0.10122491419315338, + "rewards/rejected": -0.02541106380522251, + "step": 610 + }, + { + "epoch": 0.16, + "grad_norm": 3.5624487354538616, + "learning_rate": 4.941602180974958e-07, + "logits/chosen": -2.8534107208251953, + "logits/rejected": -2.8060998916625977, + "logps/chosen": -267.41961669921875, + "logps/rejected": -235.6201629638672, + "loss": 0.6679, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010599276050925255, + "rewards/margins": 0.03714997321367264, + "rewards/margins_max": 0.1458701342344284, + "rewards/margins_min": -0.04852636158466339, + "rewards/margins_std": 0.08977067470550537, + "rewards/rejected": -0.04774925112724304, + "step": 620 + }, + { + "epoch": 0.16, + "grad_norm": 1.8792532681671057, + "learning_rate": 4.936591502957101e-07, + "logits/chosen": -2.8062539100646973, + "logits/rejected": -2.797257900238037, + "logps/chosen": -291.7829895019531, + "logps/rejected": -279.5833435058594, + "loss": 0.6758, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0280984528362751, + "rewards/margins": 0.05850961059331894, + "rewards/margins_max": 0.1763559877872467, + "rewards/margins_min": -0.04410520941019058, + "rewards/margins_std": 0.09619376808404922, + "rewards/rejected": -0.03041115775704384, + "step": 630 + }, + { + "epoch": 0.17, + "grad_norm": 2.385636254299802, + "learning_rate": 4.931377370249945e-07, + "logits/chosen": -2.8240973949432373, + "logits/rejected": -2.780646800994873, + "logps/chosen": -342.7118835449219, + "logps/rejected": -264.07098388671875, + "loss": 0.6625, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.026798825711011887, + "rewards/margins": 0.06868405640125275, + "rewards/margins_max": 0.1606755554676056, + "rewards/margins_min": -0.021353289484977722, + "rewards/margins_std": 0.0826662927865982, + "rewards/rejected": -0.04188523441553116, + "step": 640 + }, + { + "epoch": 0.17, + "grad_norm": 2.236591973211155, + "learning_rate": 4.925960218232072e-07, + "logits/chosen": -2.83638334274292, + "logits/rejected": -2.83496356010437, + "logps/chosen": -279.4608459472656, + "logps/rejected": -284.7081604003906, + "loss": 0.6657, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007772954646497965, + "rewards/margins": 0.05204874277114868, + "rewards/margins_max": 0.17527373135089874, + "rewards/margins_min": -0.06125093251466751, + "rewards/margins_std": 0.10625018924474716, + "rewards/rejected": -0.05982169508934021, + "step": 650 + }, + { + "epoch": 0.17, + "grad_norm": 1.8746089527521534, + "learning_rate": 4.920340499234116e-07, + "logits/chosen": -2.8171701431274414, + "logits/rejected": -2.7943077087402344, + "logps/chosen": -239.93441772460938, + "logps/rejected": -258.3723449707031, + "loss": 0.6712, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.002816335763782263, + "rewards/margins": 0.046120982617139816, + "rewards/margins_max": 0.19472922384738922, + "rewards/margins_min": -0.11324380338191986, + "rewards/margins_std": 0.1352563202381134, + "rewards/rejected": -0.04330464452505112, + "step": 660 + }, + { + "epoch": 0.18, + "grad_norm": 1.964155922354421, + "learning_rate": 4.914518682500995e-07, + "logits/chosen": -2.8140225410461426, + "logits/rejected": -2.783785343170166, + "logps/chosen": -247.772216796875, + "logps/rejected": -249.89608764648438, + "loss": 0.6717, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.002225994598120451, + "rewards/margins": 0.030118519440293312, + "rewards/margins_max": 0.1618417501449585, + "rewards/margins_min": -0.11976579576730728, + "rewards/margins_std": 0.12460299581289291, + "rewards/rejected": -0.0323445126414299, + "step": 670 + }, + { + "epoch": 0.18, + "grad_norm": 1.976125917731639, + "learning_rate": 4.90849525415273e-07, + "logits/chosen": -2.74239182472229, + "logits/rejected": -2.6956348419189453, + "logps/chosen": -324.91510009765625, + "logps/rejected": -293.3082275390625, + "loss": 0.6618, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.04486643522977829, + "rewards/margins": 0.07141599804162979, + "rewards/margins_max": 0.20019304752349854, + "rewards/margins_min": -0.05447880178689957, + "rewards/margins_std": 0.11058036983013153, + "rewards/rejected": -0.026549557223916054, + "step": 680 + }, + { + "epoch": 0.18, + "grad_norm": 1.9160157494345653, + "learning_rate": 4.902270717143858e-07, + "logits/chosen": -2.7687084674835205, + "logits/rejected": -2.714263916015625, + "logps/chosen": -283.96124267578125, + "logps/rejected": -248.7123565673828, + "loss": 0.66, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004153305198997259, + "rewards/margins": 0.08984359353780746, + "rewards/margins_max": 0.23725533485412598, + "rewards/margins_min": -0.04190779849886894, + "rewards/margins_std": 0.12452250719070435, + "rewards/rejected": -0.08569028973579407, + "step": 690 + }, + { + "epoch": 0.18, + "grad_norm": 2.067618636356554, + "learning_rate": 4.895845591221426e-07, + "logits/chosen": -2.7659411430358887, + "logits/rejected": -2.7344131469726562, + "logps/chosen": -273.3089294433594, + "logps/rejected": -265.91455078125, + "loss": 0.6643, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.009775553829967976, + "rewards/margins": 0.053518861532211304, + "rewards/margins_max": 0.24035973846912384, + "rewards/margins_min": -0.07251081615686417, + "rewards/margins_std": 0.14143694937229156, + "rewards/rejected": -0.0632944107055664, + "step": 700 + }, + { + "epoch": 0.18, + "eval_logits/chosen": -2.7925009727478027, + "eval_logits/rejected": -2.7557969093322754, + "eval_logps/chosen": -284.4590759277344, + "eval_logps/rejected": -268.8294677734375, + "eval_loss": 0.6653199195861816, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.00035800979821942747, + "eval_rewards/margins": 0.06715869158506393, + "eval_rewards/margins_max": 0.27070772647857666, + "eval_rewards/margins_min": -0.13053427636623383, + "eval_rewards/margins_std": 0.13289296627044678, + "eval_rewards/rejected": -0.06680068373680115, + "eval_runtime": 391.2896, + "eval_samples_per_second": 5.111, + "eval_steps_per_second": 0.161, + "step": 700 + }, + { + "epoch": 0.19, + "grad_norm": 1.96522703281245, + "learning_rate": 4.8892204128816e-07, + "logits/chosen": -2.820277214050293, + "logits/rejected": -2.797170400619507, + "logps/chosen": -229.0479278564453, + "logps/rejected": -195.39166259765625, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05253206938505173, + "rewards/margins": 0.010952472686767578, + "rewards/margins_max": 0.09288744628429413, + "rewards/margins_min": -0.06414759904146194, + "rewards/margins_std": 0.06775766611099243, + "rewards/rejected": -0.0634845420718193, + "step": 710 + }, + { + "epoch": 0.19, + "grad_norm": 2.35938490303437, + "learning_rate": 4.882395735324863e-07, + "logits/chosen": -2.731125831604004, + "logits/rejected": -2.678368091583252, + "logps/chosen": -367.2640686035156, + "logps/rejected": -308.25408935546875, + "loss": 0.657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.020218107849359512, + "rewards/margins": 0.07828657329082489, + "rewards/margins_max": 0.2631523609161377, + "rewards/margins_min": -0.0792718231678009, + "rewards/margins_std": 0.15117041766643524, + "rewards/rejected": -0.05806846544146538, + "step": 720 + }, + { + "epoch": 0.19, + "grad_norm": 2.289306523462492, + "learning_rate": 4.875372128409829e-07, + "logits/chosen": -2.890432834625244, + "logits/rejected": -2.8096516132354736, + "logps/chosen": -313.586669921875, + "logps/rejected": -263.13653564453125, + "loss": 0.6558, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02410699799656868, + "rewards/margins": 0.10518620163202286, + "rewards/margins_max": 0.2743312418460846, + "rewards/margins_min": -0.021943673491477966, + "rewards/margins_std": 0.1392769068479538, + "rewards/rejected": -0.08107919991016388, + "step": 730 + }, + { + "epoch": 0.19, + "grad_norm": 2.094035228830045, + "learning_rate": 4.868150178605653e-07, + "logits/chosen": -2.783390760421753, + "logits/rejected": -2.836897373199463, + "logps/chosen": -269.4111328125, + "logps/rejected": -335.0610046386719, + "loss": 0.6654, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00902662891894579, + "rewards/margins": 0.03886953741312027, + "rewards/margins_max": 0.2387189418077469, + "rewards/margins_min": -0.14600315690040588, + "rewards/margins_std": 0.16796886920928955, + "rewards/rejected": -0.04789616912603378, + "step": 740 + }, + { + "epoch": 0.2, + "grad_norm": 2.367114176102428, + "learning_rate": 4.860730488943068e-07, + "logits/chosen": -2.770197868347168, + "logits/rejected": -2.838973045349121, + "logps/chosen": -254.0792999267578, + "logps/rejected": -276.044921875, + "loss": 0.6632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007259447127580643, + "rewards/margins": 0.06518127024173737, + "rewards/margins_max": 0.22273357212543488, + "rewards/margins_min": -0.06072039157152176, + "rewards/margins_std": 0.1269427388906479, + "rewards/rejected": -0.07244071364402771, + "step": 750 + }, + { + "epoch": 0.2, + "grad_norm": 2.345833729483626, + "learning_rate": 4.853113678964021e-07, + "logits/chosen": -2.866006851196289, + "logits/rejected": -2.7869017124176025, + "logps/chosen": -293.8077087402344, + "logps/rejected": -251.48779296875, + "loss": 0.654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03852896764874458, + "rewards/margins": 0.12122176587581635, + "rewards/margins_max": 0.2612719237804413, + "rewards/margins_min": 0.007431653328239918, + "rewards/margins_std": 0.11730413138866425, + "rewards/rejected": -0.08269279450178146, + "step": 760 + }, + { + "epoch": 0.2, + "grad_norm": 2.227742842635253, + "learning_rate": 4.845300384669957e-07, + "logits/chosen": -2.8291211128234863, + "logits/rejected": -2.7683908939361572, + "logps/chosen": -317.635009765625, + "logps/rejected": -284.8511962890625, + "loss": 0.6493, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02825760282576084, + "rewards/margins": 0.10237088054418564, + "rewards/margins_max": 0.3379212021827698, + "rewards/margins_min": -0.0696760043501854, + "rewards/margins_std": 0.17869111895561218, + "rewards/rejected": -0.13062849640846252, + "step": 770 + }, + { + "epoch": 0.2, + "grad_norm": 3.0787860915639174, + "learning_rate": 4.8372912584687e-07, + "logits/chosen": -2.87286376953125, + "logits/rejected": -2.809812307357788, + "logps/chosen": -312.3179626464844, + "logps/rejected": -289.5312805175781, + "loss": 0.6559, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004219514317810535, + "rewards/margins": 0.08016898483037949, + "rewards/margins_max": 0.2879684567451477, + "rewards/margins_min": -0.07088414579629898, + "rewards/margins_std": 0.15463107824325562, + "rewards/rejected": -0.08438849449157715, + "step": 780 + }, + { + "epoch": 0.21, + "grad_norm": 2.3434691197253277, + "learning_rate": 4.829086969119983e-07, + "logits/chosen": -2.8896708488464355, + "logits/rejected": -2.8565032482147217, + "logps/chosen": -293.54132080078125, + "logps/rejected": -275.9646301269531, + "loss": 0.665, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04692497104406357, + "rewards/margins": 0.04775308072566986, + "rewards/margins_max": 0.20688419044017792, + "rewards/margins_min": -0.13901486992835999, + "rewards/margins_std": 0.15275272727012634, + "rewards/rejected": -0.09467805922031403, + "step": 790 + }, + { + "epoch": 0.21, + "grad_norm": 2.2991182959255636, + "learning_rate": 4.820688201679605e-07, + "logits/chosen": -2.7383341789245605, + "logits/rejected": -2.708282232284546, + "logps/chosen": -353.880126953125, + "logps/rejected": -266.53765869140625, + "loss": 0.6421, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012230666354298592, + "rewards/margins": 0.138229638338089, + "rewards/margins_max": 0.3964124321937561, + "rewards/margins_min": -0.05819373577833176, + "rewards/margins_std": 0.2058323174715042, + "rewards/rejected": -0.15046028792858124, + "step": 800 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.787998914718628, + "eval_logits/rejected": -2.7518978118896484, + "eval_logps/chosen": -286.80169677734375, + "eval_logps/rejected": -273.6846618652344, + "eval_loss": 0.656235933303833, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": -0.0230683833360672, + "eval_rewards/margins": 0.09228412806987762, + "eval_rewards/margins_max": 0.37058985233306885, + "eval_rewards/margins_min": -0.17606160044670105, + "eval_rewards/margins_std": 0.18203040957450867, + "eval_rewards/rejected": -0.11535251140594482, + "eval_runtime": 391.4416, + "eval_samples_per_second": 5.109, + "eval_steps_per_second": 0.161, + "step": 800 + }, + { + "epoch": 0.21, + "grad_norm": 2.570075583626715, + "learning_rate": 4.812095657442231e-07, + "logits/chosen": -2.8586349487304688, + "logits/rejected": -2.8139870166778564, + "logps/chosen": -309.8063049316406, + "logps/rejected": -281.1913146972656, + "loss": 0.6462, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014870223589241505, + "rewards/margins": 0.10527092218399048, + "rewards/margins_max": 0.3421846032142639, + "rewards/margins_min": -0.10647294670343399, + "rewards/margins_std": 0.20276649296283722, + "rewards/rejected": -0.12014114856719971, + "step": 810 + }, + { + "epoch": 0.21, + "grad_norm": 2.4601158102424137, + "learning_rate": 4.803310053882831e-07, + "logits/chosen": -2.716233491897583, + "logits/rejected": -2.649547815322876, + "logps/chosen": -270.30303955078125, + "logps/rejected": -226.66641235351562, + "loss": 0.6469, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.004079883452504873, + "rewards/margins": 0.1493518054485321, + "rewards/margins_max": 0.4142698347568512, + "rewards/margins_min": -0.04770839586853981, + "rewards/margins_std": 0.2037762701511383, + "rewards/rejected": -0.15343168377876282, + "step": 820 + }, + { + "epoch": 0.22, + "grad_norm": 2.4684877859968193, + "learning_rate": 4.794332124596775e-07, + "logits/chosen": -2.745849609375, + "logits/rejected": -2.7065165042877197, + "logps/chosen": -323.3494567871094, + "logps/rejected": -310.7379455566406, + "loss": 0.6565, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007220558822154999, + "rewards/margins": 0.08912922441959381, + "rewards/margins_max": 0.3232632279396057, + "rewards/margins_min": -0.08977918326854706, + "rewards/margins_std": 0.1849728524684906, + "rewards/rejected": -0.0963498055934906, + "step": 830 + }, + { + "epoch": 0.22, + "grad_norm": 2.9483084985503996, + "learning_rate": 4.785162619238574e-07, + "logits/chosen": -2.8032937049865723, + "logits/rejected": -2.7877614498138428, + "logps/chosen": -257.4497985839844, + "logps/rejected": -216.58297729492188, + "loss": 0.6636, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.022510075941681862, + "rewards/margins": 0.04860452190041542, + "rewards/margins_max": 0.22036270797252655, + "rewards/margins_min": -0.11207699775695801, + "rewards/margins_std": 0.15363383293151855, + "rewards/rejected": -0.07111459970474243, + "step": 840 + }, + { + "epoch": 0.22, + "grad_norm": 2.598261391274029, + "learning_rate": 4.775802303459287e-07, + "logits/chosen": -2.7339680194854736, + "logits/rejected": -2.6969475746154785, + "logps/chosen": -234.9973907470703, + "logps/rejected": -246.1621856689453, + "loss": 0.6429, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027218088507652283, + "rewards/margins": 0.045462869107723236, + "rewards/margins_max": 0.2264690399169922, + "rewards/margins_min": -0.1118873581290245, + "rewards/margins_std": 0.1564524918794632, + "rewards/rejected": -0.07268096506595612, + "step": 850 + }, + { + "epoch": 0.23, + "grad_norm": 2.6844287445052757, + "learning_rate": 4.766251958842589e-07, + "logits/chosen": -2.709624767303467, + "logits/rejected": -2.7868399620056152, + "logps/chosen": -146.85594177246094, + "logps/rejected": -210.48385620117188, + "loss": 0.6581, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017467444762587547, + "rewards/margins": 0.08572043478488922, + "rewards/margins_max": 0.31933772563934326, + "rewards/margins_min": -0.0943085104227066, + "rewards/margins_std": 0.17804118990898132, + "rewards/rejected": -0.06825298070907593, + "step": 860 + }, + { + "epoch": 0.23, + "grad_norm": 3.26143073906196, + "learning_rate": 4.756512382839506e-07, + "logits/chosen": -2.737210750579834, + "logits/rejected": -2.719393491744995, + "logps/chosen": -306.5011291503906, + "logps/rejected": -251.0727081298828, + "loss": 0.6466, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.005747741553932428, + "rewards/margins": 0.1588570773601532, + "rewards/margins_max": 0.3946298062801361, + "rewards/margins_min": -0.04763476178050041, + "rewards/margins_std": 0.20159199833869934, + "rewards/rejected": -0.1531093418598175, + "step": 870 + }, + { + "epoch": 0.23, + "grad_norm": 2.5109300101737624, + "learning_rate": 4.746584388701831e-07, + "logits/chosen": -2.8163034915924072, + "logits/rejected": -2.8024582862854004, + "logps/chosen": -270.1556396484375, + "logps/rejected": -318.8199157714844, + "loss": 0.666, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0483771488070488, + "rewards/margins": 0.04363058879971504, + "rewards/margins_max": 0.2674371004104614, + "rewards/margins_min": -0.20797009766101837, + "rewards/margins_std": 0.20692701637744904, + "rewards/rejected": -0.09200773388147354, + "step": 880 + }, + { + "epoch": 0.23, + "grad_norm": 2.8116402901537425, + "learning_rate": 4.736468805414218e-07, + "logits/chosen": -2.7627415657043457, + "logits/rejected": -2.720236301422119, + "logps/chosen": -311.46551513671875, + "logps/rejected": -267.77471923828125, + "loss": 0.6486, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0706297755241394, + "rewards/margins": 0.09270794689655304, + "rewards/margins_max": 0.33370673656463623, + "rewards/margins_min": -0.11423573642969131, + "rewards/margins_std": 0.1945406049489975, + "rewards/rejected": -0.16333773732185364, + "step": 890 + }, + { + "epoch": 0.24, + "grad_norm": 3.0400011040033044, + "learning_rate": 4.7261664776249595e-07, + "logits/chosen": -2.758789539337158, + "logits/rejected": -2.7628347873687744, + "logps/chosen": -237.7381134033203, + "logps/rejected": -288.02923583984375, + "loss": 0.648, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08034859597682953, + "rewards/margins": 0.08839185535907745, + "rewards/margins_max": 0.37854239344596863, + "rewards/margins_min": -0.17396271228790283, + "rewards/margins_std": 0.2471902072429657, + "rewards/rejected": -0.1687404364347458, + "step": 900 + }, + { + "epoch": 0.24, + "eval_logits/chosen": -2.7834675312042236, + "eval_logits/rejected": -2.747748374938965, + "eval_logps/chosen": -291.9791259765625, + "eval_logps/rejected": -281.5313720703125, + "eval_loss": 0.6480011940002441, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.07484235614538193, + "eval_rewards/margins": 0.1189778596162796, + "eval_rewards/margins_max": 0.482337087392807, + "eval_rewards/margins_min": -0.2242155224084854, + "eval_rewards/margins_std": 0.2358839511871338, + "eval_rewards/rejected": -0.19382022321224213, + "eval_runtime": 391.0601, + "eval_samples_per_second": 5.114, + "eval_steps_per_second": 0.161, + "step": 900 + }, + { + "epoch": 0.24, + "grad_norm": 2.970779046788304, + "learning_rate": 4.7156782655754624e-07, + "logits/chosen": -2.782240629196167, + "logits/rejected": -2.76359224319458, + "logps/chosen": -275.40557861328125, + "logps/rejected": -260.886474609375, + "loss": 0.645, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.06377319991588593, + "rewards/margins": 0.10989705473184586, + "rewards/margins_max": 0.39979439973831177, + "rewards/margins_min": -0.1536472737789154, + "rewards/margins_std": 0.2433263510465622, + "rewards/rejected": -0.17367026209831238, + "step": 910 + }, + { + "epoch": 0.24, + "grad_norm": 2.4418458588586818, + "learning_rate": 4.705005045028414e-07, + "logits/chosen": -2.798334836959839, + "logits/rejected": -2.7217626571655273, + "logps/chosen": -295.2657165527344, + "logps/rejected": -265.0308532714844, + "loss": 0.6571, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.049138277769088745, + "rewards/margins": 0.07089325040578842, + "rewards/margins_max": 0.30888086557388306, + "rewards/margins_min": -0.2100508213043213, + "rewards/margins_std": 0.23000212013721466, + "rewards/rejected": -0.12003152072429657, + "step": 920 + }, + { + "epoch": 0.24, + "grad_norm": 2.885220150274803, + "learning_rate": 4.694147707194659e-07, + "logits/chosen": -2.7554876804351807, + "logits/rejected": -2.674265146255493, + "logps/chosen": -338.0996398925781, + "logps/rejected": -315.718505859375, + "loss": 0.6182, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03579360246658325, + "rewards/margins": 0.18136434257030487, + "rewards/margins_max": 0.5156379342079163, + "rewards/margins_min": -0.15117457509040833, + "rewards/margins_std": 0.3062846064567566, + "rewards/rejected": -0.21715793013572693, + "step": 930 + }, + { + "epoch": 0.25, + "grad_norm": 3.4228904297454465, + "learning_rate": 4.683107158658781e-07, + "logits/chosen": -2.76875638961792, + "logits/rejected": -2.7667553424835205, + "logps/chosen": -308.06768798828125, + "logps/rejected": -289.34466552734375, + "loss": 0.639, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0005728408577851951, + "rewards/margins": 0.10930474102497101, + "rewards/margins_max": 0.3374796509742737, + "rewards/margins_min": -0.15817001461982727, + "rewards/margins_std": 0.21780426800251007, + "rewards/rejected": -0.1098775863647461, + "step": 940 + }, + { + "epoch": 0.25, + "grad_norm": 2.5362388521102686, + "learning_rate": 4.6718843213034066e-07, + "logits/chosen": -2.7847044467926025, + "logits/rejected": -2.8171563148498535, + "logps/chosen": -273.6488037109375, + "logps/rejected": -321.4742431640625, + "loss": 0.6409, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0013743474846705794, + "rewards/margins": 0.13102427124977112, + "rewards/margins_max": 0.38909009099006653, + "rewards/margins_min": -0.19913214445114136, + "rewards/margins_std": 0.27481839060783386, + "rewards/rejected": -0.13239862024784088, + "step": 950 + }, + { + "epoch": 0.25, + "grad_norm": 2.6719721726264822, + "learning_rate": 4.660480132232224e-07, + "logits/chosen": -2.801206111907959, + "logits/rejected": -2.701752185821533, + "logps/chosen": -375.70648193359375, + "logps/rejected": -313.06781005859375, + "loss": 0.6355, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.05575571209192276, + "rewards/margins": 0.07538709044456482, + "rewards/margins_max": 0.37454521656036377, + "rewards/margins_min": -0.16868866980075836, + "rewards/margins_std": 0.25122708082199097, + "rewards/rejected": -0.13114280998706818, + "step": 960 + }, + { + "epoch": 0.25, + "grad_norm": 2.917127046127246, + "learning_rate": 4.64889554369174e-07, + "logits/chosen": -2.7776808738708496, + "logits/rejected": -2.7744948863983154, + "logps/chosen": -330.93975830078125, + "logps/rejected": -301.06915283203125, + "loss": 0.6212, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.00033350809826515615, + "rewards/margins": 0.21753618121147156, + "rewards/margins_max": 0.44270166754722595, + "rewards/margins_min": -0.07134756445884705, + "rewards/margins_std": 0.22576940059661865, + "rewards/rejected": -0.21786971390247345, + "step": 970 + }, + { + "epoch": 0.26, + "grad_norm": 2.630156583874938, + "learning_rate": 4.637131522991764e-07, + "logits/chosen": -2.8452365398406982, + "logits/rejected": -2.8390238285064697, + "logps/chosen": -311.511474609375, + "logps/rejected": -262.41619873046875, + "loss": 0.6312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.055445801466703415, + "rewards/margins": 0.1492082178592682, + "rewards/margins_max": 0.45821937918663025, + "rewards/margins_min": -0.12308479845523834, + "rewards/margins_std": 0.26353055238723755, + "rewards/rejected": -0.2046540230512619, + "step": 980 + }, + { + "epoch": 0.26, + "grad_norm": 3.5375964094308534, + "learning_rate": 4.6251890524246375e-07, + "logits/chosen": -2.7648274898529053, + "logits/rejected": -2.690225601196289, + "logps/chosen": -346.8294677734375, + "logps/rejected": -316.94134521484375, + "loss": 0.6342, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.0730915293097496, + "rewards/margins": 0.18477630615234375, + "rewards/margins_max": 0.47284793853759766, + "rewards/margins_min": -0.1267492026090622, + "rewards/margins_std": 0.27003124356269836, + "rewards/rejected": -0.25786784291267395, + "step": 990 + }, + { + "epoch": 0.26, + "grad_norm": 3.292440315464503, + "learning_rate": 4.613069129183218e-07, + "logits/chosen": -2.7658019065856934, + "logits/rejected": -2.7269351482391357, + "logps/chosen": -254.8045196533203, + "logps/rejected": -239.03604125976562, + "loss": 0.6547, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1744861900806427, + "rewards/margins": 0.08728297054767609, + "rewards/margins_max": 0.38256916403770447, + "rewards/margins_min": -0.21102270483970642, + "rewards/margins_std": 0.26708346605300903, + "rewards/rejected": -0.26176920533180237, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_logits/chosen": -2.779768943786621, + "eval_logits/rejected": -2.744645833969116, + "eval_logps/chosen": -292.126220703125, + "eval_logps/rejected": -284.93408203125, + "eval_loss": 0.6377893090248108, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.07631318271160126, + "eval_rewards/margins": 0.15153387188911438, + "eval_rewards/margins_max": 0.5994539856910706, + "eval_rewards/margins_min": -0.28164777159690857, + "eval_rewards/margins_std": 0.2953792214393616, + "eval_rewards/rejected": -0.22784705460071564, + "eval_runtime": 400.8047, + "eval_samples_per_second": 4.99, + "eval_steps_per_second": 0.157, + "step": 1000 + }, + { + "epoch": 0.26, + "grad_norm": 3.7765372676740343, + "learning_rate": 4.6007727652776065e-07, + "logits/chosen": -2.82297682762146, + "logits/rejected": -2.763232469558716, + "logps/chosen": -288.2690124511719, + "logps/rejected": -302.622314453125, + "loss": 0.6255, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0548722967505455, + "rewards/margins": 0.1781053990125656, + "rewards/margins_max": 0.5366408824920654, + "rewards/margins_min": -0.1015828400850296, + "rewards/margins_std": 0.29184603691101074, + "rewards/rejected": -0.2329777181148529, + "step": 1010 + }, + { + "epoch": 0.27, + "grad_norm": 2.81044864018951, + "learning_rate": 4.588300987450652e-07, + "logits/chosen": -2.88057541847229, + "logits/rejected": -2.796628475189209, + "logps/chosen": -273.1822509765625, + "logps/rejected": -287.0662841796875, + "loss": 0.6412, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.06448214501142502, + "rewards/margins": 0.07124833017587662, + "rewards/margins_max": 0.3305808901786804, + "rewards/margins_min": -0.15144583582878113, + "rewards/margins_std": 0.2111046016216278, + "rewards/rejected": -0.13573047518730164, + "step": 1020 + }, + { + "epoch": 0.27, + "grad_norm": 3.564428108859133, + "learning_rate": 4.5756548370922134e-07, + "logits/chosen": -2.784345865249634, + "logits/rejected": -2.7977664470672607, + "logps/chosen": -303.06097412109375, + "logps/rejected": -315.5431213378906, + "loss": 0.6303, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02970045804977417, + "rewards/margins": 0.19278813898563385, + "rewards/margins_max": 0.5599647760391235, + "rewards/margins_min": -0.13379275798797607, + "rewards/margins_std": 0.31263867020606995, + "rewards/rejected": -0.22248859703540802, + "step": 1030 + }, + { + "epoch": 0.27, + "grad_norm": 3.335890043756419, + "learning_rate": 4.5628353701522047e-07, + "logits/chosen": -2.704432725906372, + "logits/rejected": -2.6589608192443848, + "logps/chosen": -303.9294128417969, + "logps/rejected": -283.72711181640625, + "loss": 0.6396, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.058063607662916183, + "rewards/margins": 0.19258825480937958, + "rewards/margins_max": 0.6151366233825684, + "rewards/margins_min": -0.13671031594276428, + "rewards/margins_std": 0.35448747873306274, + "rewards/rejected": -0.25065189599990845, + "step": 1040 + }, + { + "epoch": 0.27, + "grad_norm": 4.707953018537447, + "learning_rate": 4.549843657052429e-07, + "logits/chosen": -2.7663302421569824, + "logits/rejected": -2.695659637451172, + "logps/chosen": -331.55462646484375, + "logps/rejected": -315.97015380859375, + "loss": 0.6325, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.041464101523160934, + "rewards/margins": 0.14594176411628723, + "rewards/margins_max": 0.5471688508987427, + "rewards/margins_min": -0.24168343842029572, + "rewards/margins_std": 0.35626357793807983, + "rewards/rejected": -0.18740583956241608, + "step": 1050 + }, + { + "epoch": 0.28, + "grad_norm": 4.015113412055745, + "learning_rate": 4.5366807825971907e-07, + "logits/chosen": -2.812615156173706, + "logits/rejected": -2.751713991165161, + "logps/chosen": -358.430419921875, + "logps/rejected": -360.36614990234375, + "loss": 0.6227, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.045555230230093, + "rewards/margins": 0.16782787442207336, + "rewards/margins_max": 0.5209230184555054, + "rewards/margins_min": -0.17321905493736267, + "rewards/margins_std": 0.32079094648361206, + "rewards/rejected": -0.21338307857513428, + "step": 1060 + }, + { + "epoch": 0.28, + "grad_norm": 4.203474632951669, + "learning_rate": 4.5233478458827176e-07, + "logits/chosen": -2.8208816051483154, + "logits/rejected": -2.824300765991211, + "logps/chosen": -264.9644470214844, + "logps/rejected": -278.8236999511719, + "loss": 0.6338, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.056551020592451096, + "rewards/margins": 0.23958036303520203, + "rewards/margins_max": 0.6823800206184387, + "rewards/margins_min": -0.2035522162914276, + "rewards/margins_std": 0.38822150230407715, + "rewards/rejected": -0.296131432056427, + "step": 1070 + }, + { + "epoch": 0.28, + "grad_norm": 3.1602479900157263, + "learning_rate": 4.509845960205389e-07, + "logits/chosen": -2.813753366470337, + "logits/rejected": -2.7190589904785156, + "logps/chosen": -333.58935546875, + "logps/rejected": -343.64764404296875, + "loss": 0.6478, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.129521906375885, + "rewards/margins": 0.0951724722981453, + "rewards/margins_max": 0.49923378229141235, + "rewards/margins_min": -0.3019828796386719, + "rewards/margins_std": 0.358330100774765, + "rewards/rejected": -0.2246943712234497, + "step": 1080 + }, + { + "epoch": 0.29, + "grad_norm": 3.2085630821766817, + "learning_rate": 4.4961762529687736e-07, + "logits/chosen": -2.8025732040405273, + "logits/rejected": -2.735069513320923, + "logps/chosen": -266.5699462890625, + "logps/rejected": -240.78662109375, + "loss": 0.6242, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.11671829223632812, + "rewards/margins": 0.1773519665002823, + "rewards/margins_max": 0.5497051477432251, + "rewards/margins_min": -0.2379584014415741, + "rewards/margins_std": 0.3455933928489685, + "rewards/rejected": -0.2940702438354492, + "step": 1090 + }, + { + "epoch": 0.29, + "grad_norm": 3.693720855856101, + "learning_rate": 4.482339865589492e-07, + "logits/chosen": -2.7057557106018066, + "logits/rejected": -2.7166707515716553, + "logps/chosen": -300.71044921875, + "logps/rejected": -291.16351318359375, + "loss": 0.6408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12895426154136658, + "rewards/margins": 0.1124846488237381, + "rewards/margins_max": 0.47015827894210815, + "rewards/margins_min": -0.257484495639801, + "rewards/margins_std": 0.32513368129730225, + "rewards/rejected": -0.24143891036510468, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_logits/chosen": -2.788508653640747, + "eval_logits/rejected": -2.754523754119873, + "eval_logps/chosen": -288.8172912597656, + "eval_logps/rejected": -283.51324462890625, + "eval_loss": 0.6316921710968018, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.043223775923252106, + "eval_rewards/margins": 0.1704149693250656, + "eval_rewards/margins_max": 0.6413965225219727, + "eval_rewards/margins_min": -0.29533880949020386, + "eval_rewards/margins_std": 0.3162749111652374, + "eval_rewards/rejected": -0.21363872289657593, + "eval_runtime": 410.5359, + "eval_samples_per_second": 4.872, + "eval_steps_per_second": 0.153, + "step": 1100 + }, + { + "epoch": 0.29, + "grad_norm": 3.063538978434532, + "learning_rate": 4.4683379534019076e-07, + "logits/chosen": -2.782644033432007, + "logits/rejected": -2.7872426509857178, + "logps/chosen": -280.27545166015625, + "logps/rejected": -282.95220947265625, + "loss": 0.6229, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07499603927135468, + "rewards/margins": 0.20169000327587128, + "rewards/margins_max": 0.5610953569412231, + "rewards/margins_min": -0.08692102134227753, + "rewards/margins_std": 0.2816941738128662, + "rewards/rejected": -0.27668604254722595, + "step": 1110 + }, + { + "epoch": 0.29, + "grad_norm": 3.274424984370765, + "learning_rate": 4.4541716855616593e-07, + "logits/chosen": -2.8156991004943848, + "logits/rejected": -2.7696075439453125, + "logps/chosen": -275.5388488769531, + "logps/rejected": -225.65670776367188, + "loss": 0.6332, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.04060032591223717, + "rewards/margins": 0.16571339964866638, + "rewards/margins_max": 0.45696109533309937, + "rewards/margins_min": -0.1612904667854309, + "rewards/margins_std": 0.2717466354370117, + "rewards/rejected": -0.20631375908851624, + "step": 1120 + }, + { + "epoch": 0.3, + "grad_norm": 3.9935978676918644, + "learning_rate": 4.4398422449480357e-07, + "logits/chosen": -2.8790533542633057, + "logits/rejected": -2.8592159748077393, + "logps/chosen": -299.87152099609375, + "logps/rejected": -283.5611267089844, + "loss": 0.6041, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.049256496131420135, + "rewards/margins": 0.2299860417842865, + "rewards/margins_max": 0.6271066069602966, + "rewards/margins_min": -0.12086255848407745, + "rewards/margins_std": 0.3318033218383789, + "rewards/rejected": -0.27924251556396484, + "step": 1130 + }, + { + "epoch": 0.3, + "grad_norm": 6.331396542930943, + "learning_rate": 4.4253508280652036e-07, + "logits/chosen": -2.800306797027588, + "logits/rejected": -2.7696163654327393, + "logps/chosen": -242.6008758544922, + "logps/rejected": -227.0926971435547, + "loss": 0.6265, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13074740767478943, + "rewards/margins": 0.1560574769973755, + "rewards/margins_max": 0.5698440074920654, + "rewards/margins_min": -0.2698151469230652, + "rewards/margins_std": 0.3701775074005127, + "rewards/rejected": -0.28680485486984253, + "step": 1140 + }, + { + "epoch": 0.3, + "grad_norm": 4.1821261413173145, + "learning_rate": 4.410698644942302e-07, + "logits/chosen": -2.852303981781006, + "logits/rejected": -2.846554756164551, + "logps/chosen": -313.3166809082031, + "logps/rejected": -284.25653076171875, + "loss": 0.6023, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03649923950433731, + "rewards/margins": 0.3177655041217804, + "rewards/margins_max": 0.6924189925193787, + "rewards/margins_min": -0.058290135115385056, + "rewards/margins_std": 0.32986223697662354, + "rewards/rejected": -0.3542647361755371, + "step": 1150 + }, + { + "epoch": 0.3, + "grad_norm": 4.223020525525396, + "learning_rate": 4.3958869190324057e-07, + "logits/chosen": -2.755011558532715, + "logits/rejected": -2.739072322845459, + "logps/chosen": -200.59523010253906, + "logps/rejected": -221.538818359375, + "loss": 0.6155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09212960302829742, + "rewards/margins": 0.17368380725383759, + "rewards/margins_max": 0.5498765707015991, + "rewards/margins_min": -0.12208227813243866, + "rewards/margins_std": 0.29486706852912903, + "rewards/rejected": -0.265813410282135, + "step": 1160 + }, + { + "epoch": 0.31, + "grad_norm": 4.616920424739203, + "learning_rate": 4.380916887110365e-07, + "logits/chosen": -2.76914644241333, + "logits/rejected": -2.7425856590270996, + "logps/chosen": -261.69647216796875, + "logps/rejected": -283.03277587890625, + "loss": 0.6117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.13767361640930176, + "rewards/margins": 0.13705959916114807, + "rewards/margins_max": 0.4427351951599121, + "rewards/margins_min": -0.25656723976135254, + "rewards/margins_std": 0.30955496430397034, + "rewards/rejected": -0.27473321557044983, + "step": 1170 + }, + { + "epoch": 0.31, + "grad_norm": 4.007184014668779, + "learning_rate": 4.3657897991695394e-07, + "logits/chosen": -2.777548313140869, + "logits/rejected": -2.727642774581909, + "logps/chosen": -278.05328369140625, + "logps/rejected": -277.41314697265625, + "loss": 0.6265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.08509541302919388, + "rewards/margins": 0.23192818462848663, + "rewards/margins_max": 0.6069828271865845, + "rewards/margins_min": -0.11118284612894058, + "rewards/margins_std": 0.32264116406440735, + "rewards/rejected": -0.3170236051082611, + "step": 1180 + }, + { + "epoch": 0.31, + "grad_norm": 6.562683684185295, + "learning_rate": 4.350506918317416e-07, + "logits/chosen": -2.8414430618286133, + "logits/rejected": -2.8465590476989746, + "logps/chosen": -264.95806884765625, + "logps/rejected": -241.4886474609375, + "loss": 0.6258, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.03980771824717522, + "rewards/margins": 0.18716038763523102, + "rewards/margins_max": 0.44271737337112427, + "rewards/margins_min": -0.055542610585689545, + "rewards/margins_std": 0.22382323443889618, + "rewards/rejected": -0.22696809470653534, + "step": 1190 + }, + { + "epoch": 0.31, + "grad_norm": 3.894378322403381, + "learning_rate": 4.335069520670149e-07, + "logits/chosen": -2.7170228958129883, + "logits/rejected": -2.6704020500183105, + "logps/chosen": -239.4353790283203, + "logps/rejected": -261.59783935546875, + "loss": 0.6358, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10473034530878067, + "rewards/margins": 0.06937801837921143, + "rewards/margins_max": 0.43103742599487305, + "rewards/margins_min": -0.298106849193573, + "rewards/margins_std": 0.32226479053497314, + "rewards/rejected": -0.1741083562374115, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.79135799407959, + "eval_logits/rejected": -2.7584729194641113, + "eval_logps/chosen": -289.78094482421875, + "eval_logps/rejected": -286.95135498046875, + "eval_loss": 0.6260370016098022, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.052860863506793976, + "eval_rewards/margins": 0.19515882432460785, + "eval_rewards/margins_max": 0.7219139933586121, + "eval_rewards/margins_min": -0.3248654007911682, + "eval_rewards/margins_std": 0.3520183563232422, + "eval_rewards/rejected": -0.24801968038082123, + "eval_runtime": 410.9069, + "eval_samples_per_second": 4.867, + "eval_steps_per_second": 0.153, + "step": 1200 + }, + { + "epoch": 0.32, + "grad_norm": 5.496184499461788, + "learning_rate": 4.319478895245999e-07, + "logits/chosen": -2.8411569595336914, + "logits/rejected": -2.8288633823394775, + "logps/chosen": -314.07757568359375, + "logps/rejected": -281.8154296875, + "loss": 0.6302, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05773833394050598, + "rewards/margins": 0.10406303405761719, + "rewards/margins_max": 0.6046757698059082, + "rewards/margins_min": -0.332234650850296, + "rewards/margins_std": 0.4285581707954407, + "rewards/rejected": -0.16180138289928436, + "step": 1210 + }, + { + "epoch": 0.32, + "grad_norm": 2.828352270952739, + "learning_rate": 4.3037363438577036e-07, + "logits/chosen": -2.8402397632598877, + "logits/rejected": -2.7890241146087646, + "logps/chosen": -292.60516357421875, + "logps/rejected": -295.42816162109375, + "loss": 0.6143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05674747750163078, + "rewards/margins": 0.20263764262199402, + "rewards/margins_max": 0.6524065136909485, + "rewards/margins_min": -0.2658189833164215, + "rewards/margins_std": 0.4090502858161926, + "rewards/rejected": -0.2593851387500763, + "step": 1220 + }, + { + "epoch": 0.32, + "grad_norm": 4.049801979404387, + "learning_rate": 4.2878431810037716e-07, + "logits/chosen": -2.775773286819458, + "logits/rejected": -2.7828445434570312, + "logps/chosen": -281.5233154296875, + "logps/rejected": -288.1645812988281, + "loss": 0.6454, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06439115107059479, + "rewards/margins": 0.09785722941160202, + "rewards/margins_max": 0.46970024704933167, + "rewards/margins_min": -0.18271590769290924, + "rewards/margins_std": 0.29533350467681885, + "rewards/rejected": -0.1622483730316162, + "step": 1230 + }, + { + "epoch": 0.32, + "grad_norm": 3.6965665403197763, + "learning_rate": 4.271800733758729e-07, + "logits/chosen": -2.6110711097717285, + "logits/rejected": -2.645777940750122, + "logps/chosen": -254.79833984375, + "logps/rejected": -238.7711639404297, + "loss": 0.6067, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010612031444907188, + "rewards/margins": 0.25531864166259766, + "rewards/margins_max": 0.6268452405929565, + "rewards/margins_min": -0.0757184848189354, + "rewards/margins_std": 0.30576324462890625, + "rewards/rejected": -0.2447066307067871, + "step": 1240 + }, + { + "epoch": 0.33, + "grad_norm": 8.892128332916844, + "learning_rate": 4.255610341662304e-07, + "logits/chosen": -2.688994884490967, + "logits/rejected": -2.6815290451049805, + "logps/chosen": -278.12890625, + "logps/rejected": -272.32452392578125, + "loss": 0.634, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.058887314051389694, + "rewards/margins": 0.1176818385720253, + "rewards/margins_max": 0.5397322177886963, + "rewards/margins_min": -0.2698483169078827, + "rewards/margins_std": 0.3672744631767273, + "rewards/rejected": -0.1765691339969635, + "step": 1250 + }, + { + "epoch": 0.33, + "grad_norm": 3.8112219247256225, + "learning_rate": 4.2392733566075757e-07, + "logits/chosen": -2.711003065109253, + "logits/rejected": -2.6918251514434814, + "logps/chosen": -244.88818359375, + "logps/rejected": -231.98623657226562, + "loss": 0.6161, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10779497772455215, + "rewards/margins": 0.21141520142555237, + "rewards/margins_max": 0.5362716317176819, + "rewards/margins_min": -0.07695086300373077, + "rewards/margins_std": 0.2779676616191864, + "rewards/rejected": -0.3192101716995239, + "step": 1260 + }, + { + "epoch": 0.33, + "grad_norm": 4.215036523916396, + "learning_rate": 4.2227911427280973e-07, + "logits/chosen": -2.7580220699310303, + "logits/rejected": -2.7843055725097656, + "logps/chosen": -305.9957275390625, + "logps/rejected": -324.3888854980469, + "loss": 0.6395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16160346567630768, + "rewards/margins": 0.13749554753303528, + "rewards/margins_max": 0.4947836995124817, + "rewards/margins_min": -0.19255313277244568, + "rewards/margins_std": 0.3127996325492859, + "rewards/rejected": -0.29909902811050415, + "step": 1270 + }, + { + "epoch": 0.33, + "grad_norm": 5.6234179497220245, + "learning_rate": 4.206165076283982e-07, + "logits/chosen": -2.6351473331451416, + "logits/rejected": -2.6937224864959717, + "logps/chosen": -249.9329071044922, + "logps/rejected": -280.60107421875, + "loss": 0.6451, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25451716780662537, + "rewards/margins": 0.10636570304632187, + "rewards/margins_max": 0.45388349890708923, + "rewards/margins_min": -0.3022248148918152, + "rewards/margins_std": 0.32480502128601074, + "rewards/rejected": -0.36088284850120544, + "step": 1280 + }, + { + "epoch": 0.34, + "grad_norm": 5.401758698775327, + "learning_rate": 4.1893965455469946e-07, + "logits/chosen": -2.7507548332214355, + "logits/rejected": -2.7085213661193848, + "logps/chosen": -263.70330810546875, + "logps/rejected": -286.38421630859375, + "loss": 0.6066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18255284428596497, + "rewards/margins": 0.2684578001499176, + "rewards/margins_max": 0.654596745967865, + "rewards/margins_min": -0.11808772385120392, + "rewards/margins_std": 0.32828763127326965, + "rewards/rejected": -0.45101064443588257, + "step": 1290 + }, + { + "epoch": 0.34, + "grad_norm": 7.197311736622978, + "learning_rate": 4.172486950684626e-07, + "logits/chosen": -2.7713100910186768, + "logits/rejected": -2.740427255630493, + "logps/chosen": -227.8197479248047, + "logps/rejected": -268.5971374511719, + "loss": 0.6297, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1955510973930359, + "rewards/margins": 0.2154124677181244, + "rewards/margins_max": 0.5882132649421692, + "rewards/margins_min": -0.23282089829444885, + "rewards/margins_std": 0.3677813708782196, + "rewards/rejected": -0.41096359491348267, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_logits/chosen": -2.7816379070281982, + "eval_logits/rejected": -2.7489497661590576, + "eval_logps/chosen": -296.62750244140625, + "eval_logps/rejected": -295.93121337890625, + "eval_loss": 0.6214948296546936, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": -0.12132612615823746, + "eval_rewards/margins": 0.21649228036403656, + "eval_rewards/margins_max": 0.8113858699798584, + "eval_rewards/margins_min": -0.3726661801338196, + "eval_rewards/margins_std": 0.4027722477912903, + "eval_rewards/rejected": -0.3378183841705322, + "eval_runtime": 449.0094, + "eval_samples_per_second": 4.454, + "eval_steps_per_second": 0.14, + "step": 1300 + }, + { + "epoch": 0.34, + "grad_norm": 4.705574487685353, + "learning_rate": 4.155437703643181e-07, + "logits/chosen": -2.766702175140381, + "logits/rejected": -2.7349419593811035, + "logps/chosen": -299.5374755859375, + "logps/rejected": -285.49359130859375, + "loss": 0.6264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.033387500792741776, + "rewards/margins": 0.1384209543466568, + "rewards/margins_max": 0.5469298958778381, + "rewards/margins_min": -0.21401043236255646, + "rewards/margins_std": 0.3537515699863434, + "rewards/rejected": -0.17180845141410828, + "step": 1310 + }, + { + "epoch": 0.35, + "grad_norm": 6.955312869002579, + "learning_rate": 4.138250228029881e-07, + "logits/chosen": -2.788144826889038, + "logits/rejected": -2.7471871376037598, + "logps/chosen": -270.0939025878906, + "logps/rejected": -272.76171875, + "loss": 0.6246, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.018323495984077454, + "rewards/margins": 0.10347982496023178, + "rewards/margins_max": 0.6144011616706848, + "rewards/margins_min": -0.28958752751350403, + "rewards/margins_std": 0.41604360938072205, + "rewards/rejected": -0.12180329859256744, + "step": 1320 + }, + { + "epoch": 0.35, + "grad_norm": 3.357946047605645, + "learning_rate": 4.1209259589939935e-07, + "logits/chosen": -2.8271148204803467, + "logits/rejected": -2.827716827392578, + "logps/chosen": -268.408935546875, + "logps/rejected": -285.219970703125, + "loss": 0.6146, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0683058574795723, + "rewards/margins": 0.27818092703819275, + "rewards/margins_max": 0.6448127627372742, + "rewards/margins_min": -0.0076767681166529655, + "rewards/margins_std": 0.29030415415763855, + "rewards/rejected": -0.20987507700920105, + "step": 1330 + }, + { + "epoch": 0.35, + "grad_norm": 3.7712505945717933, + "learning_rate": 4.103466343106998e-07, + "logits/chosen": -2.623561382293701, + "logits/rejected": -2.578320264816284, + "logps/chosen": -359.72686767578125, + "logps/rejected": -292.11468505859375, + "loss": 0.6461, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.18919122219085693, + "rewards/margins": 0.08822552114725113, + "rewards/margins_max": 0.5707260370254517, + "rewards/margins_min": -0.4260918200016022, + "rewards/margins_std": 0.4472007155418396, + "rewards/rejected": -0.2774167060852051, + "step": 1340 + }, + { + "epoch": 0.35, + "grad_norm": 5.303885262998874, + "learning_rate": 4.085872838241796e-07, + "logits/chosen": -2.794119358062744, + "logits/rejected": -2.806316614151001, + "logps/chosen": -303.3953552246094, + "logps/rejected": -278.9642639160156, + "loss": 0.6026, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.07012991607189178, + "rewards/margins": 0.22540870308876038, + "rewards/margins_max": 0.6571460962295532, + "rewards/margins_min": -0.27641963958740234, + "rewards/margins_std": 0.4160434305667877, + "rewards/rejected": -0.2955385744571686, + "step": 1350 + }, + { + "epoch": 0.36, + "grad_norm": 4.086484562445031, + "learning_rate": 4.06814691345098e-07, + "logits/chosen": -2.728405714035034, + "logits/rejected": -2.745792865753174, + "logps/chosen": -225.49758911132812, + "logps/rejected": -222.00820922851562, + "loss": 0.6314, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.08252111822366714, + "rewards/margins": 0.12099442631006241, + "rewards/margins_max": 0.4897603392601013, + "rewards/margins_min": -0.21687188744544983, + "rewards/margins_std": 0.3282715380191803, + "rewards/rejected": -0.20351552963256836, + "step": 1360 + }, + { + "epoch": 0.36, + "grad_norm": 3.5289720597970344, + "learning_rate": 4.0502900488441707e-07, + "logits/chosen": -2.849860191345215, + "logits/rejected": -2.803264617919922, + "logps/chosen": -289.9068298339844, + "logps/rejected": -275.4745178222656, + "loss": 0.6005, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.08761168271303177, + "rewards/margins": 0.20659425854682922, + "rewards/margins_max": 0.6366546750068665, + "rewards/margins_min": -0.1784614771604538, + "rewards/margins_std": 0.3691921830177307, + "rewards/rejected": -0.294205904006958, + "step": 1370 + }, + { + "epoch": 0.36, + "grad_norm": 3.319872730527394, + "learning_rate": 4.032303735464422e-07, + "logits/chosen": -2.755680799484253, + "logits/rejected": -2.7704451084136963, + "logps/chosen": -295.24273681640625, + "logps/rejected": -309.2400817871094, + "loss": 0.6126, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07848439365625381, + "rewards/margins": 0.3005848526954651, + "rewards/margins_max": 0.823573112487793, + "rewards/margins_min": -0.21085770428180695, + "rewards/margins_std": 0.46771398186683655, + "rewards/rejected": -0.3790692090988159, + "step": 1380 + }, + { + "epoch": 0.36, + "grad_norm": 6.9622348043181495, + "learning_rate": 4.014189475163726e-07, + "logits/chosen": -2.844123363494873, + "logits/rejected": -2.759593963623047, + "logps/chosen": -294.88800048828125, + "logps/rejected": -260.39202880859375, + "loss": 0.6488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25521814823150635, + "rewards/margins": 0.21306483447551727, + "rewards/margins_max": 0.6271547675132751, + "rewards/margins_min": -0.27479130029678345, + "rewards/margins_std": 0.40610384941101074, + "rewards/rejected": -0.4682829976081848, + "step": 1390 + }, + { + "epoch": 0.37, + "grad_norm": 4.2343484237231035, + "learning_rate": 3.995948780477605e-07, + "logits/chosen": -2.753905773162842, + "logits/rejected": -2.678260087966919, + "logps/chosen": -312.3526306152344, + "logps/rejected": -269.80914306640625, + "loss": 0.6165, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34685835242271423, + "rewards/margins": 0.14330218732357025, + "rewards/margins_max": 0.5787476301193237, + "rewards/margins_min": -0.3321211636066437, + "rewards/margins_std": 0.41013914346694946, + "rewards/rejected": -0.4901604652404785, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_logits/chosen": -2.7732975482940674, + "eval_logits/rejected": -2.740445613861084, + "eval_logps/chosen": -306.2648010253906, + "eval_logps/rejected": -306.34735107421875, + "eval_loss": 0.6213136911392212, + "eval_rewards/accuracies": 0.7103174328804016, + "eval_rewards/chosen": -0.21769876778125763, + "eval_rewards/margins": 0.22428110241889954, + "eval_rewards/margins_max": 0.8625542521476746, + "eval_rewards/margins_min": -0.402217835187912, + "eval_rewards/margins_std": 0.42641615867614746, + "eval_rewards/rejected": -0.44197985529899597, + "eval_runtime": 404.6615, + "eval_samples_per_second": 4.942, + "eval_steps_per_second": 0.156, + "step": 1400 + }, + { + "epoch": 0.37, + "grad_norm": 4.405409279557366, + "learning_rate": 3.977583174498816e-07, + "logits/chosen": -2.736898899078369, + "logits/rejected": -2.7622437477111816, + "logps/chosen": -249.6926727294922, + "logps/rejected": -280.6753845214844, + "loss": 0.6152, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1920183300971985, + "rewards/margins": 0.2110348492860794, + "rewards/margins_max": 0.6893723011016846, + "rewards/margins_min": -0.2937195301055908, + "rewards/margins_std": 0.42061647772789, + "rewards/rejected": -0.4030531942844391, + "step": 1410 + }, + { + "epoch": 0.37, + "grad_norm": 4.748366828033544, + "learning_rate": 3.9590941907501717e-07, + "logits/chosen": -2.7816436290740967, + "logits/rejected": -2.7102437019348145, + "logps/chosen": -264.3079528808594, + "logps/rejected": -228.6800079345703, + "loss": 0.6377, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15209302306175232, + "rewards/margins": 0.19371427595615387, + "rewards/margins_max": 0.5473133325576782, + "rewards/margins_min": -0.1777452975511551, + "rewards/margins_std": 0.3181108236312866, + "rewards/rejected": -0.345807284116745, + "step": 1420 + }, + { + "epoch": 0.37, + "grad_norm": 4.993229669191762, + "learning_rate": 3.9404833730564974e-07, + "logits/chosen": -2.816363573074341, + "logits/rejected": -2.8435776233673096, + "logps/chosen": -228.8834991455078, + "logps/rejected": -265.7474060058594, + "loss": 0.6178, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1920272707939148, + "rewards/margins": 0.17802871763706207, + "rewards/margins_max": 0.5577576160430908, + "rewards/margins_min": -0.13257615268230438, + "rewards/margins_std": 0.31796520948410034, + "rewards/rejected": -0.3700559735298157, + "step": 1430 + }, + { + "epoch": 0.38, + "grad_norm": 7.488544361982304, + "learning_rate": 3.9217522754157117e-07, + "logits/chosen": -2.7809338569641113, + "logits/rejected": -2.748839855194092, + "logps/chosen": -334.3487548828125, + "logps/rejected": -370.34442138671875, + "loss": 0.5798, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.05274087190628052, + "rewards/margins": 0.4037662148475647, + "rewards/margins_max": 0.8154677152633667, + "rewards/margins_min": -0.11078095436096191, + "rewards/margins_std": 0.4105333387851715, + "rewards/rejected": -0.45650702714920044, + "step": 1440 + }, + { + "epoch": 0.38, + "grad_norm": 8.444139820873001, + "learning_rate": 3.9029024618690785e-07, + "logits/chosen": -2.7985057830810547, + "logits/rejected": -2.7197837829589844, + "logps/chosen": -329.4841613769531, + "logps/rejected": -306.6536865234375, + "loss": 0.6143, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09955091774463654, + "rewards/margins": 0.30447739362716675, + "rewards/margins_max": 0.838217556476593, + "rewards/margins_min": -0.1721421480178833, + "rewards/margins_std": 0.4573248028755188, + "rewards/rejected": -0.4040283262729645, + "step": 1450 + }, + { + "epoch": 0.38, + "grad_norm": 7.577815916699403, + "learning_rate": 3.883935506370605e-07, + "logits/chosen": -2.7470953464508057, + "logits/rejected": -2.69755220413208, + "logps/chosen": -312.834716796875, + "logps/rejected": -289.80242919921875, + "loss": 0.5738, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12933430075645447, + "rewards/margins": 0.36629873514175415, + "rewards/margins_max": 0.8126401901245117, + "rewards/margins_min": -0.21733348071575165, + "rewards/margins_std": 0.466840922832489, + "rewards/rejected": -0.4956330358982086, + "step": 1460 + }, + { + "epoch": 0.38, + "grad_norm": 5.932089177249103, + "learning_rate": 3.864852992655616e-07, + "logits/chosen": -2.822394609451294, + "logits/rejected": -2.797452449798584, + "logps/chosen": -273.2506408691406, + "logps/rejected": -282.4078063964844, + "loss": 0.5938, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21063213050365448, + "rewards/margins": 0.26416927576065063, + "rewards/margins_max": 0.6298851370811462, + "rewards/margins_min": -0.12260746955871582, + "rewards/margins_std": 0.3462390601634979, + "rewards/rejected": -0.4748013913631439, + "step": 1470 + }, + { + "epoch": 0.39, + "grad_norm": 5.654826116747051, + "learning_rate": 3.845656514108515e-07, + "logits/chosen": -2.766648292541504, + "logits/rejected": -2.716283082962036, + "logps/chosen": -267.1504211425781, + "logps/rejected": -311.058837890625, + "loss": 0.6084, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2905041575431824, + "rewards/margins": 0.13299508392810822, + "rewards/margins_max": 0.48174533247947693, + "rewards/margins_min": -0.13761821389198303, + "rewards/margins_std": 0.2668607831001282, + "rewards/rejected": -0.4234992563724518, + "step": 1480 + }, + { + "epoch": 0.39, + "grad_norm": 2.8947622024301345, + "learning_rate": 3.8263476736297375e-07, + "logits/chosen": -2.6635091304779053, + "logits/rejected": -2.67364501953125, + "logps/chosen": -290.59893798828125, + "logps/rejected": -285.15972900390625, + "loss": 0.5999, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1535448133945465, + "rewards/margins": 0.2871786952018738, + "rewards/margins_max": 0.8308157920837402, + "rewards/margins_min": -0.17640534043312073, + "rewards/margins_std": 0.45710426568984985, + "rewards/rejected": -0.4407235085964203, + "step": 1490 + }, + { + "epoch": 0.39, + "grad_norm": 7.535971526738051, + "learning_rate": 3.8069280835019055e-07, + "logits/chosen": -2.8337628841400146, + "logits/rejected": -2.772803544998169, + "logps/chosen": -252.514892578125, + "logps/rejected": -229.8316192626953, + "loss": 0.6185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1284678727388382, + "rewards/margins": 0.1672859489917755, + "rewards/margins_max": 0.5976482629776001, + "rewards/margins_min": -0.21800854802131653, + "rewards/margins_std": 0.3650572597980499, + "rewards/rejected": -0.2957538068294525, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_logits/chosen": -2.774484872817993, + "eval_logits/rejected": -2.742502450942993, + "eval_logps/chosen": -294.7081604003906, + "eval_logps/rejected": -295.71014404296875, + "eval_loss": 0.6162474751472473, + "eval_rewards/accuracies": 0.7063491940498352, + "eval_rewards/chosen": -0.10213224589824677, + "eval_rewards/margins": 0.23347507417201996, + "eval_rewards/margins_max": 0.877864420413971, + "eval_rewards/margins_min": -0.39759406447410583, + "eval_rewards/margins_std": 0.43485260009765625, + "eval_rewards/rejected": -0.3356073796749115, + "eval_runtime": 421.1427, + "eval_samples_per_second": 4.749, + "eval_steps_per_second": 0.15, + "step": 1500 + }, + { + "epoch": 0.4, + "grad_norm": 2.9789724931372166, + "learning_rate": 3.7873993652552073e-07, + "logits/chosen": -2.8239996433258057, + "logits/rejected": -2.7581610679626465, + "logps/chosen": -319.91741943359375, + "logps/rejected": -292.49444580078125, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15871727466583252, + "rewards/margins": 0.172176331281662, + "rewards/margins_max": 0.6388310194015503, + "rewards/margins_min": -0.32647547125816345, + "rewards/margins_std": 0.42792969942092896, + "rewards/rejected": -0.3308935761451721, + "step": 1510 + }, + { + "epoch": 0.4, + "grad_norm": 3.950049851501333, + "learning_rate": 3.767763149531995e-07, + "logits/chosen": -2.786825656890869, + "logits/rejected": -2.7279574871063232, + "logps/chosen": -293.25201416015625, + "logps/rejected": -249.9593963623047, + "loss": 0.614, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0757681354880333, + "rewards/margins": 0.24748125672340393, + "rewards/margins_max": 0.7253497838973999, + "rewards/margins_min": -0.25351768732070923, + "rewards/margins_std": 0.4386812746524811, + "rewards/rejected": -0.3232493996620178, + "step": 1520 + }, + { + "epoch": 0.4, + "grad_norm": 3.752600721701374, + "learning_rate": 3.7480210759506326e-07, + "logits/chosen": -2.725372791290283, + "logits/rejected": -2.7492547035217285, + "logps/chosen": -281.8008117675781, + "logps/rejected": -266.6828308105469, + "loss": 0.598, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04241775721311569, + "rewards/margins": 0.21975021064281464, + "rewards/margins_max": 0.7519022822380066, + "rewards/margins_min": -0.15544210374355316, + "rewards/margins_std": 0.4117540717124939, + "rewards/rejected": -0.26216796040534973, + "step": 1530 + }, + { + "epoch": 0.4, + "grad_norm": 8.733126255909745, + "learning_rate": 3.728174792968582e-07, + "logits/chosen": -2.7404263019561768, + "logits/rejected": -2.7115063667297363, + "logps/chosen": -388.98388671875, + "logps/rejected": -406.96533203125, + "loss": 0.6054, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11297575384378433, + "rewards/margins": 0.17848999798297882, + "rewards/margins_max": 0.5834918022155762, + "rewards/margins_min": -0.2529893219470978, + "rewards/margins_std": 0.37449777126312256, + "rewards/rejected": -0.29146575927734375, + "step": 1540 + }, + { + "epoch": 0.41, + "grad_norm": 5.630213377247779, + "learning_rate": 3.70822595774476e-07, + "logits/chosen": -2.7270076274871826, + "logits/rejected": -2.7804019451141357, + "logps/chosen": -314.51861572265625, + "logps/rejected": -316.8720703125, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07293678820133209, + "rewards/margins": 0.27860817313194275, + "rewards/margins_max": 0.8754470944404602, + "rewards/margins_min": -0.17325973510742188, + "rewards/margins_std": 0.4804156422615051, + "rewards/rejected": -0.35154494643211365, + "step": 1550 + }, + { + "epoch": 0.41, + "grad_norm": 5.609569153042297, + "learning_rate": 3.688176236001168e-07, + "logits/chosen": -2.7386574745178223, + "logits/rejected": -2.7287490367889404, + "logps/chosen": -288.5497741699219, + "logps/rejected": -289.26629638671875, + "loss": 0.6206, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18108539283275604, + "rewards/margins": 0.18185843527317047, + "rewards/margins_max": 0.7833872437477112, + "rewards/margins_min": -0.44136619567871094, + "rewards/margins_std": 0.547116219997406, + "rewards/rejected": -0.3629438281059265, + "step": 1560 + }, + { + "epoch": 0.41, + "grad_norm": 6.646332565578224, + "learning_rate": 3.6680273018838016e-07, + "logits/chosen": -2.7706754207611084, + "logits/rejected": -2.7684202194213867, + "logps/chosen": -378.41668701171875, + "logps/rejected": -303.72967529296875, + "loss": 0.6208, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19015146791934967, + "rewards/margins": 0.25191012024879456, + "rewards/margins_max": 0.7531174421310425, + "rewards/margins_min": -0.26080822944641113, + "rewards/margins_std": 0.44655147194862366, + "rewards/rejected": -0.4420616030693054, + "step": 1570 + }, + { + "epoch": 0.41, + "grad_norm": 5.432585852445357, + "learning_rate": 3.6477808378228596e-07, + "logits/chosen": -2.7753264904022217, + "logits/rejected": -2.8279383182525635, + "logps/chosen": -304.66510009765625, + "logps/rejected": -296.50518798828125, + "loss": 0.6259, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3670610785484314, + "rewards/margins": 0.08154429495334625, + "rewards/margins_max": 0.45335984230041504, + "rewards/margins_min": -0.2849012017250061, + "rewards/margins_std": 0.33231446146965027, + "rewards/rejected": -0.44860538840293884, + "step": 1580 + }, + { + "epoch": 0.42, + "grad_norm": 6.448317836264002, + "learning_rate": 3.6274385343922674e-07, + "logits/chosen": -2.802908420562744, + "logits/rejected": -2.851236581802368, + "logps/chosen": -336.77716064453125, + "logps/rejected": -339.1017761230469, + "loss": 0.6131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28900623321533203, + "rewards/margins": 0.19765332341194153, + "rewards/margins_max": 0.7766116857528687, + "rewards/margins_min": -0.34217551350593567, + "rewards/margins_std": 0.49159669876098633, + "rewards/rejected": -0.48665952682495117, + "step": 1590 + }, + { + "epoch": 0.42, + "grad_norm": 5.088781636832463, + "learning_rate": 3.6070020901685057e-07, + "logits/chosen": -2.699211835861206, + "logits/rejected": -2.7320754528045654, + "logps/chosen": -270.328125, + "logps/rejected": -247.62875366210938, + "loss": 0.6066, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18549546599388123, + "rewards/margins": 0.2464727908372879, + "rewards/margins_max": 0.6963258981704712, + "rewards/margins_min": -0.11806619167327881, + "rewards/margins_std": 0.3634399175643921, + "rewards/rejected": -0.4319682717323303, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_logits/chosen": -2.7689387798309326, + "eval_logits/rejected": -2.7366623878479004, + "eval_logps/chosen": -301.4554443359375, + "eval_logps/rejected": -304.70782470703125, + "eval_loss": 0.6140501499176025, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": -0.16960561275482178, + "eval_rewards/margins": 0.25597894191741943, + "eval_rewards/margins_max": 0.9393980503082275, + "eval_rewards/margins_min": -0.43976885080337524, + "eval_rewards/margins_std": 0.4678189754486084, + "eval_rewards/rejected": -0.4255845844745636, + "eval_runtime": 391.0704, + "eval_samples_per_second": 5.114, + "eval_steps_per_second": 0.161, + "step": 1600 + }, + { + "epoch": 0.42, + "grad_norm": 4.568804421994396, + "learning_rate": 3.5864732115887863e-07, + "logits/chosen": -2.6860828399658203, + "logits/rejected": -2.698554515838623, + "logps/chosen": -285.98992919921875, + "logps/rejected": -287.00689697265625, + "loss": 0.6282, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.10825375467538834, + "rewards/margins": 0.27890637516975403, + "rewards/margins_max": 0.8442277908325195, + "rewards/margins_min": -0.16735580563545227, + "rewards/margins_std": 0.44145527482032776, + "rewards/rejected": -0.38716015219688416, + "step": 1610 + }, + { + "epoch": 0.42, + "grad_norm": 4.893636050791622, + "learning_rate": 3.565853612808562e-07, + "logits/chosen": -2.7702395915985107, + "logits/rejected": -2.7199933528900146, + "logps/chosen": -301.0812072753906, + "logps/rejected": -281.8124084472656, + "loss": 0.6093, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.15281054377555847, + "rewards/margins": 0.22090475261211395, + "rewards/margins_max": 0.7223400473594666, + "rewards/margins_min": -0.25540727376937866, + "rewards/margins_std": 0.43723663687705994, + "rewards/rejected": -0.37371525168418884, + "step": 1620 + }, + { + "epoch": 0.43, + "grad_norm": 5.326166145257277, + "learning_rate": 3.5451450155583984e-07, + "logits/chosen": -2.87768816947937, + "logits/rejected": -2.7891995906829834, + "logps/chosen": -319.95660400390625, + "logps/rejected": -311.11224365234375, + "loss": 0.5885, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2234271764755249, + "rewards/margins": 0.32601475715637207, + "rewards/margins_max": 0.7853580713272095, + "rewards/margins_min": -0.10454492270946503, + "rewards/margins_std": 0.38418078422546387, + "rewards/rejected": -0.549441933631897, + "step": 1630 + }, + { + "epoch": 0.43, + "grad_norm": 4.916572742319347, + "learning_rate": 3.5243491490002055e-07, + "logits/chosen": -2.8464996814727783, + "logits/rejected": -2.820748805999756, + "logps/chosen": -306.3653869628906, + "logps/rejected": -283.6292724609375, + "loss": 0.6174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23571832478046417, + "rewards/margins": 0.2375546395778656, + "rewards/margins_max": 0.8813978433609009, + "rewards/margins_min": -0.3592573404312134, + "rewards/margins_std": 0.5597464442253113, + "rewards/rejected": -0.47327297925949097, + "step": 1640 + }, + { + "epoch": 0.43, + "grad_norm": 6.2942667592711095, + "learning_rate": 3.503467749582857e-07, + "logits/chosen": -2.7997097969055176, + "logits/rejected": -2.696415424346924, + "logps/chosen": -400.3260803222656, + "logps/rejected": -333.4490661621094, + "loss": 0.6013, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09487394243478775, + "rewards/margins": 0.3762526214122772, + "rewards/margins_max": 0.8471622467041016, + "rewards/margins_min": -0.23214320838451385, + "rewards/margins_std": 0.5148395895957947, + "rewards/rejected": -0.4711264967918396, + "step": 1650 + }, + { + "epoch": 0.43, + "grad_norm": 5.904728123206383, + "learning_rate": 3.482502560897194e-07, + "logits/chosen": -2.7432870864868164, + "logits/rejected": -2.7146406173706055, + "logps/chosen": -268.5960998535156, + "logps/rejected": -337.6832275390625, + "loss": 0.6015, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1742277443408966, + "rewards/margins": 0.343140184879303, + "rewards/margins_max": 0.9195839166641235, + "rewards/margins_min": -0.2957271635532379, + "rewards/margins_std": 0.559090256690979, + "rewards/rejected": -0.517367959022522, + "step": 1660 + }, + { + "epoch": 0.44, + "grad_norm": 4.676517489159643, + "learning_rate": 3.4614553335304403e-07, + "logits/chosen": -2.8274528980255127, + "logits/rejected": -2.797477960586548, + "logps/chosen": -272.20355224609375, + "logps/rejected": -274.79754638671875, + "loss": 0.5864, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.09424525499343872, + "rewards/margins": 0.3524290919303894, + "rewards/margins_max": 0.9862232208251953, + "rewards/margins_min": -0.27030831575393677, + "rewards/margins_std": 0.56109619140625, + "rewards/rejected": -0.44667428731918335, + "step": 1670 + }, + { + "epoch": 0.44, + "grad_norm": 8.886704155207795, + "learning_rate": 3.440327824920022e-07, + "logits/chosen": -2.730769157409668, + "logits/rejected": -2.671119451522827, + "logps/chosen": -327.7288513183594, + "logps/rejected": -292.43634033203125, + "loss": 0.6056, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16993173956871033, + "rewards/margins": 0.20374122262001038, + "rewards/margins_max": 0.7561041712760925, + "rewards/margins_min": -0.27063438296318054, + "rewards/margins_std": 0.44191116094589233, + "rewards/rejected": -0.3736729323863983, + "step": 1680 + }, + { + "epoch": 0.44, + "grad_norm": 4.140149589452883, + "learning_rate": 3.4191217992068287e-07, + "logits/chosen": -2.6405558586120605, + "logits/rejected": -2.648857593536377, + "logps/chosen": -262.5699157714844, + "logps/rejected": -292.46710205078125, + "loss": 0.6009, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2204829901456833, + "rewards/margins": 0.15006816387176514, + "rewards/margins_max": 0.6224408149719238, + "rewards/margins_min": -0.24849987030029297, + "rewards/margins_std": 0.40147191286087036, + "rewards/rejected": -0.370551198720932, + "step": 1690 + }, + { + "epoch": 0.44, + "grad_norm": 5.031828284349408, + "learning_rate": 3.3978390270879056e-07, + "logits/chosen": -2.711458444595337, + "logits/rejected": -2.70207142829895, + "logps/chosen": -235.1042938232422, + "logps/rejected": -299.9516296386719, + "loss": 0.6048, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23918704688549042, + "rewards/margins": 0.14093589782714844, + "rewards/margins_max": 0.5437588691711426, + "rewards/margins_min": -0.415393203496933, + "rewards/margins_std": 0.4326951503753662, + "rewards/rejected": -0.38012295961380005, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_logits/chosen": -2.763840913772583, + "eval_logits/rejected": -2.7314860820770264, + "eval_logps/chosen": -296.6920166015625, + "eval_logps/rejected": -299.63214111328125, + "eval_loss": 0.6122699975967407, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": -0.12197133898735046, + "eval_rewards/margins": 0.25285622477531433, + "eval_rewards/margins_max": 0.9410543441772461, + "eval_rewards/margins_min": -0.42348846793174744, + "eval_rewards/margins_std": 0.4655725955963135, + "eval_rewards/rejected": -0.3748275935649872, + "eval_runtime": 406.8599, + "eval_samples_per_second": 4.916, + "eval_steps_per_second": 0.155, + "step": 1700 + }, + { + "epoch": 0.45, + "grad_norm": 5.491564437231135, + "learning_rate": 3.376481285668599e-07, + "logits/chosen": -2.8247580528259277, + "logits/rejected": -2.803906202316284, + "logps/chosen": -268.17291259765625, + "logps/rejected": -270.89752197265625, + "loss": 0.6251, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13041429221630096, + "rewards/margins": 0.15470437705516815, + "rewards/margins_max": 0.6188690066337585, + "rewards/margins_min": -0.32083553075790405, + "rewards/margins_std": 0.4210189878940582, + "rewards/rejected": -0.2851186990737915, + "step": 1710 + }, + { + "epoch": 0.45, + "grad_norm": 5.15693132927551, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -2.7580504417419434, + "logits/rejected": -2.7378337383270264, + "logps/chosen": -272.163330078125, + "logps/rejected": -317.53802490234375, + "loss": 0.6079, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1334218680858612, + "rewards/margins": 0.19889754056930542, + "rewards/margins_max": 0.8179262280464172, + "rewards/margins_min": -0.3128034770488739, + "rewards/margins_std": 0.49218082427978516, + "rewards/rejected": -0.3323194086551666, + "step": 1720 + }, + { + "epoch": 0.45, + "grad_norm": 5.273904420879864, + "learning_rate": 3.33354803450089e-07, + "logits/chosen": -2.8175125122070312, + "logits/rejected": -2.7301135063171387, + "logps/chosen": -293.22607421875, + "logps/rejected": -335.7915954589844, + "loss": 0.5995, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.13616986572742462, + "rewards/margins": 0.20485034584999084, + "rewards/margins_max": 0.8166631460189819, + "rewards/margins_min": -0.3129269778728485, + "rewards/margins_std": 0.4890281558036804, + "rewards/rejected": -0.34102022647857666, + "step": 1730 + }, + { + "epoch": 0.46, + "grad_norm": 5.007519598859198, + "learning_rate": 3.311976109666605e-07, + "logits/chosen": -2.737546443939209, + "logits/rejected": -2.6750173568725586, + "logps/chosen": -339.09405517578125, + "logps/rejected": -295.5556945800781, + "loss": 0.6156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16130061447620392, + "rewards/margins": 0.38655537366867065, + "rewards/margins_max": 0.9086765050888062, + "rewards/margins_min": -0.07011240720748901, + "rewards/margins_std": 0.4501801133155823, + "rewards/rejected": -0.5478559732437134, + "step": 1740 + }, + { + "epoch": 0.46, + "grad_norm": 3.9553334697665714, + "learning_rate": 3.2903363850608317e-07, + "logits/chosen": -2.728745937347412, + "logits/rejected": -2.723440408706665, + "logps/chosen": -292.22637939453125, + "logps/rejected": -322.1505126953125, + "loss": 0.6079, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.18522730469703674, + "rewards/margins": 0.3709561228752136, + "rewards/margins_max": 0.8375279307365417, + "rewards/margins_min": -0.20814350247383118, + "rewards/margins_std": 0.482636034488678, + "rewards/rejected": -0.5561834573745728, + "step": 1750 + }, + { + "epoch": 0.46, + "grad_norm": 5.455115711862542, + "learning_rate": 3.2686306675943477e-07, + "logits/chosen": -2.6396148204803467, + "logits/rejected": -2.6066744327545166, + "logps/chosen": -285.2608947753906, + "logps/rejected": -294.406982421875, + "loss": 0.5924, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.10779108107089996, + "rewards/margins": 0.40661031007766724, + "rewards/margins_max": 0.9361648559570312, + "rewards/margins_min": -0.1543847769498825, + "rewards/margins_std": 0.48866039514541626, + "rewards/rejected": -0.5144013166427612, + "step": 1760 + }, + { + "epoch": 0.46, + "grad_norm": 4.655813721564911, + "learning_rate": 3.2468607696883145e-07, + "logits/chosen": -2.8187601566314697, + "logits/rejected": -2.8162825107574463, + "logps/chosen": -332.2940368652344, + "logps/rejected": -332.9554748535156, + "loss": 0.6213, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.26196056604385376, + "rewards/margins": 0.11825475841760635, + "rewards/margins_max": 0.6965989470481873, + "rewards/margins_min": -0.3889002799987793, + "rewards/margins_std": 0.46893835067749023, + "rewards/rejected": -0.3802153170108795, + "step": 1770 + }, + { + "epoch": 0.47, + "grad_norm": 5.437845854886281, + "learning_rate": 3.2250285091229435e-07, + "logits/chosen": -2.7541983127593994, + "logits/rejected": -2.6833977699279785, + "logps/chosen": -308.24200439453125, + "logps/rejected": -299.4825744628906, + "loss": 0.6049, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2547636032104492, + "rewards/margins": 0.15053074061870575, + "rewards/margins_max": 0.700086772441864, + "rewards/margins_min": -0.45059531927108765, + "rewards/margins_std": 0.5051099061965942, + "rewards/rejected": -0.4052943289279938, + "step": 1780 + }, + { + "epoch": 0.47, + "grad_norm": 4.9280531119048305, + "learning_rate": 3.2031357088857083e-07, + "logits/chosen": -2.780705690383911, + "logits/rejected": -2.7226414680480957, + "logps/chosen": -306.95281982421875, + "logps/rejected": -280.9226379394531, + "loss": 0.6145, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2251468151807785, + "rewards/margins": 0.2609301209449768, + "rewards/margins_max": 0.9194058179855347, + "rewards/margins_min": -0.2734734117984772, + "rewards/margins_std": 0.560528576374054, + "rewards/rejected": -0.4860769808292389, + "step": 1790 + }, + { + "epoch": 0.47, + "grad_norm": 5.744423857667797, + "learning_rate": 3.1811841970191267e-07, + "logits/chosen": -2.806312084197998, + "logits/rejected": -2.697972059249878, + "logps/chosen": -367.9916076660156, + "logps/rejected": -350.6258544921875, + "loss": 0.609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17900875210762024, + "rewards/margins": 0.2853837013244629, + "rewards/margins_max": 0.6966890096664429, + "rewards/margins_min": -0.19959492981433868, + "rewards/margins_std": 0.39137208461761475, + "rewards/rejected": -0.4643924832344055, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_logits/chosen": -2.7574050426483154, + "eval_logits/rejected": -2.725120782852173, + "eval_logps/chosen": -298.734375, + "eval_logps/rejected": -303.3702697753906, + "eval_loss": 0.6089652180671692, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.14239490032196045, + "eval_rewards/margins": 0.2698138952255249, + "eval_rewards/margins_max": 0.9828522205352783, + "eval_rewards/margins_min": -0.44777292013168335, + "eval_rewards/margins_std": 0.4813172221183777, + "eval_rewards/rejected": -0.41220882534980774, + "eval_runtime": 390.7792, + "eval_samples_per_second": 5.118, + "eval_steps_per_second": 0.161, + "step": 1800 + }, + { + "epoch": 0.47, + "grad_norm": 8.622644075034442, + "learning_rate": 3.1591758064681257e-07, + "logits/chosen": -2.761805772781372, + "logits/rejected": -2.762845516204834, + "logps/chosen": -340.2552185058594, + "logps/rejected": -334.52593994140625, + "loss": 0.5934, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07181647419929504, + "rewards/margins": 0.35393524169921875, + "rewards/margins_max": 0.7183262705802917, + "rewards/margins_min": -0.011866944842040539, + "rewards/margins_std": 0.32488614320755005, + "rewards/rejected": -0.4257517457008362, + "step": 1810 + }, + { + "epoch": 0.48, + "grad_norm": 7.716713825957816, + "learning_rate": 3.13711237492698e-07, + "logits/chosen": -2.7310614585876465, + "logits/rejected": -2.6444265842437744, + "logps/chosen": -308.83258056640625, + "logps/rejected": -343.875, + "loss": 0.6304, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17491993308067322, + "rewards/margins": 0.3376871943473816, + "rewards/margins_max": 0.8672064542770386, + "rewards/margins_min": -0.16395722329616547, + "rewards/margins_std": 0.46048134565353394, + "rewards/rejected": -0.5126070380210876, + "step": 1820 + }, + { + "epoch": 0.48, + "grad_norm": 3.5403450837556436, + "learning_rate": 3.1149957446858767e-07, + "logits/chosen": -2.741170883178711, + "logits/rejected": -2.7057688236236572, + "logps/chosen": -329.8138122558594, + "logps/rejected": -458.5897521972656, + "loss": 0.6098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.11135464906692505, + "rewards/margins": 0.3423333168029785, + "rewards/margins_max": 0.9713395833969116, + "rewards/margins_min": -0.2491106539964676, + "rewards/margins_std": 0.5647034645080566, + "rewards/rejected": -0.45368799567222595, + "step": 1830 + }, + { + "epoch": 0.48, + "grad_norm": 8.11344487076226, + "learning_rate": 3.0928277624770736e-07, + "logits/chosen": -2.7511956691741943, + "logits/rejected": -2.7057204246520996, + "logps/chosen": -262.40826416015625, + "logps/rejected": -288.30206298828125, + "loss": 0.6039, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.15768931806087494, + "rewards/margins": 0.31308621168136597, + "rewards/margins_max": 0.7508654594421387, + "rewards/margins_min": -0.12808196246623993, + "rewards/margins_std": 0.40058478713035583, + "rewards/rejected": -0.4707755148410797, + "step": 1840 + }, + { + "epoch": 0.48, + "grad_norm": 6.553648001229806, + "learning_rate": 3.0706102793207073e-07, + "logits/chosen": -2.7565999031066895, + "logits/rejected": -2.673367977142334, + "logps/chosen": -259.0464782714844, + "logps/rejected": -258.9010925292969, + "loss": 0.5926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21598121523857117, + "rewards/margins": 0.20572714507579803, + "rewards/margins_max": 0.6502859592437744, + "rewards/margins_min": -0.3407582938671112, + "rewards/margins_std": 0.4396124482154846, + "rewards/rejected": -0.4217084050178528, + "step": 1850 + }, + { + "epoch": 0.49, + "grad_norm": 5.540610114699936, + "learning_rate": 3.048345150370226e-07, + "logits/chosen": -2.648470163345337, + "logits/rejected": -2.614975690841675, + "logps/chosen": -294.60150146484375, + "logps/rejected": -310.3082580566406, + "loss": 0.5765, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09407683461904526, + "rewards/margins": 0.3583822250366211, + "rewards/margins_max": 0.882233738899231, + "rewards/margins_min": -0.09761301428079605, + "rewards/margins_std": 0.43587416410446167, + "rewards/rejected": -0.45245909690856934, + "step": 1860 + }, + { + "epoch": 0.49, + "grad_norm": 6.10185446144468, + "learning_rate": 3.0260342347574913e-07, + "logits/chosen": -2.65407133102417, + "logits/rejected": -2.6808485984802246, + "logps/chosen": -293.0909729003906, + "logps/rejected": -344.00164794921875, + "loss": 0.554, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1745704561471939, + "rewards/margins": 0.42989611625671387, + "rewards/margins_max": 0.989809513092041, + "rewards/margins_min": -0.06674328446388245, + "rewards/margins_std": 0.4606723189353943, + "rewards/rejected": -0.6044665575027466, + "step": 1870 + }, + { + "epoch": 0.49, + "grad_norm": 6.880650088624609, + "learning_rate": 3.0036793954375357e-07, + "logits/chosen": -2.737017869949341, + "logits/rejected": -2.6729750633239746, + "logps/chosen": -291.97418212890625, + "logps/rejected": -299.54339599609375, + "loss": 0.593, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1755559891462326, + "rewards/margins": 0.23570725321769714, + "rewards/margins_max": 0.739096999168396, + "rewards/margins_min": -0.23385903239250183, + "rewards/margins_std": 0.4358901083469391, + "rewards/rejected": -0.41126322746276855, + "step": 1880 + }, + { + "epoch": 0.49, + "grad_norm": 7.655343138256646, + "learning_rate": 2.9812824990330085e-07, + "logits/chosen": -2.7572693824768066, + "logits/rejected": -2.755995273590088, + "logps/chosen": -341.8429260253906, + "logps/rejected": -404.138671875, + "loss": 0.5979, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4077052175998688, + "rewards/margins": 0.1405329704284668, + "rewards/margins_max": 0.5893855690956116, + "rewards/margins_min": -0.42796167731285095, + "rewards/margins_std": 0.46961015462875366, + "rewards/rejected": -0.5482381582260132, + "step": 1890 + }, + { + "epoch": 0.5, + "grad_norm": 4.420635143922601, + "learning_rate": 2.958845415678316e-07, + "logits/chosen": -2.7897894382476807, + "logits/rejected": -2.7294561862945557, + "logps/chosen": -307.3680419921875, + "logps/rejected": -283.0511779785156, + "loss": 0.5909, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2575165033340454, + "rewards/margins": 0.2712162137031555, + "rewards/margins_max": 0.813406765460968, + "rewards/margins_min": -0.255156934261322, + "rewards/margins_std": 0.4845468997955322, + "rewards/rejected": -0.5287327170372009, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -2.7507169246673584, + "eval_logits/rejected": -2.718559980392456, + "eval_logps/chosen": -308.2264099121094, + "eval_logps/rejected": -314.54217529296875, + "eval_loss": 0.6062055230140686, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.2373151183128357, + "eval_rewards/margins": 0.28661274909973145, + "eval_rewards/margins_max": 1.0475393533706665, + "eval_rewards/margins_min": -0.48601973056793213, + "eval_rewards/margins_std": 0.5181077122688293, + "eval_rewards/rejected": -0.5239278674125671, + "eval_runtime": 409.4751, + "eval_samples_per_second": 4.884, + "eval_steps_per_second": 0.154, + "step": 1900 + }, + { + "epoch": 0.5, + "grad_norm": 3.9890244825429013, + "learning_rate": 2.936370018863459e-07, + "logits/chosen": -2.761441946029663, + "logits/rejected": -2.7136168479919434, + "logps/chosen": -271.4809875488281, + "logps/rejected": -282.964111328125, + "loss": 0.583, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.13114547729492188, + "rewards/margins": 0.3116704225540161, + "rewards/margins_max": 0.7335649728775024, + "rewards/margins_min": -0.053193189203739166, + "rewards/margins_std": 0.3390631675720215, + "rewards/rejected": -0.4428158700466156, + "step": 1910 + }, + { + "epoch": 0.5, + "grad_norm": 5.605318690826813, + "learning_rate": 2.913858185277605e-07, + "logits/chosen": -2.7156014442443848, + "logits/rejected": -2.7027459144592285, + "logps/chosen": -292.0843200683594, + "logps/rejected": -255.1691436767578, + "loss": 0.6048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1965518593788147, + "rewards/margins": 0.2787795066833496, + "rewards/margins_max": 0.865504264831543, + "rewards/margins_min": -0.3373439610004425, + "rewards/margins_std": 0.5349926352500916, + "rewards/rejected": -0.4753313660621643, + "step": 1920 + }, + { + "epoch": 0.51, + "grad_norm": 6.007647409142628, + "learning_rate": 2.89131179465238e-07, + "logits/chosen": -2.803149461746216, + "logits/rejected": -2.6981801986694336, + "logps/chosen": -376.99267578125, + "logps/rejected": -297.24884033203125, + "loss": 0.6187, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21281206607818604, + "rewards/margins": 0.3007994294166565, + "rewards/margins_max": 0.9262828826904297, + "rewards/margins_min": -0.13446655869483948, + "rewards/margins_std": 0.47311100363731384, + "rewards/rejected": -0.5136114358901978, + "step": 1930 + }, + { + "epoch": 0.51, + "grad_norm": 6.445902553483407, + "learning_rate": 2.8687327296049125e-07, + "logits/chosen": -2.7458324432373047, + "logits/rejected": -2.754026412963867, + "logps/chosen": -289.41650390625, + "logps/rejected": -309.2579650878906, + "loss": 0.6108, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.21557316184043884, + "rewards/margins": 0.3037116825580597, + "rewards/margins_max": 0.7462460994720459, + "rewards/margins_min": -0.16689108312129974, + "rewards/margins_std": 0.40999382734298706, + "rewards/rejected": -0.5192848443984985, + "step": 1940 + }, + { + "epoch": 0.51, + "grad_norm": 6.706934844331165, + "learning_rate": 2.846122875480637e-07, + "logits/chosen": -2.7715697288513184, + "logits/rejected": -2.7745981216430664, + "logps/chosen": -308.7226867675781, + "logps/rejected": -326.38604736328125, + "loss": 0.5951, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1484897881746292, + "rewards/margins": 0.3337706923484802, + "rewards/margins_max": 0.705023467540741, + "rewards/margins_min": -0.05409906059503555, + "rewards/margins_std": 0.33222028613090515, + "rewards/rejected": -0.48226046562194824, + "step": 1950 + }, + { + "epoch": 0.51, + "grad_norm": 5.015767033072181, + "learning_rate": 2.8234841201958647e-07, + "logits/chosen": -2.826998472213745, + "logits/rejected": -2.795149803161621, + "logps/chosen": -305.9923400878906, + "logps/rejected": -302.576171875, + "loss": 0.6083, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.09377000480890274, + "rewards/margins": 0.38532763719558716, + "rewards/margins_max": 0.9582147598266602, + "rewards/margins_min": -0.22882786393165588, + "rewards/margins_std": 0.5411325097084045, + "rewards/rejected": -0.4790976643562317, + "step": 1960 + }, + { + "epoch": 0.52, + "grad_norm": 5.118873097263564, + "learning_rate": 2.800818354080148e-07, + "logits/chosen": -2.8394317626953125, + "logits/rejected": -2.8299717903137207, + "logps/chosen": -306.41729736328125, + "logps/rejected": -302.0159606933594, + "loss": 0.6328, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1702316254377365, + "rewards/margins": 0.26725292205810547, + "rewards/margins_max": 0.8620370030403137, + "rewards/margins_min": -0.25390398502349854, + "rewards/margins_std": 0.5177478790283203, + "rewards/rejected": -0.4374845623970032, + "step": 1970 + }, + { + "epoch": 0.52, + "grad_norm": 6.413805269944784, + "learning_rate": 2.778127469718435e-07, + "logits/chosen": -2.772723436355591, + "logits/rejected": -2.729105234146118, + "logps/chosen": -222.18228149414062, + "logps/rejected": -256.9471740722656, + "loss": 0.6147, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11506159603595734, + "rewards/margins": 0.25875991582870483, + "rewards/margins_max": 0.8990314602851868, + "rewards/margins_min": -0.19294139742851257, + "rewards/margins_std": 0.49516162276268005, + "rewards/rejected": -0.37382152676582336, + "step": 1980 + }, + { + "epoch": 0.52, + "grad_norm": 7.4854457782108845, + "learning_rate": 2.755413361793039e-07, + "logits/chosen": -2.7678234577178955, + "logits/rejected": -2.642357349395752, + "logps/chosen": -281.39862060546875, + "logps/rejected": -295.8036193847656, + "loss": 0.6084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.16515931487083435, + "rewards/margins": 0.30200842022895813, + "rewards/margins_max": 0.8513243794441223, + "rewards/margins_min": -0.22569763660430908, + "rewards/margins_std": 0.4902336001396179, + "rewards/rejected": -0.4671677052974701, + "step": 1990 + }, + { + "epoch": 0.52, + "grad_norm": 8.9041595682685, + "learning_rate": 2.7326779269254356e-07, + "logits/chosen": -2.7329964637756348, + "logits/rejected": -2.709454298019409, + "logps/chosen": -250.37020874023438, + "logps/rejected": -257.0063781738281, + "loss": 0.6011, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14309653639793396, + "rewards/margins": 0.17090503871440887, + "rewards/margins_max": 0.6541327834129333, + "rewards/margins_min": -0.30241432785987854, + "rewards/margins_std": 0.4215312600135803, + "rewards/rejected": -0.3140016198158264, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -2.7424988746643066, + "eval_logits/rejected": -2.710038900375366, + "eval_logps/chosen": -297.37890625, + "eval_logps/rejected": -303.2408752441406, + "eval_loss": 0.6047787070274353, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.12884004414081573, + "eval_rewards/margins": 0.2820751965045929, + "eval_rewards/margins_max": 1.0037145614624023, + "eval_rewards/margins_min": -0.4627196788787842, + "eval_rewards/margins_std": 0.4932412803173065, + "eval_rewards/rejected": -0.41091519594192505, + "eval_runtime": 390.4179, + "eval_samples_per_second": 5.123, + "eval_steps_per_second": 0.161, + "step": 2000 + }, + { + "epoch": 0.53, + "grad_norm": 3.4107680847900896, + "learning_rate": 2.709923063517895e-07, + "logits/chosen": -2.676152229309082, + "logits/rejected": -2.7016549110412598, + "logps/chosen": -267.16064453125, + "logps/rejected": -264.1330871582031, + "loss": 0.5852, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.09011226147413254, + "rewards/margins": 0.26958930492401123, + "rewards/margins_max": 0.745033860206604, + "rewards/margins_min": -0.10975122451782227, + "rewards/margins_std": 0.3834989666938782, + "rewards/rejected": -0.35970157384872437, + "step": 2010 + }, + { + "epoch": 0.53, + "grad_norm": 6.011061009644631, + "learning_rate": 2.68715067159496e-07, + "logits/chosen": -2.8204386234283447, + "logits/rejected": -2.760305166244507, + "logps/chosen": -312.7381896972656, + "logps/rejected": -281.01544189453125, + "loss": 0.5914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04576291888952255, + "rewards/margins": 0.37289947271347046, + "rewards/margins_max": 1.0170303583145142, + "rewards/margins_min": -0.21299608051776886, + "rewards/margins_std": 0.5470786690711975, + "rewards/rejected": -0.4186623692512512, + "step": 2020 + }, + { + "epoch": 0.53, + "grad_norm": 6.308727383453058, + "learning_rate": 2.664362652644806e-07, + "logits/chosen": -2.786015748977661, + "logits/rejected": -2.7799692153930664, + "logps/chosen": -300.67376708984375, + "logps/rejected": -306.27313232421875, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14222674071788788, + "rewards/margins": 0.2917899489402771, + "rewards/margins_max": 0.9741031527519226, + "rewards/margins_min": -0.30088645219802856, + "rewards/margins_std": 0.5560330152511597, + "rewards/rejected": -0.43401670455932617, + "step": 2030 + }, + { + "epoch": 0.53, + "grad_norm": 4.378928028770551, + "learning_rate": 2.6415609094604555e-07, + "logits/chosen": -2.5638880729675293, + "logits/rejected": -2.631761074066162, + "logps/chosen": -324.99932861328125, + "logps/rejected": -266.020751953125, + "loss": 0.5797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25322872400283813, + "rewards/margins": 0.2859429121017456, + "rewards/margins_max": 0.8347060084342957, + "rewards/margins_min": -0.2059619426727295, + "rewards/margins_std": 0.4723834991455078, + "rewards/rejected": -0.5391716361045837, + "step": 2040 + }, + { + "epoch": 0.54, + "grad_norm": 6.663836387027101, + "learning_rate": 2.618747345980904e-07, + "logits/chosen": -2.7345101833343506, + "logits/rejected": -2.7416293621063232, + "logps/chosen": -299.2042541503906, + "logps/rejected": -308.9120788574219, + "loss": 0.6035, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.18221673369407654, + "rewards/margins": 0.3735789656639099, + "rewards/margins_max": 0.9540404081344604, + "rewards/margins_min": -0.22572532296180725, + "rewards/margins_std": 0.5106708407402039, + "rewards/rejected": -0.5557957291603088, + "step": 2050 + }, + { + "epoch": 0.54, + "grad_norm": 4.438945695950961, + "learning_rate": 2.595923867132136e-07, + "logits/chosen": -2.75480055809021, + "logits/rejected": -2.73787522315979, + "logps/chosen": -322.1286926269531, + "logps/rejected": -299.0472412109375, + "loss": 0.6146, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1146298497915268, + "rewards/margins": 0.2715553343296051, + "rewards/margins_max": 0.7664214968681335, + "rewards/margins_min": -0.11051158607006073, + "rewards/margins_std": 0.39920473098754883, + "rewards/rejected": -0.3861851692199707, + "step": 2060 + }, + { + "epoch": 0.54, + "grad_norm": 6.8042378851890675, + "learning_rate": 2.5730923786680667e-07, + "logits/chosen": -2.717268228530884, + "logits/rejected": -2.714512825012207, + "logps/chosen": -244.3038787841797, + "logps/rejected": -317.8219299316406, + "loss": 0.5929, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.14704938232898712, + "rewards/margins": 0.23312871158123016, + "rewards/margins_max": 0.8925619125366211, + "rewards/margins_min": -0.340410053730011, + "rewards/margins_std": 0.5334269404411316, + "rewards/rejected": -0.3801780641078949, + "step": 2070 + }, + { + "epoch": 0.54, + "grad_norm": 4.69667396546648, + "learning_rate": 2.5502547870114135e-07, + "logits/chosen": -2.769766330718994, + "logits/rejected": -2.721893787384033, + "logps/chosen": -243.6653289794922, + "logps/rejected": -278.74114990234375, + "loss": 0.5894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20172667503356934, + "rewards/margins": 0.21734285354614258, + "rewards/margins_max": 0.7849537134170532, + "rewards/margins_min": -0.4703293442726135, + "rewards/margins_std": 0.5456236600875854, + "rewards/rejected": -0.4190695881843567, + "step": 2080 + }, + { + "epoch": 0.55, + "grad_norm": 5.401439842131361, + "learning_rate": 2.527412999094506e-07, + "logits/chosen": -2.6740739345550537, + "logits/rejected": -2.667999029159546, + "logps/chosen": -292.1925964355469, + "logps/rejected": -348.8424377441406, + "loss": 0.5848, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22697997093200684, + "rewards/margins": 0.27222976088523865, + "rewards/margins_max": 0.8208610415458679, + "rewards/margins_min": -0.22808341681957245, + "rewards/margins_std": 0.47663742303848267, + "rewards/rejected": -0.4992097020149231, + "step": 2090 + }, + { + "epoch": 0.55, + "grad_norm": 4.414933110353597, + "learning_rate": 2.5045689222000636e-07, + "logits/chosen": -2.7417569160461426, + "logits/rejected": -2.7582132816314697, + "logps/chosen": -268.8819885253906, + "logps/rejected": -253.3248748779297, + "loss": 0.6047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.18695008754730225, + "rewards/margins": 0.2377978265285492, + "rewards/margins_max": 0.7447687983512878, + "rewards/margins_min": -0.19871556758880615, + "rewards/margins_std": 0.4170859456062317, + "rewards/rejected": -0.42474788427352905, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_logits/chosen": -2.7447781562805176, + "eval_logits/rejected": -2.712313175201416, + "eval_logps/chosen": -299.3512268066406, + "eval_logps/rejected": -306.3504943847656, + "eval_loss": 0.6031152606010437, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.1485634595155716, + "eval_rewards/margins": 0.2934476435184479, + "eval_rewards/margins_max": 1.0559176206588745, + "eval_rewards/margins_min": -0.47924622893333435, + "eval_rewards/margins_std": 0.5192877650260925, + "eval_rewards/rejected": -0.4420110881328583, + "eval_runtime": 405.7774, + "eval_samples_per_second": 4.929, + "eval_steps_per_second": 0.155, + "step": 2100 + }, + { + "epoch": 0.55, + "grad_norm": 5.495335074457826, + "learning_rate": 2.481724463801933e-07, + "logits/chosen": -2.695875644683838, + "logits/rejected": -2.639840602874756, + "logps/chosen": -283.6210021972656, + "logps/rejected": -263.7871398925781, + "loss": 0.5956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.16770590841770172, + "rewards/margins": 0.25329190492630005, + "rewards/margins_max": 0.702299952507019, + "rewards/margins_min": -0.11821697652339935, + "rewards/margins_std": 0.37073150277137756, + "rewards/rejected": -0.4209977686405182, + "step": 2110 + }, + { + "epoch": 0.55, + "grad_norm": 5.080562065448951, + "learning_rate": 2.4588815314058154e-07, + "logits/chosen": -2.681507110595703, + "logits/rejected": -2.665618896484375, + "logps/chosen": -248.3491668701172, + "logps/rejected": -298.63800048828125, + "loss": 0.5961, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.055473219603300095, + "rewards/margins": 0.34807246923446655, + "rewards/margins_max": 0.892570972442627, + "rewards/margins_min": -0.011921525001525879, + "rewards/margins_std": 0.40655845403671265, + "rewards/rejected": -0.40354570746421814, + "step": 2120 + }, + { + "epoch": 0.56, + "grad_norm": 5.62089829837403, + "learning_rate": 2.4360420323899917e-07, + "logits/chosen": -2.739928722381592, + "logits/rejected": -2.736339807510376, + "logps/chosen": -222.92544555664062, + "logps/rejected": -286.5703430175781, + "loss": 0.6397, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14108645915985107, + "rewards/margins": 0.1377021074295044, + "rewards/margins_max": 0.680718719959259, + "rewards/margins_min": -0.3931158483028412, + "rewards/margins_std": 0.47464513778686523, + "rewards/rejected": -0.27878856658935547, + "step": 2130 + }, + { + "epoch": 0.56, + "grad_norm": 4.880741357616809, + "learning_rate": 2.4132078738460583e-07, + "logits/chosen": -2.751692533493042, + "logits/rejected": -2.760918378829956, + "logps/chosen": -305.81072998046875, + "logps/rejected": -319.619140625, + "loss": 0.601, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15225140750408173, + "rewards/margins": 0.2561028003692627, + "rewards/margins_max": 0.8320122957229614, + "rewards/margins_min": -0.2644018232822418, + "rewards/margins_std": 0.501566469669342, + "rewards/rejected": -0.4083542227745056, + "step": 2140 + }, + { + "epoch": 0.56, + "grad_norm": 7.645497771200616, + "learning_rate": 2.390380962419682e-07, + "logits/chosen": -2.774425745010376, + "logits/rejected": -2.744809627532959, + "logps/chosen": -305.55316162109375, + "logps/rejected": -260.71697998046875, + "loss": 0.5967, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2034352719783783, + "rewards/margins": 0.2711416780948639, + "rewards/margins_max": 0.7908986806869507, + "rewards/margins_min": -0.18977515399456024, + "rewards/margins_std": 0.44558167457580566, + "rewards/rejected": -0.4745768904685974, + "step": 2150 + }, + { + "epoch": 0.57, + "grad_norm": 16.498459252755808, + "learning_rate": 2.3675632041513977e-07, + "logits/chosen": -2.6031267642974854, + "logits/rejected": -2.616091012954712, + "logps/chosen": -252.59066772460938, + "logps/rejected": -290.72540283203125, + "loss": 0.6145, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3081524968147278, + "rewards/margins": 0.24541540443897247, + "rewards/margins_max": 0.7045064568519592, + "rewards/margins_min": -0.13730569183826447, + "rewards/margins_std": 0.37777501344680786, + "rewards/rejected": -0.5535678863525391, + "step": 2160 + }, + { + "epoch": 0.57, + "grad_norm": 9.656564059289016, + "learning_rate": 2.344756504317453e-07, + "logits/chosen": -2.569385051727295, + "logits/rejected": -2.5988173484802246, + "logps/chosen": -282.082275390625, + "logps/rejected": -303.0838317871094, + "loss": 0.5888, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.30863112211227417, + "rewards/margins": 0.3043331503868103, + "rewards/margins_max": 0.8873383402824402, + "rewards/margins_min": -0.20051121711730957, + "rewards/margins_std": 0.49640101194381714, + "rewards/rejected": -0.6129643321037292, + "step": 2170 + }, + { + "epoch": 0.57, + "grad_norm": 7.1397969039113125, + "learning_rate": 2.3219627672707237e-07, + "logits/chosen": -2.8291194438934326, + "logits/rejected": -2.7747745513916016, + "logps/chosen": -366.5031433105469, + "logps/rejected": -310.3370056152344, + "loss": 0.5995, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2576615512371063, + "rewards/margins": 0.301690012216568, + "rewards/margins_max": 0.920368492603302, + "rewards/margins_min": -0.3287815451622009, + "rewards/margins_std": 0.5595054626464844, + "rewards/rejected": -0.5593516230583191, + "step": 2180 + }, + { + "epoch": 0.57, + "grad_norm": 7.233466402344567, + "learning_rate": 2.2991838962816918e-07, + "logits/chosen": -2.8080074787139893, + "logits/rejected": -2.8282382488250732, + "logps/chosen": -320.45587158203125, + "logps/rejected": -393.7651062011719, + "loss": 0.5856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3831197917461395, + "rewards/margins": 0.24666862189769745, + "rewards/margins_max": 0.7738012075424194, + "rewards/margins_min": -0.326736718416214, + "rewards/margins_std": 0.49492964148521423, + "rewards/rejected": -0.6297883987426758, + "step": 2190 + }, + { + "epoch": 0.58, + "grad_norm": 6.9941910755475885, + "learning_rate": 2.2764217933795297e-07, + "logits/chosen": -2.8005177974700928, + "logits/rejected": -2.727771520614624, + "logps/chosen": -385.36669921875, + "logps/rejected": -349.25970458984375, + "loss": 0.592, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.20833441615104675, + "rewards/margins": 0.48180752992630005, + "rewards/margins_max": 1.033427119255066, + "rewards/margins_min": -0.07096156477928162, + "rewards/margins_std": 0.5294944643974304, + "rewards/rejected": -0.6901419758796692, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_logits/chosen": -2.742300033569336, + "eval_logits/rejected": -2.7100446224212646, + "eval_logps/chosen": -310.72698974609375, + "eval_logps/rejected": -319.92169189453125, + "eval_loss": 0.6010785102844238, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.2623210847377777, + "eval_rewards/margins": 0.31540217995643616, + "eval_rewards/margins_max": 1.1326465606689453, + "eval_rewards/margins_min": -0.5284470319747925, + "eval_rewards/margins_std": 0.5637532472610474, + "eval_rewards/rejected": -0.5777232646942139, + "eval_runtime": 390.6609, + "eval_samples_per_second": 5.12, + "eval_steps_per_second": 0.161, + "step": 2200 + }, + { + "epoch": 0.58, + "grad_norm": 3.815021744012424, + "learning_rate": 2.253678359193278e-07, + "logits/chosen": -2.781501293182373, + "logits/rejected": -2.7869057655334473, + "logps/chosen": -293.3313903808594, + "logps/rejected": -307.9054870605469, + "loss": 0.5916, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.27142855525016785, + "rewards/margins": 0.2509201169013977, + "rewards/margins_max": 0.8431049585342407, + "rewards/margins_min": -0.3988971412181854, + "rewards/margins_std": 0.5531180500984192, + "rewards/rejected": -0.5223486423492432, + "step": 2210 + }, + { + "epoch": 0.58, + "grad_norm": 8.472123153468619, + "learning_rate": 2.230955492793149e-07, + "logits/chosen": -2.768160343170166, + "logits/rejected": -2.764039993286133, + "logps/chosen": -345.43194580078125, + "logps/rejected": -330.89459228515625, + "loss": 0.6261, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23594987392425537, + "rewards/margins": 0.4209807515144348, + "rewards/margins_max": 1.1127345561981201, + "rewards/margins_min": -0.18445590138435364, + "rewards/margins_std": 0.5799250602722168, + "rewards/rejected": -0.6569305658340454, + "step": 2220 + }, + { + "epoch": 0.58, + "grad_norm": 6.327922751879938, + "learning_rate": 2.2082550915319468e-07, + "logits/chosen": -2.720182418823242, + "logits/rejected": -2.720376491546631, + "logps/chosen": -299.11566162109375, + "logps/rejected": -338.6654052734375, + "loss": 0.615, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.39112502336502075, + "rewards/margins": 0.25776180624961853, + "rewards/margins_max": 0.8736675977706909, + "rewards/margins_min": -0.26238715648651123, + "rewards/margins_std": 0.5073090195655823, + "rewards/rejected": -0.6488867998123169, + "step": 2230 + }, + { + "epoch": 0.59, + "grad_norm": 4.047242136911866, + "learning_rate": 2.1855790508866433e-07, + "logits/chosen": -2.7828967571258545, + "logits/rejected": -2.7402825355529785, + "logps/chosen": -312.4578552246094, + "logps/rejected": -286.78448486328125, + "loss": 0.6413, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.24146990478038788, + "rewards/margins": 0.26030173897743225, + "rewards/margins_max": 0.82671058177948, + "rewards/margins_min": -0.3633379340171814, + "rewards/margins_std": 0.5442849397659302, + "rewards/rejected": -0.5017716288566589, + "step": 2240 + }, + { + "epoch": 0.59, + "grad_norm": 7.592939168709741, + "learning_rate": 2.162929264300107e-07, + "logits/chosen": -2.7478950023651123, + "logits/rejected": -2.7511210441589355, + "logps/chosen": -305.8150939941406, + "logps/rejected": -294.320068359375, + "loss": 0.6097, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.18209628760814667, + "rewards/margins": 0.33885785937309265, + "rewards/margins_max": 1.0088571310043335, + "rewards/margins_min": -0.20651838183403015, + "rewards/margins_std": 0.5271486043930054, + "rewards/rejected": -0.5209541916847229, + "step": 2250 + }, + { + "epoch": 0.59, + "grad_norm": 8.284664500965304, + "learning_rate": 2.1403076230230005e-07, + "logits/chosen": -2.709615707397461, + "logits/rejected": -2.6713905334472656, + "logps/chosen": -266.75872802734375, + "logps/rejected": -284.7406921386719, + "loss": 0.5901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2541000247001648, + "rewards/margins": 0.2771957516670227, + "rewards/margins_max": 0.8805142641067505, + "rewards/margins_min": -0.2726813852787018, + "rewards/margins_std": 0.5316154956817627, + "rewards/rejected": -0.5312957763671875, + "step": 2260 + }, + { + "epoch": 0.59, + "grad_norm": 5.341177694919087, + "learning_rate": 2.1177160159558596e-07, + "logits/chosen": -2.7336771488189697, + "logits/rejected": -2.6524386405944824, + "logps/chosen": -280.09869384765625, + "logps/rejected": -315.43670654296875, + "loss": 0.5579, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1666630059480667, + "rewards/margins": 0.4242420792579651, + "rewards/margins_max": 1.089989185333252, + "rewards/margins_min": -0.3301016688346863, + "rewards/margins_std": 0.6329237222671509, + "rewards/rejected": -0.5909050703048706, + "step": 2270 + }, + { + "epoch": 0.6, + "grad_norm": 5.742594386625843, + "learning_rate": 2.0951563294913734e-07, + "logits/chosen": -2.7769455909729004, + "logits/rejected": -2.7597556114196777, + "logps/chosen": -276.4035949707031, + "logps/rejected": -317.9994201660156, + "loss": 0.5836, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1678825169801712, + "rewards/margins": 0.3626595437526703, + "rewards/margins_max": 0.891548752784729, + "rewards/margins_min": -0.2097504585981369, + "rewards/margins_std": 0.5090166330337524, + "rewards/rejected": -0.5305420160293579, + "step": 2280 + }, + { + "epoch": 0.6, + "grad_norm": 10.228176796767716, + "learning_rate": 2.072630447356869e-07, + "logits/chosen": -2.7072107791900635, + "logits/rejected": -2.624112367630005, + "logps/chosen": -239.66162109375, + "logps/rejected": -268.73602294921875, + "loss": 0.5972, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.12984153628349304, + "rewards/margins": 0.3632405400276184, + "rewards/margins_max": 1.0205637216567993, + "rewards/margins_min": -0.2810240685939789, + "rewards/margins_std": 0.5711122751235962, + "rewards/rejected": -0.49308210611343384, + "step": 2290 + }, + { + "epoch": 0.6, + "grad_norm": 4.543538025918651, + "learning_rate": 2.0501402504570232e-07, + "logits/chosen": -2.6988930702209473, + "logits/rejected": -2.71895694732666, + "logps/chosen": -287.51800537109375, + "logps/rejected": -297.48321533203125, + "loss": 0.6285, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3156401216983795, + "rewards/margins": 0.1996089518070221, + "rewards/margins_max": 0.7438315153121948, + "rewards/margins_min": -0.3300771117210388, + "rewards/margins_std": 0.48831623792648315, + "rewards/rejected": -0.5152490139007568, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_logits/chosen": -2.7370452880859375, + "eval_logits/rejected": -2.7043983936309814, + "eval_logps/chosen": -315.4819030761719, + "eval_logps/rejected": -324.216552734375, + "eval_loss": 0.6022123098373413, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.30987000465393066, + "eval_rewards/margins": 0.31080153584480286, + "eval_rewards/margins_max": 1.1253905296325684, + "eval_rewards/margins_min": -0.5181033611297607, + "eval_rewards/margins_std": 0.5569573640823364, + "eval_rewards/rejected": -0.6206715106964111, + "eval_runtime": 398.6658, + "eval_samples_per_second": 5.017, + "eval_steps_per_second": 0.158, + "step": 2300 + }, + { + "epoch": 0.6, + "grad_norm": 7.280211280439723, + "learning_rate": 2.027687616716804e-07, + "logits/chosen": -2.794407844543457, + "logits/rejected": -2.721747398376465, + "logps/chosen": -358.080810546875, + "logps/rejected": -302.70623779296875, + "loss": 0.5931, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.30784812569618225, + "rewards/margins": 0.2711806893348694, + "rewards/margins_max": 0.8400300741195679, + "rewards/margins_min": -0.17863331735134125, + "rewards/margins_std": 0.45989537239074707, + "rewards/rejected": -0.5790287852287292, + "step": 2310 + }, + { + "epoch": 0.61, + "grad_norm": 4.934579879053907, + "learning_rate": 2.005274420924668e-07, + "logits/chosen": -2.719719886779785, + "logits/rejected": -2.680213451385498, + "logps/chosen": -318.9151611328125, + "logps/rejected": -324.38983154296875, + "loss": 0.6018, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3573724329471588, + "rewards/margins": 0.18577995896339417, + "rewards/margins_max": 0.9159051775932312, + "rewards/margins_min": -0.559878945350647, + "rewards/margins_std": 0.6602797508239746, + "rewards/rejected": -0.5431524515151978, + "step": 2320 + }, + { + "epoch": 0.61, + "grad_norm": 5.784526824195216, + "learning_rate": 1.9829025345760121e-07, + "logits/chosen": -2.7709848880767822, + "logits/rejected": -2.760662794113159, + "logps/chosen": -327.946044921875, + "logps/rejected": -397.20880126953125, + "loss": 0.5799, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2780410945415497, + "rewards/margins": 0.5168048143386841, + "rewards/margins_max": 1.1355929374694824, + "rewards/margins_min": -0.1293756067752838, + "rewards/margins_std": 0.568372368812561, + "rewards/rejected": -0.7948459386825562, + "step": 2330 + }, + { + "epoch": 0.61, + "grad_norm": 6.174694643694625, + "learning_rate": 1.960573825716911e-07, + "logits/chosen": -2.7704901695251465, + "logits/rejected": -2.7210559844970703, + "logps/chosen": -376.3761291503906, + "logps/rejected": -384.32733154296875, + "loss": 0.6184, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.4266100823879242, + "rewards/margins": 0.11548767238855362, + "rewards/margins_max": 0.7346783876419067, + "rewards/margins_min": -0.5440298914909363, + "rewards/margins_std": 0.5593437552452087, + "rewards/rejected": -0.5420977473258972, + "step": 2340 + }, + { + "epoch": 0.62, + "grad_norm": 7.8165116653311415, + "learning_rate": 1.9382901587881273e-07, + "logits/chosen": -2.7726988792419434, + "logits/rejected": -2.774543046951294, + "logps/chosen": -339.03997802734375, + "logps/rejected": -307.2474365234375, + "loss": 0.6356, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.280036985874176, + "rewards/margins": 0.23525002598762512, + "rewards/margins_max": 0.7496523857116699, + "rewards/margins_min": -0.22758913040161133, + "rewards/margins_std": 0.42877086997032166, + "rewards/rejected": -0.5152870416641235, + "step": 2350 + }, + { + "epoch": 0.62, + "grad_norm": 6.393257475645912, + "learning_rate": 1.9160533944694364e-07, + "logits/chosen": -2.7256267070770264, + "logits/rejected": -2.6889102458953857, + "logps/chosen": -307.99530029296875, + "logps/rejected": -262.7240905761719, + "loss": 0.6092, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21818378567695618, + "rewards/margins": 0.26890823245048523, + "rewards/margins_max": 0.7914302945137024, + "rewards/margins_min": -0.32647308707237244, + "rewards/margins_std": 0.510516881942749, + "rewards/rejected": -0.4870920181274414, + "step": 2360 + }, + { + "epoch": 0.62, + "grad_norm": 7.60773652470279, + "learning_rate": 1.8938653895242602e-07, + "logits/chosen": -2.7873361110687256, + "logits/rejected": -2.751237154006958, + "logps/chosen": -296.0665588378906, + "logps/rejected": -290.09271240234375, + "loss": 0.66, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.39214691519737244, + "rewards/margins": 0.14480635523796082, + "rewards/margins_max": 0.6504937410354614, + "rewards/margins_min": -0.41229182481765747, + "rewards/margins_std": 0.4770260453224182, + "rewards/rejected": -0.536953330039978, + "step": 2370 + }, + { + "epoch": 0.62, + "grad_norm": 8.841298764478749, + "learning_rate": 1.8717279966446264e-07, + "logits/chosen": -2.7938389778137207, + "logits/rejected": -2.759669065475464, + "logps/chosen": -269.1448059082031, + "logps/rejected": -253.77734375, + "loss": 0.6142, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23903334140777588, + "rewards/margins": 0.1704338788986206, + "rewards/margins_max": 0.7059664726257324, + "rewards/margins_min": -0.3458612263202667, + "rewards/margins_std": 0.47689881920814514, + "rewards/rejected": -0.4094672203063965, + "step": 2380 + }, + { + "epoch": 0.63, + "grad_norm": 7.28763519737504, + "learning_rate": 1.8496430642964694e-07, + "logits/chosen": -2.7890567779541016, + "logits/rejected": -2.7893338203430176, + "logps/chosen": -298.6877746582031, + "logps/rejected": -329.82672119140625, + "loss": 0.5784, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15925773978233337, + "rewards/margins": 0.22392554581165314, + "rewards/margins_max": 0.8634682893753052, + "rewards/margins_min": -0.5708165764808655, + "rewards/margins_std": 0.6212704181671143, + "rewards/rejected": -0.3831833302974701, + "step": 2390 + }, + { + "epoch": 0.63, + "grad_norm": 5.574207629357775, + "learning_rate": 1.8276124365652855e-07, + "logits/chosen": -2.723452091217041, + "logits/rejected": -2.6575615406036377, + "logps/chosen": -285.4541015625, + "logps/rejected": -343.5442810058594, + "loss": 0.6258, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.09620613604784012, + "rewards/margins": 0.3225805163383484, + "rewards/margins_max": 0.8125874400138855, + "rewards/margins_min": -0.13880617916584015, + "rewards/margins_std": 0.42147356271743774, + "rewards/rejected": -0.4187866747379303, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_logits/chosen": -2.7290594577789307, + "eval_logits/rejected": -2.696014404296875, + "eval_logps/chosen": -300.9169616699219, + "eval_logps/rejected": -309.51654052734375, + "eval_loss": 0.6005234718322754, + "eval_rewards/accuracies": 0.7301587462425232, + "eval_rewards/chosen": -0.1642206460237503, + "eval_rewards/margins": 0.30945146083831787, + "eval_rewards/margins_max": 1.0716499090194702, + "eval_rewards/margins_min": -0.49571293592453003, + "eval_rewards/margins_std": 0.5258515477180481, + "eval_rewards/rejected": -0.473672091960907, + "eval_runtime": 390.3517, + "eval_samples_per_second": 5.124, + "eval_steps_per_second": 0.161, + "step": 2400 + }, + { + "epoch": 0.63, + "grad_norm": 6.123233774061434, + "learning_rate": 1.805637953002149e-07, + "logits/chosen": -2.72236704826355, + "logits/rejected": -2.691912889480591, + "logps/chosen": -318.01507568359375, + "logps/rejected": -294.35308837890625, + "loss": 0.5791, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1225033774971962, + "rewards/margins": 0.25491413474082947, + "rewards/margins_max": 0.8299559354782104, + "rewards/margins_min": -0.2943783402442932, + "rewards/margins_std": 0.4992886483669281, + "rewards/rejected": -0.37741750478744507, + "step": 2410 + }, + { + "epoch": 0.63, + "grad_norm": 4.08229453958013, + "learning_rate": 1.7837214484701153e-07, + "logits/chosen": -2.760404586791992, + "logits/rejected": -2.7140660285949707, + "logps/chosen": -374.02447509765625, + "logps/rejected": -341.2372131347656, + "loss": 0.603, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.15294203162193298, + "rewards/margins": 0.283006489276886, + "rewards/margins_max": 0.7429834604263306, + "rewards/margins_min": -0.3383616507053375, + "rewards/margins_std": 0.4728143811225891, + "rewards/rejected": -0.43594852089881897, + "step": 2420 + }, + { + "epoch": 0.64, + "grad_norm": 4.496714558234412, + "learning_rate": 1.761864752991004e-07, + "logits/chosen": -2.7352404594421387, + "logits/rejected": -2.724327802658081, + "logps/chosen": -312.3106994628906, + "logps/rejected": -312.7837829589844, + "loss": 0.6099, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1995433121919632, + "rewards/margins": 0.30550646781921387, + "rewards/margins_max": 0.8856220245361328, + "rewards/margins_min": -0.23194687068462372, + "rewards/margins_std": 0.5116561651229858, + "rewards/rejected": -0.5050498247146606, + "step": 2430 + }, + { + "epoch": 0.64, + "grad_norm": 8.728898680828857, + "learning_rate": 1.7400696915925995e-07, + "logits/chosen": -2.6603329181671143, + "logits/rejected": -2.6154816150665283, + "logps/chosen": -318.36175537109375, + "logps/rejected": -303.51300048828125, + "loss": 0.602, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.253580778837204, + "rewards/margins": 0.20639896392822266, + "rewards/margins_max": 0.7484241724014282, + "rewards/margins_min": -0.2710598111152649, + "rewards/margins_std": 0.4437841773033142, + "rewards/rejected": -0.459979772567749, + "step": 2440 + }, + { + "epoch": 0.64, + "grad_norm": 4.4710742801373655, + "learning_rate": 1.718338084156254e-07, + "logits/chosen": -2.7727761268615723, + "logits/rejected": -2.7827248573303223, + "logps/chosen": -329.8681640625, + "logps/rejected": -396.83331298828125, + "loss": 0.6253, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.34143736958503723, + "rewards/margins": 0.16818933188915253, + "rewards/margins_max": 0.7694539427757263, + "rewards/margins_min": -0.3513100743293762, + "rewards/margins_std": 0.5040308237075806, + "rewards/rejected": -0.5096266865730286, + "step": 2450 + }, + { + "epoch": 0.64, + "grad_norm": 6.010707837153065, + "learning_rate": 1.696671745264937e-07, + "logits/chosen": -2.682893753051758, + "logits/rejected": -2.692431926727295, + "logps/chosen": -282.19866943359375, + "logps/rejected": -294.518798828125, + "loss": 0.6192, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3741334080696106, + "rewards/margins": 0.18039581179618835, + "rewards/margins_max": 0.7664681077003479, + "rewards/margins_min": -0.6025083065032959, + "rewards/margins_std": 0.6326398849487305, + "rewards/rejected": -0.5545291900634766, + "step": 2460 + }, + { + "epoch": 0.65, + "grad_norm": 4.501296174613143, + "learning_rate": 1.67507248405171e-07, + "logits/chosen": -2.7249624729156494, + "logits/rejected": -2.742990016937256, + "logps/chosen": -362.339599609375, + "logps/rejected": -354.41064453125, + "loss": 0.5996, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.21265415847301483, + "rewards/margins": 0.36456912755966187, + "rewards/margins_max": 1.0345338582992554, + "rewards/margins_min": -0.20259599387645721, + "rewards/margins_std": 0.5646041631698608, + "rewards/rejected": -0.5772233605384827, + "step": 2470 + }, + { + "epoch": 0.65, + "grad_norm": 8.360873934682914, + "learning_rate": 1.6535421040486683e-07, + "logits/chosen": -2.84075665473938, + "logits/rejected": -2.8196959495544434, + "logps/chosen": -326.2269592285156, + "logps/rejected": -296.1560363769531, + "loss": 0.5812, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22578425705432892, + "rewards/margins": 0.24927327036857605, + "rewards/margins_max": 0.8202487826347351, + "rewards/margins_min": -0.4271158277988434, + "rewards/margins_std": 0.5501688718795776, + "rewards/rejected": -0.4750575125217438, + "step": 2480 + }, + { + "epoch": 0.65, + "grad_norm": 6.635598396275965, + "learning_rate": 1.6320824030363456e-07, + "logits/chosen": -2.589611530303955, + "logits/rejected": -2.5518672466278076, + "logps/chosen": -342.490966796875, + "logps/rejected": -330.5583190917969, + "loss": 0.5453, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1011362075805664, + "rewards/margins": 0.5414234399795532, + "rewards/margins_max": 1.1826380491256714, + "rewards/margins_min": -0.1259969025850296, + "rewards/margins_std": 0.5601600408554077, + "rewards/rejected": -0.6425596475601196, + "step": 2490 + }, + { + "epoch": 0.65, + "grad_norm": 7.640751812278419, + "learning_rate": 1.6106951728936024e-07, + "logits/chosen": -2.6982195377349854, + "logits/rejected": -2.713944435119629, + "logps/chosen": -288.63330078125, + "logps/rejected": -347.19512939453125, + "loss": 0.5855, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3290866017341614, + "rewards/margins": 0.297451913356781, + "rewards/margins_max": 0.9438754916191101, + "rewards/margins_min": -0.3197605311870575, + "rewards/margins_std": 0.5608260631561279, + "rewards/rejected": -0.6265385150909424, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_logits/chosen": -2.725276231765747, + "eval_logits/rejected": -2.6924445629119873, + "eval_logps/chosen": -305.94183349609375, + "eval_logps/rejected": -315.961669921875, + "eval_loss": 0.5980704426765442, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.21446941792964935, + "eval_rewards/margins": 0.3236537575721741, + "eval_rewards/margins_max": 1.1337225437164307, + "eval_rewards/margins_min": -0.523493230342865, + "eval_rewards/margins_std": 0.5568127632141113, + "eval_rewards/rejected": -0.5381232500076294, + "eval_runtime": 391.0371, + "eval_samples_per_second": 5.115, + "eval_steps_per_second": 0.161, + "step": 2500 + }, + { + "epoch": 0.66, + "grad_norm": 4.652537664540794, + "learning_rate": 1.5893821994479994e-07, + "logits/chosen": -2.62013578414917, + "logits/rejected": -2.569736957550049, + "logps/chosen": -278.39813232421875, + "logps/rejected": -299.04522705078125, + "loss": 0.5681, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.21412518620491028, + "rewards/margins": 0.36885911226272583, + "rewards/margins_max": 0.8680575489997864, + "rewards/margins_min": -0.19697673618793488, + "rewards/margins_std": 0.4716603755950928, + "rewards/rejected": -0.5829842686653137, + "step": 2510 + }, + { + "epoch": 0.66, + "grad_norm": 8.333603307801306, + "learning_rate": 1.5681452623266867e-07, + "logits/chosen": -2.752260446548462, + "logits/rejected": -2.708174705505371, + "logps/chosen": -275.9288330078125, + "logps/rejected": -301.96002197265625, + "loss": 0.5841, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14842669665813446, + "rewards/margins": 0.5135299563407898, + "rewards/margins_max": 1.0399694442749023, + "rewards/margins_min": 0.005812531802803278, + "rewards/margins_std": 0.4823017120361328, + "rewards/rejected": -0.6619566679000854, + "step": 2520 + }, + { + "epoch": 0.66, + "grad_norm": 5.050271040186428, + "learning_rate": 1.546986134807801e-07, + "logits/chosen": -2.7156319618225098, + "logits/rejected": -2.7484681606292725, + "logps/chosen": -309.27435302734375, + "logps/rejected": -361.76947021484375, + "loss": 0.5811, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2536551356315613, + "rewards/margins": 0.4039143919944763, + "rewards/margins_max": 1.1457735300064087, + "rewards/margins_min": -0.2755311131477356, + "rewards/margins_std": 0.6136313676834106, + "rewards/rejected": -0.6575695276260376, + "step": 2530 + }, + { + "epoch": 0.66, + "grad_norm": 7.760709684502372, + "learning_rate": 1.5259065836724034e-07, + "logits/chosen": -2.666165351867676, + "logits/rejected": -2.6718852519989014, + "logps/chosen": -316.0393371582031, + "logps/rejected": -376.5950012207031, + "loss": 0.6157, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.30989760160446167, + "rewards/margins": 0.21957314014434814, + "rewards/margins_max": 0.7271320819854736, + "rewards/margins_min": -0.35212546586990356, + "rewards/margins_std": 0.4791449010372162, + "rewards/rejected": -0.5294707417488098, + "step": 2540 + }, + { + "epoch": 0.67, + "grad_norm": 5.962533959505952, + "learning_rate": 1.5049083690569454e-07, + "logits/chosen": -2.731842517852783, + "logits/rejected": -2.7230496406555176, + "logps/chosen": -291.4844970703125, + "logps/rejected": -330.8465576171875, + "loss": 0.5686, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.1689339280128479, + "rewards/margins": 0.3390408456325531, + "rewards/margins_max": 0.8470038175582886, + "rewards/margins_min": -0.13603731989860535, + "rewards/margins_std": 0.44846653938293457, + "rewards/rejected": -0.5079747438430786, + "step": 2550 + }, + { + "epoch": 0.67, + "grad_norm": 7.823626598208363, + "learning_rate": 1.4839932443063056e-07, + "logits/chosen": -2.6088666915893555, + "logits/rejected": -2.566678524017334, + "logps/chosen": -216.99929809570312, + "logps/rejected": -242.52877807617188, + "loss": 0.5759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16937707364559174, + "rewards/margins": 0.3563659191131592, + "rewards/margins_max": 0.920008659362793, + "rewards/margins_min": -0.17174455523490906, + "rewards/margins_std": 0.49960392713546753, + "rewards/rejected": -0.5257430076599121, + "step": 2560 + }, + { + "epoch": 0.67, + "grad_norm": 8.981375603272072, + "learning_rate": 1.46316295582738e-07, + "logits/chosen": -2.7724390029907227, + "logits/rejected": -2.743107318878174, + "logps/chosen": -301.3759460449219, + "logps/rejected": -332.14349365234375, + "loss": 0.5702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.15795984864234924, + "rewards/margins": 0.3856964707374573, + "rewards/margins_max": 0.9565441012382507, + "rewards/margins_min": -0.1837233006954193, + "rewards/margins_std": 0.5072107315063477, + "rewards/rejected": -0.5436563491821289, + "step": 2570 + }, + { + "epoch": 0.68, + "grad_norm": 10.222784989755842, + "learning_rate": 1.4424192429432655e-07, + "logits/chosen": -2.7305712699890137, + "logits/rejected": -2.7124509811401367, + "logps/chosen": -286.04913330078125, + "logps/rejected": -337.43353271484375, + "loss": 0.5701, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17382414638996124, + "rewards/margins": 0.4440658986568451, + "rewards/margins_max": 0.9443173408508301, + "rewards/margins_min": -0.116560198366642, + "rewards/margins_std": 0.4755174517631531, + "rewards/rejected": -0.6178901195526123, + "step": 2580 + }, + { + "epoch": 0.68, + "grad_norm": 5.963802823896382, + "learning_rate": 1.4217638377480158e-07, + "logits/chosen": -2.6959242820739746, + "logits/rejected": -2.666779041290283, + "logps/chosen": -324.8652038574219, + "logps/rejected": -300.7135314941406, + "loss": 0.5801, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2510308027267456, + "rewards/margins": 0.3596518933773041, + "rewards/margins_max": 0.8655659556388855, + "rewards/margins_min": -0.2538580596446991, + "rewards/margins_std": 0.5226938128471375, + "rewards/rejected": -0.6106826663017273, + "step": 2590 + }, + { + "epoch": 0.68, + "grad_norm": 4.698417868032439, + "learning_rate": 1.401198464962021e-07, + "logits/chosen": -2.6267876625061035, + "logits/rejected": -2.6794533729553223, + "logps/chosen": -258.80279541015625, + "logps/rejected": -309.348876953125, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2940548062324524, + "rewards/margins": 0.1740504801273346, + "rewards/margins_max": 0.6885368227958679, + "rewards/margins_min": -0.28622788190841675, + "rewards/margins_std": 0.4522281587123871, + "rewards/rejected": -0.4681052565574646, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_logits/chosen": -2.718674659729004, + "eval_logits/rejected": -2.6859068870544434, + "eval_logps/chosen": -308.6578674316406, + "eval_logps/rejected": -319.3885192871094, + "eval_loss": 0.5969670414924622, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.24162966012954712, + "eval_rewards/margins": 0.33076155185699463, + "eval_rewards/margins_max": 1.1752591133117676, + "eval_rewards/margins_min": -0.5363927483558655, + "eval_rewards/margins_std": 0.5756190419197083, + "eval_rewards/rejected": -0.5723912119865417, + "eval_runtime": 417.9329, + "eval_samples_per_second": 4.785, + "eval_steps_per_second": 0.151, + "step": 2600 + }, + { + "epoch": 0.68, + "grad_norm": 5.297060082726977, + "learning_rate": 1.3807248417879894e-07, + "logits/chosen": -2.7008800506591797, + "logits/rejected": -2.648380756378174, + "logps/chosen": -278.50616455078125, + "logps/rejected": -240.8035125732422, + "loss": 0.6026, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1719607710838318, + "rewards/margins": 0.35253241658210754, + "rewards/margins_max": 0.977734386920929, + "rewards/margins_min": -0.265885591506958, + "rewards/margins_std": 0.5538880228996277, + "rewards/rejected": -0.5244931578636169, + "step": 2610 + }, + { + "epoch": 0.69, + "grad_norm": 7.539478666055644, + "learning_rate": 1.3603446777675665e-07, + "logits/chosen": -2.6150364875793457, + "logits/rejected": -2.619123935699463, + "logps/chosen": -277.9858093261719, + "logps/rejected": -317.23883056640625, + "loss": 0.6161, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2710026502609253, + "rewards/margins": 0.278492271900177, + "rewards/margins_max": 0.905040442943573, + "rewards/margins_min": -0.2542542815208435, + "rewards/margins_std": 0.5333602428436279, + "rewards/rejected": -0.5494948625564575, + "step": 2620 + }, + { + "epoch": 0.69, + "grad_norm": 5.592595668583226, + "learning_rate": 1.3400596746385814e-07, + "logits/chosen": -2.707918643951416, + "logits/rejected": -2.7343533039093018, + "logps/chosen": -289.794189453125, + "logps/rejected": -306.36846923828125, + "loss": 0.5776, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3244673013687134, + "rewards/margins": 0.25360649824142456, + "rewards/margins_max": 0.830174446105957, + "rewards/margins_min": -0.2910890579223633, + "rewards/margins_std": 0.5009501576423645, + "rewards/rejected": -0.5780739188194275, + "step": 2630 + }, + { + "epoch": 0.69, + "grad_norm": 11.394775639091556, + "learning_rate": 1.3198715261929586e-07, + "logits/chosen": -2.7883169651031494, + "logits/rejected": -2.800386428833008, + "logps/chosen": -345.37933349609375, + "logps/rejected": -344.2216796875, + "loss": 0.5827, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2949279248714447, + "rewards/margins": 0.2135731428861618, + "rewards/margins_max": 0.7514768838882446, + "rewards/margins_min": -0.40849557518959045, + "rewards/margins_std": 0.5237277150154114, + "rewards/rejected": -0.5085010528564453, + "step": 2640 + }, + { + "epoch": 0.69, + "grad_norm": 5.5940964234558805, + "learning_rate": 1.299781918135282e-07, + "logits/chosen": -2.781946897506714, + "logits/rejected": -2.7119410037994385, + "logps/chosen": -254.15719604492188, + "logps/rejected": -278.99871826171875, + "loss": 0.5644, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.09353184700012207, + "rewards/margins": 0.5513363480567932, + "rewards/margins_max": 1.0748264789581299, + "rewards/margins_min": 0.008779443800449371, + "rewards/margins_std": 0.5036975145339966, + "rewards/rejected": -0.6448681950569153, + "step": 2650 + }, + { + "epoch": 0.7, + "grad_norm": 5.300278551950972, + "learning_rate": 1.279792527942045e-07, + "logits/chosen": -2.7470862865448, + "logits/rejected": -2.7562458515167236, + "logps/chosen": -350.07025146484375, + "logps/rejected": -305.99859619140625, + "loss": 0.5941, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.19993433356285095, + "rewards/margins": 0.34072503447532654, + "rewards/margins_max": 0.9613269567489624, + "rewards/margins_min": -0.17679674923419952, + "rewards/margins_std": 0.5153160691261292, + "rewards/rejected": -0.5406594276428223, + "step": 2660 + }, + { + "epoch": 0.7, + "grad_norm": 5.787468941854434, + "learning_rate": 1.259905024721576e-07, + "logits/chosen": -2.7918848991394043, + "logits/rejected": -2.715488910675049, + "logps/chosen": -320.099853515625, + "logps/rejected": -292.3807373046875, + "loss": 0.5872, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3071865439414978, + "rewards/margins": 0.2795398235321045, + "rewards/margins_max": 0.9265762567520142, + "rewards/margins_min": -0.2768297493457794, + "rewards/margins_std": 0.5300559997558594, + "rewards/rejected": -0.5867263078689575, + "step": 2670 + }, + { + "epoch": 0.7, + "grad_norm": 5.946451606853156, + "learning_rate": 1.2401210690746703e-07, + "logits/chosen": -2.7749440670013428, + "logits/rejected": -2.747144937515259, + "logps/chosen": -378.4425048828125, + "logps/rejected": -445.81134033203125, + "loss": 0.6004, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.29681429266929626, + "rewards/margins": 0.388523131608963, + "rewards/margins_max": 1.2408030033111572, + "rewards/margins_min": -0.4802486300468445, + "rewards/margins_std": 0.7567611932754517, + "rewards/rejected": -0.6853374242782593, + "step": 2680 + }, + { + "epoch": 0.7, + "grad_norm": 7.152892545336937, + "learning_rate": 1.2204423129559305e-07, + "logits/chosen": -2.609441041946411, + "logits/rejected": -2.593297243118286, + "logps/chosen": -225.9248809814453, + "logps/rejected": -297.6705322265625, + "loss": 0.5911, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26983457803726196, + "rewards/margins": 0.39215803146362305, + "rewards/margins_max": 1.1170307397842407, + "rewards/margins_min": -0.33665913343429565, + "rewards/margins_std": 0.6525898575782776, + "rewards/rejected": -0.6619926691055298, + "step": 2690 + }, + { + "epoch": 0.71, + "grad_norm": 8.525524635616273, + "learning_rate": 1.2008703995358299e-07, + "logits/chosen": -2.6563806533813477, + "logits/rejected": -2.6100051403045654, + "logps/chosen": -281.10260009765625, + "logps/rejected": -289.41412353515625, + "loss": 0.6013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.34073784947395325, + "rewards/margins": 0.198334738612175, + "rewards/margins_max": 0.7223410606384277, + "rewards/margins_min": -0.3622845709323883, + "rewards/margins_std": 0.49760836362838745, + "rewards/rejected": -0.5390725135803223, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_logits/chosen": -2.717006206512451, + "eval_logits/rejected": -2.6845290660858154, + "eval_logps/chosen": -308.9903259277344, + "eval_logps/rejected": -320.0433044433594, + "eval_loss": 0.5961260795593262, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.2449544370174408, + "eval_rewards/margins": 0.3339848220348358, + "eval_rewards/margins_max": 1.192410945892334, + "eval_rewards/margins_min": -0.5460187196731567, + "eval_rewards/margins_std": 0.582958459854126, + "eval_rewards/rejected": -0.5789392590522766, + "eval_runtime": 390.1468, + "eval_samples_per_second": 5.126, + "eval_steps_per_second": 0.161, + "step": 2700 + }, + { + "epoch": 0.71, + "grad_norm": 10.65186888724301, + "learning_rate": 1.1814069630635068e-07, + "logits/chosen": -2.8419432640075684, + "logits/rejected": -2.800723075866699, + "logps/chosen": -309.03997802734375, + "logps/rejected": -330.2586975097656, + "loss": 0.6119, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2224183827638626, + "rewards/margins": 0.3151085078716278, + "rewards/margins_max": 0.9302600026130676, + "rewards/margins_min": -0.2143104076385498, + "rewards/margins_std": 0.5069065093994141, + "rewards/rejected": -0.5375269055366516, + "step": 2710 + }, + { + "epoch": 0.71, + "grad_norm": 6.844292606111765, + "learning_rate": 1.1620536287303051e-07, + "logits/chosen": -2.7169649600982666, + "logits/rejected": -2.668610095977783, + "logps/chosen": -248.28549194335938, + "logps/rejected": -288.42724609375, + "loss": 0.588, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27430906891822815, + "rewards/margins": 0.2432362586259842, + "rewards/margins_max": 0.7510233521461487, + "rewards/margins_min": -0.330477237701416, + "rewards/margins_std": 0.4812886118888855, + "rewards/rejected": -0.5175453424453735, + "step": 2720 + }, + { + "epoch": 0.71, + "grad_norm": 12.991934988902129, + "learning_rate": 1.1428120125340716e-07, + "logits/chosen": -2.624035358428955, + "logits/rejected": -2.647022008895874, + "logps/chosen": -352.6827087402344, + "logps/rejected": -348.1404724121094, + "loss": 0.6139, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.17648416757583618, + "rewards/margins": 0.23796233534812927, + "rewards/margins_max": 1.0430485010147095, + "rewards/margins_min": -0.5048769116401672, + "rewards/margins_std": 0.6909404397010803, + "rewards/rejected": -0.41444650292396545, + "step": 2730 + }, + { + "epoch": 0.72, + "grad_norm": 8.927018001025035, + "learning_rate": 1.123683721144223e-07, + "logits/chosen": -2.7429280281066895, + "logits/rejected": -2.761756658554077, + "logps/chosen": -248.6212921142578, + "logps/rejected": -317.7116394042969, + "loss": 0.6115, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24835343658924103, + "rewards/margins": 0.30689066648483276, + "rewards/margins_max": 1.0265804529190063, + "rewards/margins_min": -0.2528151869773865, + "rewards/margins_std": 0.5803526639938354, + "rewards/rejected": -0.5552440881729126, + "step": 2740 + }, + { + "epoch": 0.72, + "grad_norm": 4.601988793722117, + "learning_rate": 1.1046703517675845e-07, + "logits/chosen": -2.7221856117248535, + "logits/rejected": -2.7139735221862793, + "logps/chosen": -293.9876708984375, + "logps/rejected": -287.68341064453125, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18598800897598267, + "rewards/margins": 0.3705143332481384, + "rewards/margins_max": 0.9818255305290222, + "rewards/margins_min": -0.25609129667282104, + "rewards/margins_std": 0.5622454285621643, + "rewards/rejected": -0.5565023422241211, + "step": 2750 + }, + { + "epoch": 0.72, + "grad_norm": 5.20953249541653, + "learning_rate": 1.085773492015028e-07, + "logits/chosen": -2.7503018379211426, + "logits/rejected": -2.672482967376709, + "logps/chosen": -393.72222900390625, + "logps/rejected": -360.5134582519531, + "loss": 0.5704, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.23460975289344788, + "rewards/margins": 0.4232187867164612, + "rewards/margins_max": 1.0697230100631714, + "rewards/margins_min": -0.23043997585773468, + "rewards/margins_std": 0.585904061794281, + "rewards/rejected": -0.6578284502029419, + "step": 2760 + }, + { + "epoch": 0.72, + "grad_norm": 6.0035154852096735, + "learning_rate": 1.0669947197689033e-07, + "logits/chosen": -2.815408229827881, + "logits/rejected": -2.7703001499176025, + "logps/chosen": -337.4261169433594, + "logps/rejected": -345.00726318359375, + "loss": 0.5908, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3059375584125519, + "rewards/margins": 0.45007753372192383, + "rewards/margins_max": 1.2291467189788818, + "rewards/margins_min": -0.27866214513778687, + "rewards/margins_std": 0.6930111050605774, + "rewards/rejected": -0.7560150623321533, + "step": 2770 + }, + { + "epoch": 0.73, + "grad_norm": 5.691431719658178, + "learning_rate": 1.048335603051291e-07, + "logits/chosen": -2.6867356300354004, + "logits/rejected": -2.642735481262207, + "logps/chosen": -370.48822021484375, + "logps/rejected": -292.55181884765625, + "loss": 0.6039, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2714003324508667, + "rewards/margins": 0.2806457281112671, + "rewards/margins_max": 0.8909802436828613, + "rewards/margins_min": -0.2825238108634949, + "rewards/margins_std": 0.545309841632843, + "rewards/rejected": -0.5520460605621338, + "step": 2780 + }, + { + "epoch": 0.73, + "grad_norm": 5.364906237121527, + "learning_rate": 1.0297976998930663e-07, + "logits/chosen": -2.8341474533081055, + "logits/rejected": -2.7649788856506348, + "logps/chosen": -384.8304138183594, + "logps/rejected": -344.5574035644531, + "loss": 0.5965, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.23515446484088898, + "rewards/margins": 0.33084505796432495, + "rewards/margins_max": 0.9416014552116394, + "rewards/margins_min": -0.19013361632823944, + "rewards/margins_std": 0.5239640474319458, + "rewards/rejected": -0.5659995079040527, + "step": 2790 + }, + { + "epoch": 0.73, + "grad_norm": 5.435577403458107, + "learning_rate": 1.0113825582038077e-07, + "logits/chosen": -2.7341742515563965, + "logits/rejected": -2.7210845947265625, + "logps/chosen": -270.52496337890625, + "logps/rejected": -281.5241394042969, + "loss": 0.6233, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.29497817158699036, + "rewards/margins": 0.19213594496250153, + "rewards/margins_max": 0.9739352464675903, + "rewards/margins_min": -0.48513802886009216, + "rewards/margins_std": 0.6357932090759277, + "rewards/rejected": -0.4871141016483307, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_logits/chosen": -2.717419147491455, + "eval_logits/rejected": -2.685154438018799, + "eval_logps/chosen": -308.7550354003906, + "eval_logps/rejected": -320.0176696777344, + "eval_loss": 0.5953718423843384, + "eval_rewards/accuracies": 0.7301587462425232, + "eval_rewards/chosen": -0.24260133504867554, + "eval_rewards/margins": 0.336081862449646, + "eval_rewards/margins_max": 1.2014644145965576, + "eval_rewards/margins_min": -0.5490952134132385, + "eval_rewards/margins_std": 0.5881600975990295, + "eval_rewards/rejected": -0.5786832571029663, + "eval_runtime": 390.11, + "eval_samples_per_second": 5.127, + "eval_steps_per_second": 0.161, + "step": 2800 + }, + { + "epoch": 0.74, + "grad_norm": 6.834511570030363, + "learning_rate": 9.930917156425475e-08, + "logits/chosen": -2.650859832763672, + "logits/rejected": -2.6968650817871094, + "logps/chosen": -262.65093994140625, + "logps/rejected": -269.3209533691406, + "loss": 0.5742, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21521730720996857, + "rewards/margins": 0.3685315251350403, + "rewards/margins_max": 0.9042445421218872, + "rewards/margins_min": -0.19705520570278168, + "rewards/margins_std": 0.5029382705688477, + "rewards/rejected": -0.5837489366531372, + "step": 2810 + }, + { + "epoch": 0.74, + "grad_norm": 6.801551069744732, + "learning_rate": 9.749266994893754e-08, + "logits/chosen": -2.642615795135498, + "logits/rejected": -2.620197296142578, + "logps/chosen": -261.7588806152344, + "logps/rejected": -321.48809814453125, + "loss": 0.5723, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.204143688082695, + "rewards/margins": 0.32847902178764343, + "rewards/margins_max": 0.9341036677360535, + "rewards/margins_min": -0.2671884596347809, + "rewards/margins_std": 0.5338630676269531, + "rewards/rejected": -0.5326226949691772, + "step": 2820 + }, + { + "epoch": 0.74, + "grad_norm": 6.487749145662984, + "learning_rate": 9.568890265179128e-08, + "logits/chosen": -2.6506879329681396, + "logits/rejected": -2.6231513023376465, + "logps/chosen": -321.41497802734375, + "logps/rejected": -485.8807678222656, + "loss": 0.5902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3649356961250305, + "rewards/margins": 0.24924424290657043, + "rewards/margins_max": 0.7932173013687134, + "rewards/margins_min": -0.32447996735572815, + "rewards/margins_std": 0.4989289343357086, + "rewards/rejected": -0.6141799688339233, + "step": 2830 + }, + { + "epoch": 0.74, + "grad_norm": 8.654816767065862, + "learning_rate": 9.389802028686616e-08, + "logits/chosen": -2.635981559753418, + "logits/rejected": -2.6222739219665527, + "logps/chosen": -235.28378295898438, + "logps/rejected": -236.2818603515625, + "loss": 0.5719, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.26213663816452026, + "rewards/margins": 0.251953661441803, + "rewards/margins_max": 0.8292096257209778, + "rewards/margins_min": -0.3137845993041992, + "rewards/margins_std": 0.49395066499710083, + "rewards/rejected": -0.5140902996063232, + "step": 2840 + }, + { + "epoch": 0.75, + "grad_norm": 7.156426973111958, + "learning_rate": 9.212017239232426e-08, + "logits/chosen": -2.7401416301727295, + "logits/rejected": -2.764477014541626, + "logps/chosen": -319.3464660644531, + "logps/rejected": -320.43927001953125, + "loss": 0.6002, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3854105770587921, + "rewards/margins": 0.2975367605686188, + "rewards/margins_max": 0.9738914370536804, + "rewards/margins_min": -0.49170565605163574, + "rewards/margins_std": 0.6692911386489868, + "rewards/rejected": -0.6829473972320557, + "step": 2850 + }, + { + "epoch": 0.75, + "grad_norm": 6.833366983222295, + "learning_rate": 9.035550741795328e-08, + "logits/chosen": -2.735140800476074, + "logits/rejected": -2.7011420726776123, + "logps/chosen": -290.060302734375, + "logps/rejected": -292.348388671875, + "loss": 0.5648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.26191139221191406, + "rewards/margins": 0.23322126269340515, + "rewards/margins_max": 0.7896550297737122, + "rewards/margins_min": -0.3425723910331726, + "rewards/margins_std": 0.5026351809501648, + "rewards/rejected": -0.4951326847076416, + "step": 2860 + }, + { + "epoch": 0.75, + "grad_norm": 6.672116277472777, + "learning_rate": 8.860417271277065e-08, + "logits/chosen": -2.5774030685424805, + "logits/rejected": -2.520181179046631, + "logps/chosen": -314.0809631347656, + "logps/rejected": -326.8440856933594, + "loss": 0.5494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.24950452148914337, + "rewards/margins": 0.4084310531616211, + "rewards/margins_max": 1.0167620182037354, + "rewards/margins_min": -0.0996861681342125, + "rewards/margins_std": 0.5135147571563721, + "rewards/rejected": -0.6579355001449585, + "step": 2870 + }, + { + "epoch": 0.75, + "grad_norm": 8.0954107399704, + "learning_rate": 8.686631451272029e-08, + "logits/chosen": -2.7452874183654785, + "logits/rejected": -2.682813882827759, + "logps/chosen": -328.0068359375, + "logps/rejected": -304.7123107910156, + "loss": 0.5892, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2597874701023102, + "rewards/margins": 0.4096033573150635, + "rewards/margins_max": 1.0029513835906982, + "rewards/margins_min": -0.16356924176216125, + "rewards/margins_std": 0.5237914323806763, + "rewards/rejected": -0.669390857219696, + "step": 2880 + }, + { + "epoch": 0.76, + "grad_norm": 8.425698850721338, + "learning_rate": 8.514207792846168e-08, + "logits/chosen": -2.6590969562530518, + "logits/rejected": -2.642465353012085, + "logps/chosen": -311.7236022949219, + "logps/rejected": -362.00439453125, + "loss": 0.5829, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.28082284331321716, + "rewards/margins": 0.26530036330223083, + "rewards/margins_max": 0.6965596675872803, + "rewards/margins_min": -0.13605263829231262, + "rewards/margins_std": 0.36559614539146423, + "rewards/rejected": -0.5461231470108032, + "step": 2890 + }, + { + "epoch": 0.76, + "grad_norm": 7.265333600507286, + "learning_rate": 8.343160693325355e-08, + "logits/chosen": -2.770352602005005, + "logits/rejected": -2.7508273124694824, + "logps/chosen": -335.2548522949219, + "logps/rejected": -316.0888977050781, + "loss": 0.6119, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3089531660079956, + "rewards/margins": 0.31339365243911743, + "rewards/margins_max": 0.9748009443283081, + "rewards/margins_min": -0.3580326437950134, + "rewards/margins_std": 0.587861180305481, + "rewards/rejected": -0.6223467588424683, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_logits/chosen": -2.717559337615967, + "eval_logits/rejected": -2.685288667678833, + "eval_logps/chosen": -310.6289367675781, + "eval_logps/rejected": -322.4700927734375, + "eval_loss": 0.5943607687950134, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.26134011149406433, + "eval_rewards/margins": 0.3418668508529663, + "eval_rewards/margins_max": 1.2206302881240845, + "eval_rewards/margins_min": -0.559513509273529, + "eval_rewards/margins_std": 0.6005640029907227, + "eval_rewards/rejected": -0.603206992149353, + "eval_runtime": 390.6155, + "eval_samples_per_second": 5.12, + "eval_steps_per_second": 0.161, + "step": 2900 + }, + { + "epoch": 0.76, + "grad_norm": 7.5722769598670645, + "learning_rate": 8.173504435093173e-08, + "logits/chosen": -2.6910836696624756, + "logits/rejected": -2.620335817337036, + "logps/chosen": -307.61895751953125, + "logps/rejected": -285.076904296875, + "loss": 0.5744, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3138976991176605, + "rewards/margins": 0.29238349199295044, + "rewards/margins_max": 0.8948743939399719, + "rewards/margins_min": -0.16550301015377045, + "rewards/margins_std": 0.48930853605270386, + "rewards/rejected": -0.6062812805175781, + "step": 2910 + }, + { + "epoch": 0.76, + "grad_norm": 11.590729106518845, + "learning_rate": 8.005253184398359e-08, + "logits/chosen": -2.64176082611084, + "logits/rejected": -2.6071128845214844, + "logps/chosen": -303.20892333984375, + "logps/rejected": -265.7433166503906, + "loss": 0.5808, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.21988987922668457, + "rewards/margins": 0.3122277557849884, + "rewards/margins_max": 0.9941678047180176, + "rewards/margins_min": -0.2490687072277069, + "rewards/margins_std": 0.57354336977005, + "rewards/rejected": -0.5321176648139954, + "step": 2920 + }, + { + "epoch": 0.77, + "grad_norm": 10.596926952652064, + "learning_rate": 7.838420990171926e-08, + "logits/chosen": -2.644495964050293, + "logits/rejected": -2.6213231086730957, + "logps/chosen": -324.3955993652344, + "logps/rejected": -304.0296936035156, + "loss": 0.5889, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.14799830317497253, + "rewards/margins": 0.4530050754547119, + "rewards/margins_max": 1.0575478076934814, + "rewards/margins_min": -0.12277814000844955, + "rewards/margins_std": 0.5148282051086426, + "rewards/rejected": -0.6010034084320068, + "step": 2930 + }, + { + "epoch": 0.77, + "grad_norm": 12.227670052671204, + "learning_rate": 7.673021782854083e-08, + "logits/chosen": -2.71203351020813, + "logits/rejected": -2.7618517875671387, + "logps/chosen": -320.0538635253906, + "logps/rejected": -331.66156005859375, + "loss": 0.622, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.19437643885612488, + "rewards/margins": 0.4152396619319916, + "rewards/margins_max": 1.0705549716949463, + "rewards/margins_min": -0.15296433866024017, + "rewards/margins_std": 0.5555770993232727, + "rewards/rejected": -0.6096161603927612, + "step": 2940 + }, + { + "epoch": 0.77, + "grad_norm": 7.39065493684476, + "learning_rate": 7.509069373231039e-08, + "logits/chosen": -2.8370635509490967, + "logits/rejected": -2.7984962463378906, + "logps/chosen": -310.8374328613281, + "logps/rejected": -367.3779296875, + "loss": 0.5661, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2661990225315094, + "rewards/margins": 0.5342287421226501, + "rewards/margins_max": 1.1428627967834473, + "rewards/margins_min": -0.18307234346866608, + "rewards/margins_std": 0.5940524339675903, + "rewards/rejected": -0.8004277944564819, + "step": 2950 + }, + { + "epoch": 0.77, + "grad_norm": 6.265495013555799, + "learning_rate": 7.346577451281821e-08, + "logits/chosen": -2.6487300395965576, + "logits/rejected": -2.648134708404541, + "logps/chosen": -294.40447998046875, + "logps/rejected": -297.02008056640625, + "loss": 0.5847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.36863070726394653, + "rewards/margins": 0.1254711151123047, + "rewards/margins_max": 0.7439612150192261, + "rewards/margins_min": -0.46583014726638794, + "rewards/margins_std": 0.5817449688911438, + "rewards/rejected": -0.4941018521785736, + "step": 2960 + }, + { + "epoch": 0.78, + "grad_norm": 9.805501008955183, + "learning_rate": 7.185559585035136e-08, + "logits/chosen": -2.7487006187438965, + "logits/rejected": -2.730909824371338, + "logps/chosen": -266.269775390625, + "logps/rejected": -262.413818359375, + "loss": 0.6258, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3177337646484375, + "rewards/margins": 0.2543289065361023, + "rewards/margins_max": 0.9349082112312317, + "rewards/margins_min": -0.2697645127773285, + "rewards/margins_std": 0.5387292504310608, + "rewards/rejected": -0.5720627307891846, + "step": 2970 + }, + { + "epoch": 0.78, + "grad_norm": 7.888836044110651, + "learning_rate": 7.026029219436502e-08, + "logits/chosen": -2.600877285003662, + "logits/rejected": -2.5777783393859863, + "logps/chosen": -331.73944091796875, + "logps/rejected": -369.98162841796875, + "loss": 0.5656, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24253609776496887, + "rewards/margins": 0.38004228472709656, + "rewards/margins_max": 1.2120200395584106, + "rewards/margins_min": -0.4623018205165863, + "rewards/margins_std": 0.7432944774627686, + "rewards/rejected": -0.6225783228874207, + "step": 2980 + }, + { + "epoch": 0.78, + "grad_norm": 7.858044490852342, + "learning_rate": 6.867999675225522e-08, + "logits/chosen": -2.6897332668304443, + "logits/rejected": -2.621100425720215, + "logps/chosen": -335.5071105957031, + "logps/rejected": -370.0787048339844, + "loss": 0.581, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2772907316684723, + "rewards/margins": 0.3420386016368866, + "rewards/margins_max": 0.8591931462287903, + "rewards/margins_min": -0.2229001522064209, + "rewards/margins_std": 0.4767986834049225, + "rewards/rejected": -0.6193293333053589, + "step": 2990 + }, + { + "epoch": 0.79, + "grad_norm": 3.908410657322484, + "learning_rate": 6.711484147823662e-08, + "logits/chosen": -2.696946382522583, + "logits/rejected": -2.663543224334717, + "logps/chosen": -255.0500030517578, + "logps/rejected": -321.6811828613281, + "loss": 0.5644, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.14326031506061554, + "rewards/margins": 0.41077256202697754, + "rewards/margins_max": 1.0658059120178223, + "rewards/margins_min": -0.060289181768894196, + "rewards/margins_std": 0.5096723437309265, + "rewards/rejected": -0.5540328621864319, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_logits/chosen": -2.714999198913574, + "eval_logits/rejected": -2.6826276779174805, + "eval_logps/chosen": -306.671630859375, + "eval_logps/rejected": -318.6262512207031, + "eval_loss": 0.5937923192977905, + "eval_rewards/accuracies": 0.72817462682724, + "eval_rewards/chosen": -0.22176718711853027, + "eval_rewards/margins": 0.34300175309181213, + "eval_rewards/margins_max": 1.1988600492477417, + "eval_rewards/margins_min": -0.5312032699584961, + "eval_rewards/margins_std": 0.5871840715408325, + "eval_rewards/rejected": -0.5647689700126648, + "eval_runtime": 403.5625, + "eval_samples_per_second": 4.956, + "eval_steps_per_second": 0.156, + "step": 3000 + }, + { + "epoch": 0.79, + "grad_norm": 5.789905234501832, + "learning_rate": 6.556495706232412e-08, + "logits/chosen": -2.721482753753662, + "logits/rejected": -2.696732997894287, + "logps/chosen": -334.65374755859375, + "logps/rejected": -329.54901123046875, + "loss": 0.5884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2582997977733612, + "rewards/margins": 0.4593956470489502, + "rewards/margins_max": 1.0422502756118774, + "rewards/margins_min": -0.19781765341758728, + "rewards/margins_std": 0.5769368410110474, + "rewards/rejected": -0.7176954746246338, + "step": 3010 + }, + { + "epoch": 0.79, + "grad_norm": 6.616579282297295, + "learning_rate": 6.403047291942057e-08, + "logits/chosen": -2.650972843170166, + "logits/rejected": -2.6220126152038574, + "logps/chosen": -257.07452392578125, + "logps/rejected": -244.9949188232422, + "loss": 0.5908, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.16235917806625366, + "rewards/margins": 0.35695740580558777, + "rewards/margins_max": 0.8692172765731812, + "rewards/margins_min": -0.19138233363628387, + "rewards/margins_std": 0.4762743413448334, + "rewards/rejected": -0.519316554069519, + "step": 3020 + }, + { + "epoch": 0.79, + "grad_norm": 8.905408745904964, + "learning_rate": 6.251151717851021e-08, + "logits/chosen": -2.688934564590454, + "logits/rejected": -2.7008392810821533, + "logps/chosen": -246.4873504638672, + "logps/rejected": -352.30047607421875, + "loss": 0.6095, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.20623517036437988, + "rewards/margins": 0.38540560007095337, + "rewards/margins_max": 1.0291602611541748, + "rewards/margins_min": -0.24164995551109314, + "rewards/margins_std": 0.5588585138320923, + "rewards/rejected": -0.5916407108306885, + "step": 3030 + }, + { + "epoch": 0.8, + "grad_norm": 6.895131823612239, + "learning_rate": 6.100821667196041e-08, + "logits/chosen": -2.7304978370666504, + "logits/rejected": -2.666607618331909, + "logps/chosen": -310.72625732421875, + "logps/rejected": -379.23480224609375, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08143069595098495, + "rewards/margins": 0.40141692757606506, + "rewards/margins_max": 0.9610453844070435, + "rewards/margins_min": -0.13242550194263458, + "rewards/margins_std": 0.4750441908836365, + "rewards/rejected": -0.4828476309776306, + "step": 3040 + }, + { + "epoch": 0.8, + "grad_norm": 7.60192330769485, + "learning_rate": 5.952069692493061e-08, + "logits/chosen": -2.7332518100738525, + "logits/rejected": -2.706421375274658, + "logps/chosen": -320.4873352050781, + "logps/rejected": -341.2845764160156, + "loss": 0.5654, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.23261427879333496, + "rewards/margins": 0.4263758063316345, + "rewards/margins_max": 0.9979115724563599, + "rewards/margins_min": -0.15502162277698517, + "rewards/margins_std": 0.5413002967834473, + "rewards/rejected": -0.6589901447296143, + "step": 3050 + }, + { + "epoch": 0.8, + "grad_norm": 6.732063197582168, + "learning_rate": 5.8049082144891794e-08, + "logits/chosen": -2.69242787361145, + "logits/rejected": -2.6815574169158936, + "logps/chosen": -345.8653564453125, + "logps/rejected": -348.386962890625, + "loss": 0.5784, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.39133328199386597, + "rewards/margins": 0.2583714723587036, + "rewards/margins_max": 1.0790903568267822, + "rewards/margins_min": -0.48557180166244507, + "rewards/margins_std": 0.6925069689750671, + "rewards/rejected": -0.6497048139572144, + "step": 3060 + }, + { + "epoch": 0.8, + "grad_norm": 7.1858770777282786, + "learning_rate": 5.659349521125459e-08, + "logits/chosen": -2.536653995513916, + "logits/rejected": -2.526789426803589, + "logps/chosen": -278.2074890136719, + "logps/rejected": -279.0950622558594, + "loss": 0.6178, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2687070369720459, + "rewards/margins": 0.29638925194740295, + "rewards/margins_max": 0.8798637390136719, + "rewards/margins_min": -0.2759655714035034, + "rewards/margins_std": 0.5355228781700134, + "rewards/rejected": -0.5650962591171265, + "step": 3070 + }, + { + "epoch": 0.81, + "grad_norm": 5.697834058461485, + "learning_rate": 5.5154057665109e-08, + "logits/chosen": -2.70261549949646, + "logits/rejected": -2.6901609897613525, + "logps/chosen": -273.2662658691406, + "logps/rejected": -326.7794189453125, + "loss": 0.593, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.3306647837162018, + "rewards/margins": 0.2638179659843445, + "rewards/margins_max": 1.0077003240585327, + "rewards/margins_min": -0.42087656259536743, + "rewards/margins_std": 0.6408329010009766, + "rewards/rejected": -0.5944827198982239, + "step": 3080 + }, + { + "epoch": 0.81, + "grad_norm": 8.222627022661833, + "learning_rate": 5.3730889699075853e-08, + "logits/chosen": -2.7889244556427, + "logits/rejected": -2.684028148651123, + "logps/chosen": -355.8638000488281, + "logps/rejected": -320.7940979003906, + "loss": 0.6113, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46841734647750854, + "rewards/margins": 0.1820753812789917, + "rewards/margins_max": 0.8383633494377136, + "rewards/margins_min": -0.41440504789352417, + "rewards/margins_std": 0.5822398066520691, + "rewards/rejected": -0.650492787361145, + "step": 3090 + }, + { + "epoch": 0.81, + "grad_norm": 8.847970124123545, + "learning_rate": 5.2324110147270893e-08, + "logits/chosen": -2.7285666465759277, + "logits/rejected": -2.715951681137085, + "logps/chosen": -321.43890380859375, + "logps/rejected": -347.58465576171875, + "loss": 0.5946, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26107698678970337, + "rewards/margins": 0.3440849184989929, + "rewards/margins_max": 0.9513567686080933, + "rewards/margins_min": -0.3097357749938965, + "rewards/margins_std": 0.5655455589294434, + "rewards/rejected": -0.6051618456840515, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_logits/chosen": -2.709005355834961, + "eval_logits/rejected": -2.6761624813079834, + "eval_logps/chosen": -312.1255798339844, + "eval_logps/rejected": -324.53759765625, + "eval_loss": 0.5932390093803406, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.27630698680877686, + "eval_rewards/margins": 0.34757524728775024, + "eval_rewards/margins_max": 1.2358723878860474, + "eval_rewards/margins_min": -0.5639290809631348, + "eval_rewards/margins_std": 0.6093804836273193, + "eval_rewards/rejected": -0.6238822340965271, + "eval_runtime": 404.2912, + "eval_samples_per_second": 4.947, + "eval_steps_per_second": 0.156, + "step": 3100 + }, + { + "epoch": 0.81, + "grad_norm": 7.343619877088542, + "learning_rate": 5.0933836475381795e-08, + "logits/chosen": -2.7327513694763184, + "logits/rejected": -2.670869827270508, + "logps/chosen": -370.6684875488281, + "logps/rejected": -350.8794860839844, + "loss": 0.6048, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28505900502204895, + "rewards/margins": 0.26767823100090027, + "rewards/margins_max": 0.856292724609375, + "rewards/margins_min": -0.31981807947158813, + "rewards/margins_std": 0.5388545393943787, + "rewards/rejected": -0.5527373552322388, + "step": 3110 + }, + { + "epoch": 0.82, + "grad_norm": 7.4463143570267105, + "learning_rate": 4.956018477086005e-08, + "logits/chosen": -2.722538709640503, + "logits/rejected": -2.6825101375579834, + "logps/chosen": -297.9403381347656, + "logps/rejected": -321.73187255859375, + "loss": 0.591, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.22652511298656464, + "rewards/margins": 0.4005630910396576, + "rewards/margins_max": 0.9710624814033508, + "rewards/margins_min": -0.24973396956920624, + "rewards/margins_std": 0.5539124608039856, + "rewards/rejected": -0.6270883083343506, + "step": 3120 + }, + { + "epoch": 0.82, + "grad_norm": 6.754198817043829, + "learning_rate": 4.820326973322763e-08, + "logits/chosen": -2.691364288330078, + "logits/rejected": -2.680224895477295, + "logps/chosen": -284.72198486328125, + "logps/rejected": -293.7550354003906, + "loss": 0.5818, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2586904466152191, + "rewards/margins": 0.3613867163658142, + "rewards/margins_max": 1.0273208618164062, + "rewards/margins_min": -0.12638597190380096, + "rewards/margins_std": 0.5184222459793091, + "rewards/rejected": -0.6200771927833557, + "step": 3130 + }, + { + "epoch": 0.82, + "grad_norm": 11.947824856384033, + "learning_rate": 4.686320466449981e-08, + "logits/chosen": -2.596799373626709, + "logits/rejected": -2.6422441005706787, + "logps/chosen": -249.69284057617188, + "logps/rejected": -282.60528564453125, + "loss": 0.5861, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.25577065348625183, + "rewards/margins": 0.4310334324836731, + "rewards/margins_max": 1.0814671516418457, + "rewards/margins_min": -0.08930900692939758, + "rewards/margins_std": 0.5281911492347717, + "rewards/rejected": -0.6868040561676025, + "step": 3140 + }, + { + "epoch": 0.82, + "grad_norm": 5.958786882686882, + "learning_rate": 4.554010145972417e-08, + "logits/chosen": -2.6933982372283936, + "logits/rejected": -2.7229723930358887, + "logps/chosen": -329.3271789550781, + "logps/rejected": -346.91168212890625, + "loss": 0.6023, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.39186182618141174, + "rewards/margins": 0.1715582311153412, + "rewards/margins_max": 0.9218946695327759, + "rewards/margins_min": -0.6532711982727051, + "rewards/margins_std": 0.7135480642318726, + "rewards/rejected": -0.5634200572967529, + "step": 3150 + }, + { + "epoch": 0.83, + "grad_norm": 5.985598446557181, + "learning_rate": 4.423407059763745e-08, + "logits/chosen": -2.8195552825927734, + "logits/rejected": -2.7619147300720215, + "logps/chosen": -354.84820556640625, + "logps/rejected": -282.5863952636719, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24859032034873962, + "rewards/margins": 0.36673232913017273, + "rewards/margins_max": 1.0588020086288452, + "rewards/margins_min": -0.21221475303173065, + "rewards/margins_std": 0.5806155204772949, + "rewards/rejected": -0.6153227090835571, + "step": 3160 + }, + { + "epoch": 0.83, + "grad_norm": 4.333016506252547, + "learning_rate": 4.294522113144078e-08, + "logits/chosen": -2.716684579849243, + "logits/rejected": -2.7140746116638184, + "logps/chosen": -335.5931701660156, + "logps/rejected": -347.0995788574219, + "loss": 0.5993, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.20392827689647675, + "rewards/margins": 0.3492998480796814, + "rewards/margins_max": 0.9610215425491333, + "rewards/margins_min": -0.21924524009227753, + "rewards/margins_std": 0.5495181083679199, + "rewards/rejected": -0.5532280802726746, + "step": 3170 + }, + { + "epoch": 0.83, + "grad_norm": 9.314067938997741, + "learning_rate": 4.1673660679693804e-08, + "logits/chosen": -2.591862440109253, + "logits/rejected": -2.589052200317383, + "logps/chosen": -310.06903076171875, + "logps/rejected": -299.1510314941406, + "loss": 0.6028, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.23691824078559875, + "rewards/margins": 0.33606940507888794, + "rewards/margins_max": 0.8604933023452759, + "rewards/margins_min": -0.1909988671541214, + "rewards/margins_std": 0.46743878722190857, + "rewards/rejected": -0.5729876756668091, + "step": 3180 + }, + { + "epoch": 0.83, + "grad_norm": 8.007400427659823, + "learning_rate": 4.041949541732825e-08, + "logits/chosen": -2.663602113723755, + "logits/rejected": -2.54884672164917, + "logps/chosen": -273.75390625, + "logps/rejected": -295.95123291015625, + "loss": 0.5808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.26674574613571167, + "rewards/margins": 0.37703603506088257, + "rewards/margins_max": 1.079987645149231, + "rewards/margins_min": -0.1934051811695099, + "rewards/margins_std": 0.5782135128974915, + "rewards/rejected": -0.6437817811965942, + "step": 3190 + }, + { + "epoch": 0.84, + "grad_norm": 5.634056091592707, + "learning_rate": 3.9182830066782605e-08, + "logits/chosen": -2.6069934368133545, + "logits/rejected": -2.619546413421631, + "logps/chosen": -311.1683349609375, + "logps/rejected": -305.6144104003906, + "loss": 0.5961, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.349717915058136, + "rewards/margins": 0.26992395520210266, + "rewards/margins_max": 0.846529483795166, + "rewards/margins_min": -0.2108994424343109, + "rewards/margins_std": 0.5038012266159058, + "rewards/rejected": -0.619641900062561, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_logits/chosen": -2.7139713764190674, + "eval_logits/rejected": -2.6815237998962402, + "eval_logps/chosen": -311.6203308105469, + "eval_logps/rejected": -324.1453857421875, + "eval_loss": 0.5929512977600098, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": -0.2712542712688446, + "eval_rewards/margins": 0.34870612621307373, + "eval_rewards/margins_max": 1.2365152835845947, + "eval_rewards/margins_min": -0.5594983696937561, + "eval_rewards/margins_std": 0.6090325713157654, + "eval_rewards/rejected": -0.6199604272842407, + "eval_runtime": 405.2947, + "eval_samples_per_second": 4.935, + "eval_steps_per_second": 0.155, + "step": 3200 + }, + { + "epoch": 0.84, + "grad_norm": 6.232447888048972, + "learning_rate": 3.79637678892577e-08, + "logits/chosen": -2.736973524093628, + "logits/rejected": -2.7184245586395264, + "logps/chosen": -289.7675476074219, + "logps/rejected": -330.50396728515625, + "loss": 0.6115, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.33654657006263733, + "rewards/margins": 0.3094564378261566, + "rewards/margins_max": 0.9844071269035339, + "rewards/margins_min": -0.2724164128303528, + "rewards/margins_std": 0.5658156871795654, + "rewards/rejected": -0.6460030674934387, + "step": 3210 + }, + { + "epoch": 0.84, + "grad_norm": 5.507104619847467, + "learning_rate": 3.6762410676094645e-08, + "logits/chosen": -2.6315269470214844, + "logits/rejected": -2.584350347518921, + "logps/chosen": -290.31268310546875, + "logps/rejected": -312.1468811035156, + "loss": 0.5962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2527444660663605, + "rewards/margins": 0.19917862117290497, + "rewards/margins_max": 0.8185330629348755, + "rewards/margins_min": -0.37445664405822754, + "rewards/margins_std": 0.5346145033836365, + "rewards/rejected": -0.45192307233810425, + "step": 3220 + }, + { + "epoch": 0.85, + "grad_norm": 5.812794667640044, + "learning_rate": 3.557885874027497e-08, + "logits/chosen": -2.693312168121338, + "logits/rejected": -2.6754708290100098, + "logps/chosen": -351.64837646484375, + "logps/rejected": -385.64642333984375, + "loss": 0.618, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3593120872974396, + "rewards/margins": 0.24738220870494843, + "rewards/margins_max": 1.0550869703292847, + "rewards/margins_min": -0.614630401134491, + "rewards/margins_std": 0.7396687269210815, + "rewards/rejected": -0.6066943407058716, + "step": 3230 + }, + { + "epoch": 0.85, + "grad_norm": 9.015977875730622, + "learning_rate": 3.441321090804469e-08, + "logits/chosen": -2.8247809410095215, + "logits/rejected": -2.763528347015381, + "logps/chosen": -296.9433288574219, + "logps/rejected": -358.7182312011719, + "loss": 0.609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33461323380470276, + "rewards/margins": 0.27054521441459656, + "rewards/margins_max": 0.8843480348587036, + "rewards/margins_min": -0.2987326681613922, + "rewards/margins_std": 0.5240088105201721, + "rewards/rejected": -0.6051583886146545, + "step": 3240 + }, + { + "epoch": 0.85, + "grad_norm": 8.085799742184017, + "learning_rate": 3.326556451066234e-08, + "logits/chosen": -2.76572847366333, + "logits/rejected": -2.735654830932617, + "logps/chosen": -333.885009765625, + "logps/rejected": -355.4205017089844, + "loss": 0.603, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.33911195397377014, + "rewards/margins": 0.3562234044075012, + "rewards/margins_max": 0.8414816856384277, + "rewards/margins_min": -0.15393468737602234, + "rewards/margins_std": 0.4514690339565277, + "rewards/rejected": -0.6953352689743042, + "step": 3250 + }, + { + "epoch": 0.85, + "grad_norm": 6.531538815477996, + "learning_rate": 3.2136015376271946e-08, + "logits/chosen": -2.7383930683135986, + "logits/rejected": -2.7428669929504395, + "logps/chosen": -277.9571838378906, + "logps/rejected": -343.3814392089844, + "loss": 0.5613, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.25333690643310547, + "rewards/margins": 0.43777981400489807, + "rewards/margins_max": 1.1137754917144775, + "rewards/margins_min": -0.17860360443592072, + "rewards/margins_std": 0.5537047386169434, + "rewards/rejected": -0.6911166906356812, + "step": 3260 + }, + { + "epoch": 0.86, + "grad_norm": 6.2670863247762325, + "learning_rate": 3.102465782190106e-08, + "logits/chosen": -2.706207752227783, + "logits/rejected": -2.68100643157959, + "logps/chosen": -285.34320068359375, + "logps/rejected": -318.13238525390625, + "loss": 0.5837, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.33466050028800964, + "rewards/margins": 0.27287086844444275, + "rewards/margins_max": 0.9959131479263306, + "rewards/margins_min": -0.2743028998374939, + "rewards/margins_std": 0.5605227947235107, + "rewards/rejected": -0.6075314283370972, + "step": 3270 + }, + { + "epoch": 0.86, + "grad_norm": 8.272890182705053, + "learning_rate": 2.993158464558565e-08, + "logits/chosen": -2.7643463611602783, + "logits/rejected": -2.683652400970459, + "logps/chosen": -243.48403930664062, + "logps/rejected": -251.48684692382812, + "loss": 0.6264, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.30335482954978943, + "rewards/margins": 0.36434128880500793, + "rewards/margins_max": 1.0788029432296753, + "rewards/margins_min": -0.34486401081085205, + "rewards/margins_std": 0.6268325448036194, + "rewards/rejected": -0.6676961183547974, + "step": 3280 + }, + { + "epoch": 0.86, + "grad_norm": 6.393054304560884, + "learning_rate": 2.8856887118621358e-08, + "logits/chosen": -2.6674551963806152, + "logits/rejected": -2.6625630855560303, + "logps/chosen": -317.61895751953125, + "logps/rejected": -367.6047668457031, + "loss": 0.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19959239661693573, + "rewards/margins": 0.5897967219352722, + "rewards/margins_max": 1.2776075601577759, + "rewards/margins_min": -0.057800717651844025, + "rewards/margins_std": 0.6155193448066711, + "rewards/rejected": -0.7893891334533691, + "step": 3290 + }, + { + "epoch": 0.86, + "grad_norm": 6.761887996148778, + "learning_rate": 2.7800654977942482e-08, + "logits/chosen": -2.7923572063446045, + "logits/rejected": -2.75392484664917, + "logps/chosen": -360.2318115234375, + "logps/rejected": -394.7492370605469, + "loss": 0.5841, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.15893498063087463, + "rewards/margins": 0.38969096541404724, + "rewards/margins_max": 0.8884608149528503, + "rewards/margins_min": -0.16191323101520538, + "rewards/margins_std": 0.49352961778640747, + "rewards/rejected": -0.5486259460449219, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_logits/chosen": -2.7156574726104736, + "eval_logits/rejected": -2.6833531856536865, + "eval_logps/chosen": -311.35205078125, + "eval_logps/rejected": -323.91748046875, + "eval_loss": 0.5927214622497559, + "eval_rewards/accuracies": 0.7301587462425232, + "eval_rewards/chosen": -0.26857179403305054, + "eval_rewards/margins": 0.3491092920303345, + "eval_rewards/margins_max": 1.2361600399017334, + "eval_rewards/margins_min": -0.5601840615272522, + "eval_rewards/margins_std": 0.6093178391456604, + "eval_rewards/rejected": -0.6176810264587402, + "eval_runtime": 393.2175, + "eval_samples_per_second": 5.086, + "eval_steps_per_second": 0.16, + "step": 3300 + }, + { + "epoch": 0.87, + "grad_norm": 6.72786899332963, + "learning_rate": 2.676297641862879e-08, + "logits/chosen": -2.6579041481018066, + "logits/rejected": -2.600795269012451, + "logps/chosen": -247.5041046142578, + "logps/rejected": -292.17694091796875, + "loss": 0.5693, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2854161560535431, + "rewards/margins": 0.4283584654331207, + "rewards/margins_max": 1.2070612907409668, + "rewards/margins_min": -0.2502340078353882, + "rewards/margins_std": 0.6416813135147095, + "rewards/rejected": -0.713774561882019, + "step": 3310 + }, + { + "epoch": 0.87, + "grad_norm": 7.817285155640022, + "learning_rate": 2.5743938086541352e-08, + "logits/chosen": -2.473435878753662, + "logits/rejected": -2.5120933055877686, + "logps/chosen": -332.21026611328125, + "logps/rejected": -295.612548828125, + "loss": 0.585, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.273102343082428, + "rewards/margins": 0.2702943980693817, + "rewards/margins_max": 1.0244884490966797, + "rewards/margins_min": -0.3169720768928528, + "rewards/margins_std": 0.5895292162895203, + "rewards/rejected": -0.5433966517448425, + "step": 3320 + }, + { + "epoch": 0.87, + "grad_norm": 7.649642397432298, + "learning_rate": 2.474362507108757e-08, + "logits/chosen": -2.7463507652282715, + "logits/rejected": -2.7699356079101562, + "logps/chosen": -309.05999755859375, + "logps/rejected": -355.2874755859375, + "loss": 0.5965, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.27403926849365234, + "rewards/margins": 0.20573583245277405, + "rewards/margins_max": 0.803459644317627, + "rewards/margins_min": -0.3785861134529114, + "rewards/margins_std": 0.5141891241073608, + "rewards/rejected": -0.4797751307487488, + "step": 3330 + }, + { + "epoch": 0.87, + "grad_norm": 4.099745923985455, + "learning_rate": 2.3762120898116495e-08, + "logits/chosen": -2.7196621894836426, + "logits/rejected": -2.7530341148376465, + "logps/chosen": -302.53997802734375, + "logps/rejected": -378.96478271484375, + "loss": 0.5808, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3169907331466675, + "rewards/margins": 0.25496000051498413, + "rewards/margins_max": 0.7872250080108643, + "rewards/margins_min": -0.3017077147960663, + "rewards/margins_std": 0.4912707805633545, + "rewards/rejected": -0.5719506144523621, + "step": 3340 + }, + { + "epoch": 0.88, + "grad_norm": 9.305830442502666, + "learning_rate": 2.2799507522944044e-08, + "logits/chosen": -2.7755537033081055, + "logits/rejected": -2.6925406455993652, + "logps/chosen": -322.5201110839844, + "logps/rejected": -323.92767333984375, + "loss": 0.5883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2820035517215729, + "rewards/margins": 0.3853180408477783, + "rewards/margins_max": 1.0537234544754028, + "rewards/margins_min": -0.23948998749256134, + "rewards/margins_std": 0.5725919008255005, + "rewards/rejected": -0.6673215627670288, + "step": 3350 + }, + { + "epoch": 0.88, + "grad_norm": 9.897131733625372, + "learning_rate": 2.1855865323510054e-08, + "logits/chosen": -2.793555498123169, + "logits/rejected": -2.7378363609313965, + "logps/chosen": -320.41827392578125, + "logps/rejected": -396.91790771484375, + "loss": 0.5986, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3882889151573181, + "rewards/margins": 0.2911258637905121, + "rewards/margins_max": 0.9842821359634399, + "rewards/margins_min": -0.2959573268890381, + "rewards/margins_std": 0.5749455690383911, + "rewards/rejected": -0.6794147491455078, + "step": 3360 + }, + { + "epoch": 0.88, + "grad_norm": 6.043913044352584, + "learning_rate": 2.0931273093666573e-08, + "logits/chosen": -2.731083631515503, + "logits/rejected": -2.7401182651519775, + "logps/chosen": -294.62384033203125, + "logps/rejected": -333.05389404296875, + "loss": 0.5972, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.23699335753917694, + "rewards/margins": 0.31576278805732727, + "rewards/margins_max": 1.1510951519012451, + "rewards/margins_min": -0.2709348201751709, + "rewards/margins_std": 0.6509772539138794, + "rewards/rejected": -0.5527561902999878, + "step": 3370 + }, + { + "epoch": 0.88, + "grad_norm": 7.505224315688945, + "learning_rate": 2.002580803659873e-08, + "logits/chosen": -2.7698025703430176, + "logits/rejected": -2.667750358581543, + "logps/chosen": -326.37548828125, + "logps/rejected": -302.03155517578125, + "loss": 0.6284, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.43116623163223267, + "rewards/margins": 0.19796697795391083, + "rewards/margins_max": 0.9477956891059875, + "rewards/margins_min": -0.45327895879745483, + "rewards/margins_std": 0.6303554773330688, + "rewards/rejected": -0.6291331648826599, + "step": 3380 + }, + { + "epoch": 0.89, + "grad_norm": 8.583362863236774, + "learning_rate": 1.9139545758378256e-08, + "logits/chosen": -2.703469753265381, + "logits/rejected": -2.672950267791748, + "logps/chosen": -284.4007873535156, + "logps/rejected": -342.1329040527344, + "loss": 0.5817, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.2795010507106781, + "rewards/margins": 0.4712679386138916, + "rewards/margins_max": 1.2589962482452393, + "rewards/margins_min": -0.2588602900505066, + "rewards/margins_std": 0.6906100511550903, + "rewards/rejected": -0.7507689595222473, + "step": 3390 + }, + { + "epoch": 0.89, + "grad_norm": 6.574479857782714, + "learning_rate": 1.8272560261650277e-08, + "logits/chosen": -2.703399181365967, + "logits/rejected": -2.678226947784424, + "logps/chosen": -305.5192565917969, + "logps/rejected": -346.4285583496094, + "loss": 0.611, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3076348900794983, + "rewards/margins": 0.3126983046531677, + "rewards/margins_max": 0.9637606739997864, + "rewards/margins_min": -0.2729613184928894, + "rewards/margins_std": 0.5488346815109253, + "rewards/rejected": -0.620333194732666, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_logits/chosen": -2.714493989944458, + "eval_logits/rejected": -2.6820969581604004, + "eval_logps/chosen": -309.3476867675781, + "eval_logps/rejected": -321.9356384277344, + "eval_loss": 0.5924811363220215, + "eval_rewards/accuracies": 0.7361111044883728, + "eval_rewards/chosen": -0.2485279142856598, + "eval_rewards/margins": 0.3493346869945526, + "eval_rewards/margins_max": 1.228116750717163, + "eval_rewards/margins_min": -0.5495842099189758, + "eval_rewards/margins_std": 0.6022553443908691, + "eval_rewards/rejected": -0.5978626012802124, + "eval_runtime": 403.4356, + "eval_samples_per_second": 4.957, + "eval_steps_per_second": 0.156, + "step": 3400 + }, + { + "epoch": 0.89, + "grad_norm": 12.159200668163672, + "learning_rate": 1.742492393945427e-08, + "logits/chosen": -2.509911298751831, + "logits/rejected": -2.533151149749756, + "logps/chosen": -277.87677001953125, + "logps/rejected": -330.4190673828125, + "loss": 0.6158, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3746136426925659, + "rewards/margins": 0.2461293637752533, + "rewards/margins_max": 0.811921238899231, + "rewards/margins_min": -0.3718070089817047, + "rewards/margins_std": 0.5313544869422913, + "rewards/rejected": -0.6207430362701416, + "step": 3410 + }, + { + "epoch": 0.9, + "grad_norm": 10.531949073839339, + "learning_rate": 1.6596707569179302e-08, + "logits/chosen": -2.7206101417541504, + "logits/rejected": -2.6509861946105957, + "logps/chosen": -388.4510192871094, + "logps/rejected": -366.21478271484375, + "loss": 0.5599, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2500371038913727, + "rewards/margins": 0.5358761548995972, + "rewards/margins_max": 1.0645267963409424, + "rewards/margins_min": -0.004802307579666376, + "rewards/margins_std": 0.48022469878196716, + "rewards/rejected": -0.7859132885932922, + "step": 3420 + }, + { + "epoch": 0.9, + "grad_norm": 5.604320916342105, + "learning_rate": 1.5787980306653848e-08, + "logits/chosen": -2.7387804985046387, + "logits/rejected": -2.685904026031494, + "logps/chosen": -358.57269287109375, + "logps/rejected": -354.5420227050781, + "loss": 0.5722, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.24225082993507385, + "rewards/margins": 0.428288072347641, + "rewards/margins_max": 1.0563759803771973, + "rewards/margins_min": -0.23835131525993347, + "rewards/margins_std": 0.5812400579452515, + "rewards/rejected": -0.6705388426780701, + "step": 3430 + }, + { + "epoch": 0.9, + "grad_norm": 16.357481459779535, + "learning_rate": 1.499880968037165e-08, + "logits/chosen": -2.6220734119415283, + "logits/rejected": -2.6470353603363037, + "logps/chosen": -308.3330078125, + "logps/rejected": -339.4562072753906, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25692135095596313, + "rewards/margins": 0.2672504782676697, + "rewards/margins_max": 0.851662814617157, + "rewards/margins_min": -0.3628670573234558, + "rewards/margins_std": 0.5492266416549683, + "rewards/rejected": -0.5241718888282776, + "step": 3440 + }, + { + "epoch": 0.9, + "grad_norm": 4.917456784312999, + "learning_rate": 1.4229261585852803e-08, + "logits/chosen": -2.7236084938049316, + "logits/rejected": -2.712661027908325, + "logps/chosen": -300.86358642578125, + "logps/rejected": -314.1263427734375, + "loss": 0.5798, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.26423364877700806, + "rewards/margins": 0.32090646028518677, + "rewards/margins_max": 0.7215771675109863, + "rewards/margins_min": -0.15194258093833923, + "rewards/margins_std": 0.3852207064628601, + "rewards/rejected": -0.58514004945755, + "step": 3450 + }, + { + "epoch": 0.91, + "grad_norm": 5.7045916617959636, + "learning_rate": 1.3479400280141883e-08, + "logits/chosen": -2.7118189334869385, + "logits/rejected": -2.6771388053894043, + "logps/chosen": -337.8304443359375, + "logps/rejected": -281.6158752441406, + "loss": 0.5557, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.202952578663826, + "rewards/margins": 0.4622286856174469, + "rewards/margins_max": 1.0748082399368286, + "rewards/margins_min": -0.22423362731933594, + "rewards/margins_std": 0.6075157523155212, + "rewards/rejected": -0.6651812791824341, + "step": 3460 + }, + { + "epoch": 0.91, + "grad_norm": 7.204534628542443, + "learning_rate": 1.2749288376442042e-08, + "logits/chosen": -2.7061727046966553, + "logits/rejected": -2.6499149799346924, + "logps/chosen": -312.50433349609375, + "logps/rejected": -360.14935302734375, + "loss": 0.6018, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3530515134334564, + "rewards/margins": 0.37734928727149963, + "rewards/margins_max": 0.910068690776825, + "rewards/margins_min": -0.1972331702709198, + "rewards/margins_std": 0.5132545232772827, + "rewards/rejected": -0.7304007411003113, + "step": 3470 + }, + { + "epoch": 0.91, + "grad_norm": 3.4187511660279233, + "learning_rate": 1.2038986838887127e-08, + "logits/chosen": -2.721287250518799, + "logits/rejected": -2.6296310424804688, + "logps/chosen": -262.59075927734375, + "logps/rejected": -297.3605651855469, + "loss": 0.603, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3634189963340759, + "rewards/margins": 0.267557829618454, + "rewards/margins_max": 0.9734705686569214, + "rewards/margins_min": -0.3448030948638916, + "rewards/margins_std": 0.607043445110321, + "rewards/rejected": -0.6309767961502075, + "step": 3480 + }, + { + "epoch": 0.91, + "grad_norm": 5.515381891442084, + "learning_rate": 1.1348554977451131e-08, + "logits/chosen": -2.6783525943756104, + "logits/rejected": -2.716297149658203, + "logps/chosen": -283.3968200683594, + "logps/rejected": -277.41357421875, + "loss": 0.5906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29233378171920776, + "rewards/margins": 0.26629239320755005, + "rewards/margins_max": 0.7649834752082825, + "rewards/margins_min": -0.3916316032409668, + "rewards/margins_std": 0.5158937573432922, + "rewards/rejected": -0.558626115322113, + "step": 3490 + }, + { + "epoch": 0.92, + "grad_norm": 11.620194292922706, + "learning_rate": 1.06780504429958e-08, + "logits/chosen": -2.767073154449463, + "logits/rejected": -2.7504372596740723, + "logps/chosen": -376.61322021484375, + "logps/rejected": -349.7640380859375, + "loss": 0.5458, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.12740927934646606, + "rewards/margins": 0.5775086879730225, + "rewards/margins_max": 1.2688848972320557, + "rewards/margins_min": -0.0012775674695149064, + "rewards/margins_std": 0.5677827000617981, + "rewards/rejected": -0.7049180269241333, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_logits/chosen": -2.711836814880371, + "eval_logits/rejected": -2.679222583770752, + "eval_logps/chosen": -309.4359130859375, + "eval_logps/rejected": -322.02557373046875, + "eval_loss": 0.5925173163414001, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.24941037595272064, + "eval_rewards/margins": 0.34935152530670166, + "eval_rewards/margins_max": 1.2280077934265137, + "eval_rewards/margins_min": -0.5516058802604675, + "eval_rewards/margins_std": 0.6025034785270691, + "eval_rewards/rejected": -0.5987619161605835, + "eval_runtime": 418.4985, + "eval_samples_per_second": 4.779, + "eval_steps_per_second": 0.151, + "step": 3500 + }, + { + "epoch": 0.92, + "grad_norm": 9.366195002544181, + "learning_rate": 1.0027529222456754e-08, + "logits/chosen": -2.677546501159668, + "logits/rejected": -2.632441759109497, + "logps/chosen": -325.7339172363281, + "logps/rejected": -324.6513671875, + "loss": 0.6056, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3515588641166687, + "rewards/margins": 0.24877603352069855, + "rewards/margins_max": 0.8012617826461792, + "rewards/margins_min": -0.23805885016918182, + "rewards/margins_std": 0.47890353202819824, + "rewards/rejected": -0.600334882736206, + "step": 3510 + }, + { + "epoch": 0.92, + "grad_norm": 8.943503757501823, + "learning_rate": 9.397045634168766e-09, + "logits/chosen": -2.7142739295959473, + "logits/rejected": -2.756129741668701, + "logps/chosen": -300.27777099609375, + "logps/rejected": -325.3738708496094, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.36485520005226135, + "rewards/margins": 0.32245519757270813, + "rewards/margins_max": 1.0698516368865967, + "rewards/margins_min": -0.3229106068611145, + "rewards/margins_std": 0.6372673511505127, + "rewards/rejected": -0.6873103380203247, + "step": 3520 + }, + { + "epoch": 0.92, + "grad_norm": 5.3530389016125905, + "learning_rate": 8.78665232332998e-09, + "logits/chosen": -2.7878715991973877, + "logits/rejected": -2.7221691608428955, + "logps/chosen": -304.5697021484375, + "logps/rejected": -298.0013122558594, + "loss": 0.5803, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.22994065284729004, + "rewards/margins": 0.30692702531814575, + "rewards/margins_max": 0.9463814496994019, + "rewards/margins_min": -0.27998870611190796, + "rewards/margins_std": 0.5402547121047974, + "rewards/rejected": -0.536867618560791, + "step": 3530 + }, + { + "epoch": 0.93, + "grad_norm": 13.06786473549207, + "learning_rate": 8.196400257606206e-09, + "logits/chosen": -2.6720714569091797, + "logits/rejected": -2.583261013031006, + "logps/chosen": -307.7283935546875, + "logps/rejected": -296.9068603515625, + "loss": 0.6042, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.26437506079673767, + "rewards/margins": 0.3110542595386505, + "rewards/margins_max": 0.9296269416809082, + "rewards/margins_min": -0.408257395029068, + "rewards/margins_std": 0.5870265364646912, + "rewards/rejected": -0.5754293203353882, + "step": 3540 + }, + { + "epoch": 0.93, + "grad_norm": 4.947374923227614, + "learning_rate": 7.626338722875075e-09, + "logits/chosen": -2.6875720024108887, + "logits/rejected": -2.673475742340088, + "logps/chosen": -280.8397521972656, + "logps/rejected": -319.99053955078125, + "loss": 0.6098, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.17705103754997253, + "rewards/margins": 0.4027441143989563, + "rewards/margins_max": 1.0875641107559204, + "rewards/margins_min": -0.16975267231464386, + "rewards/margins_std": 0.5702294111251831, + "rewards/rejected": -0.5797951221466064, + "step": 3550 + }, + { + "epoch": 0.93, + "grad_norm": 6.958747288008049, + "learning_rate": 7.0765153191106875e-09, + "logits/chosen": -2.645277500152588, + "logits/rejected": -2.5900075435638428, + "logps/chosen": -307.39569091796875, + "logps/rejected": -331.3942565917969, + "loss": 0.5548, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2713877260684967, + "rewards/margins": 0.5076281428337097, + "rewards/margins_max": 1.0530295372009277, + "rewards/margins_min": -0.06234753876924515, + "rewards/margins_std": 0.49917277693748474, + "rewards/rejected": -0.7790158987045288, + "step": 3560 + }, + { + "epoch": 0.93, + "grad_norm": 5.831223387826132, + "learning_rate": 6.54697595640899e-09, + "logits/chosen": -2.66957426071167, + "logits/rejected": -2.6759583950042725, + "logps/chosen": -353.065185546875, + "logps/rejected": -325.03021240234375, + "loss": 0.5685, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.29838138818740845, + "rewards/margins": 0.30865147709846497, + "rewards/margins_max": 0.9437069892883301, + "rewards/margins_min": -0.1918685883283615, + "rewards/margins_std": 0.5047564506530762, + "rewards/rejected": -0.6070328950881958, + "step": 3570 + }, + { + "epoch": 0.94, + "grad_norm": 6.694433023206466, + "learning_rate": 6.037764851154425e-09, + "logits/chosen": -2.773677349090576, + "logits/rejected": -2.7026915550231934, + "logps/chosen": -341.8813781738281, + "logps/rejected": -339.5367736816406, + "loss": 0.6175, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.3384380638599396, + "rewards/margins": 0.3745259940624237, + "rewards/margins_max": 1.0870534181594849, + "rewards/margins_min": -0.3961326777935028, + "rewards/margins_std": 0.6399065256118774, + "rewards/rejected": -0.7129641175270081, + "step": 3580 + }, + { + "epoch": 0.94, + "grad_norm": 6.319544421326211, + "learning_rate": 5.548924522327747e-09, + "logits/chosen": -2.673095703125, + "logits/rejected": -2.688293218612671, + "logps/chosen": -231.4519500732422, + "logps/rejected": -261.11181640625, + "loss": 0.5851, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.2275831252336502, + "rewards/margins": 0.31233933568000793, + "rewards/margins_max": 0.8043087720870972, + "rewards/margins_min": -0.2469126433134079, + "rewards/margins_std": 0.47967201471328735, + "rewards/rejected": -0.5399224758148193, + "step": 3590 + }, + { + "epoch": 0.94, + "grad_norm": 6.157036280887109, + "learning_rate": 5.080495787955691e-09, + "logits/chosen": -2.5723958015441895, + "logits/rejected": -2.5558385848999023, + "logps/chosen": -280.0655822753906, + "logps/rejected": -335.08319091796875, + "loss": 0.5926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.22753851115703583, + "rewards/margins": 0.31550633907318115, + "rewards/margins_max": 0.999724268913269, + "rewards/margins_min": -0.4253227710723877, + "rewards/margins_std": 0.6316407322883606, + "rewards/rejected": -0.5430448651313782, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_logits/chosen": -2.71598744392395, + "eval_logits/rejected": -2.6837034225463867, + "eval_logps/chosen": -309.69091796875, + "eval_logps/rejected": -322.2860107421875, + "eval_loss": 0.5925108790397644, + "eval_rewards/accuracies": 0.7321428656578064, + "eval_rewards/chosen": -0.25196000933647156, + "eval_rewards/margins": 0.34940657019615173, + "eval_rewards/margins_max": 1.2311718463897705, + "eval_rewards/margins_min": -0.5539427399635315, + "eval_rewards/margins_std": 0.6042289137840271, + "eval_rewards/rejected": -0.6013665795326233, + "eval_runtime": 391.0383, + "eval_samples_per_second": 5.115, + "eval_steps_per_second": 0.161, + "step": 3600 + }, + { + "epoch": 0.94, + "grad_norm": 5.684683636910484, + "learning_rate": 4.632517761702814e-09, + "logits/chosen": -2.5821776390075684, + "logits/rejected": -2.6038641929626465, + "logps/chosen": -266.82366943359375, + "logps/rejected": -272.4218444824219, + "loss": 0.5947, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.2806350290775299, + "rewards/margins": 0.18183407187461853, + "rewards/margins_max": 0.7556573748588562, + "rewards/margins_min": -0.3702376186847687, + "rewards/margins_std": 0.5055059194564819, + "rewards/rejected": -0.46246910095214844, + "step": 3610 + }, + { + "epoch": 0.95, + "grad_norm": 4.123887808012746, + "learning_rate": 4.205027849605358e-09, + "logits/chosen": -2.6593799591064453, + "logits/rejected": -2.618945598602295, + "logps/chosen": -259.2201232910156, + "logps/rejected": -299.99310302734375, + "loss": 0.5795, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.23785845935344696, + "rewards/margins": 0.32053202390670776, + "rewards/margins_max": 0.9505325555801392, + "rewards/margins_min": -0.25549110770225525, + "rewards/margins_std": 0.5379956960678101, + "rewards/rejected": -0.5583904981613159, + "step": 3620 + }, + { + "epoch": 0.95, + "grad_norm": 8.431975187724774, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -2.590376853942871, + "logits/rejected": -2.5361576080322266, + "logps/chosen": -289.1084289550781, + "logps/rejected": -378.27325439453125, + "loss": 0.5938, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4047611355781555, + "rewards/margins": 0.3168661892414093, + "rewards/margins_max": 1.0654369592666626, + "rewards/margins_min": -0.29270845651626587, + "rewards/margins_std": 0.5953770279884338, + "rewards/rejected": -0.7216273546218872, + "step": 3630 + }, + { + "epoch": 0.95, + "grad_norm": 5.580846811663433, + "learning_rate": 3.411653435283157e-09, + "logits/chosen": -2.6906750202178955, + "logits/rejected": -2.6559128761291504, + "logps/chosen": -293.71551513671875, + "logps/rejected": -285.6443176269531, + "loss": 0.6131, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2762609124183655, + "rewards/margins": 0.22689524292945862, + "rewards/margins_max": 0.8799875378608704, + "rewards/margins_min": -0.41888904571533203, + "rewards/margins_std": 0.5791952610015869, + "rewards/rejected": -0.5031560659408569, + "step": 3640 + }, + { + "epoch": 0.96, + "grad_norm": 4.904850851390861, + "learning_rate": 3.0458351795936698e-09, + "logits/chosen": -2.657867670059204, + "logits/rejected": -2.688446283340454, + "logps/chosen": -307.5293884277344, + "logps/rejected": -378.6922912597656, + "loss": 0.6085, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.2467610388994217, + "rewards/margins": 0.2510070204734802, + "rewards/margins_max": 0.9476425051689148, + "rewards/margins_min": -0.3223420977592468, + "rewards/margins_std": 0.6017767190933228, + "rewards/rejected": -0.49776801466941833, + "step": 3650 + }, + { + "epoch": 0.96, + "grad_norm": 5.589750938258145, + "learning_rate": 2.700637525598598e-09, + "logits/chosen": -2.8244712352752686, + "logits/rejected": -2.8069536685943604, + "logps/chosen": -314.80596923828125, + "logps/rejected": -311.9951171875, + "loss": 0.6099, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3564375042915344, + "rewards/margins": 0.11295261234045029, + "rewards/margins_max": 0.6051047444343567, + "rewards/margins_min": -0.5534665584564209, + "rewards/margins_std": 0.5239076614379883, + "rewards/rejected": -0.4693901538848877, + "step": 3660 + }, + { + "epoch": 0.96, + "grad_norm": 7.725092332405448, + "learning_rate": 2.3760892972027324e-09, + "logits/chosen": -2.6816861629486084, + "logits/rejected": -2.671363353729248, + "logps/chosen": -312.21136474609375, + "logps/rejected": -368.08270263671875, + "loss": 0.5669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.1678880751132965, + "rewards/margins": 0.46078020334243774, + "rewards/margins_max": 1.3308370113372803, + "rewards/margins_min": -0.19109635055065155, + "rewards/margins_std": 0.7035477161407471, + "rewards/rejected": -0.6286682486534119, + "step": 3670 + }, + { + "epoch": 0.96, + "grad_norm": 4.390601758793581, + "learning_rate": 2.0722175940897645e-09, + "logits/chosen": -2.6544220447540283, + "logits/rejected": -2.6338818073272705, + "logps/chosen": -313.9645080566406, + "logps/rejected": -336.74053955078125, + "loss": 0.6051, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19321490824222565, + "rewards/margins": 0.41402778029441833, + "rewards/margins_max": 1.0859754085540771, + "rewards/margins_min": -0.17861047387123108, + "rewards/margins_std": 0.5534173250198364, + "rewards/rejected": -0.6072427034378052, + "step": 3680 + }, + { + "epoch": 0.97, + "grad_norm": 5.943484211944768, + "learning_rate": 1.7890477894593748e-09, + "logits/chosen": -2.6832566261291504, + "logits/rejected": -2.666738748550415, + "logps/chosen": -352.7740173339844, + "logps/rejected": -311.6842956542969, + "loss": 0.5596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21857281029224396, + "rewards/margins": 0.408399760723114, + "rewards/margins_max": 0.9420955777168274, + "rewards/margins_min": -0.19959206879138947, + "rewards/margins_std": 0.5040971636772156, + "rewards/rejected": -0.6269725561141968, + "step": 3690 + }, + { + "epoch": 0.97, + "grad_norm": 5.998762320974629, + "learning_rate": 1.5266035279088708e-09, + "logits/chosen": -2.8190507888793945, + "logits/rejected": -2.7878916263580322, + "logps/chosen": -363.9957580566406, + "logps/rejected": -346.9015808105469, + "loss": 0.6096, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3295542597770691, + "rewards/margins": 0.3019460141658783, + "rewards/margins_max": 0.8790268898010254, + "rewards/margins_min": -0.3847871422767639, + "rewards/margins_std": 0.5682782530784607, + "rewards/rejected": -0.6315003633499146, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_logits/chosen": -2.711902141571045, + "eval_logits/rejected": -2.679262638092041, + "eval_logps/chosen": -309.6683349609375, + "eval_logps/rejected": -322.296630859375, + "eval_loss": 0.5925595164299011, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.25173425674438477, + "eval_rewards/margins": 0.34973862767219543, + "eval_rewards/margins_max": 1.2312732934951782, + "eval_rewards/margins_min": -0.5539093613624573, + "eval_rewards/margins_std": 0.6042241454124451, + "eval_rewards/rejected": -0.601472795009613, + "eval_runtime": 417.4121, + "eval_samples_per_second": 4.791, + "eval_steps_per_second": 0.151, + "step": 3700 + }, + { + "epoch": 0.97, + "grad_norm": 6.459355263202339, + "learning_rate": 1.2849067234584621e-09, + "logits/chosen": -2.5690150260925293, + "logits/rejected": -2.587456464767456, + "logps/chosen": -320.1042175292969, + "logps/rejected": -328.36163330078125, + "loss": 0.5926, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1667606681585312, + "rewards/margins": 0.3529542088508606, + "rewards/margins_max": 0.977542519569397, + "rewards/margins_min": -0.15057794749736786, + "rewards/margins_std": 0.5112577676773071, + "rewards/rejected": -0.519714891910553, + "step": 3710 + }, + { + "epoch": 0.97, + "grad_norm": 6.065392187761433, + "learning_rate": 1.0639775577218625e-09, + "logits/chosen": -2.7068657875061035, + "logits/rejected": -2.708185911178589, + "logps/chosen": -319.75604248046875, + "logps/rejected": -318.9110107421875, + "loss": 0.5683, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19278459250926971, + "rewards/margins": 0.33805471658706665, + "rewards/margins_max": 1.0843052864074707, + "rewards/margins_min": -0.323042631149292, + "rewards/margins_std": 0.6268844604492188, + "rewards/rejected": -0.5308393239974976, + "step": 3720 + }, + { + "epoch": 0.98, + "grad_norm": 5.450830014920893, + "learning_rate": 8.638344782207485e-10, + "logits/chosen": -2.8192014694213867, + "logits/rejected": -2.7536661624908447, + "logps/chosen": -366.6589660644531, + "logps/rejected": -323.0194091796875, + "loss": 0.6117, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.26820605993270874, + "rewards/margins": 0.3417043685913086, + "rewards/margins_max": 1.1187412738800049, + "rewards/margins_min": -0.41833925247192383, + "rewards/margins_std": 0.687247633934021, + "rewards/rejected": -0.6099103689193726, + "step": 3730 + }, + { + "epoch": 0.98, + "grad_norm": 4.962181692220312, + "learning_rate": 6.844941968447149e-10, + "logits/chosen": -2.7343554496765137, + "logits/rejected": -2.685408353805542, + "logps/chosen": -355.3741455078125, + "logps/rejected": -378.8301086425781, + "loss": 0.5615, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2062954157590866, + "rewards/margins": 0.29231908917427063, + "rewards/margins_max": 0.9197826385498047, + "rewards/margins_min": -0.2937725782394409, + "rewards/margins_std": 0.5359052419662476, + "rewards/rejected": -0.49861449003219604, + "step": 3740 + }, + { + "epoch": 0.98, + "grad_norm": 6.4302736481002984, + "learning_rate": 5.25971688455612e-10, + "logits/chosen": -2.6765501499176025, + "logits/rejected": -2.7161264419555664, + "logps/chosen": -329.92041015625, + "logps/rejected": -313.0812683105469, + "loss": 0.5786, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36000367999076843, + "rewards/margins": 0.12194408476352692, + "rewards/margins_max": 0.6862843632698059, + "rewards/margins_min": -0.41331878304481506, + "rewards/margins_std": 0.49094897508621216, + "rewards/rejected": -0.48194774985313416, + "step": 3750 + }, + { + "epoch": 0.98, + "grad_norm": 5.256887912936208, + "learning_rate": 3.882801896372967e-10, + "logits/chosen": -2.6834053993225098, + "logits/rejected": -2.7166876792907715, + "logps/chosen": -261.3063049316406, + "logps/rejected": -307.7838439941406, + "loss": 0.5494, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2648637890815735, + "rewards/margins": 0.43512624502182007, + "rewards/margins_max": 1.2276852130889893, + "rewards/margins_min": -0.2708841860294342, + "rewards/margins_std": 0.6867440342903137, + "rewards/rejected": -0.6999900937080383, + "step": 3760 + }, + { + "epoch": 0.99, + "grad_norm": 11.77513259197508, + "learning_rate": 2.714311975902661e-10, + "logits/chosen": -2.696057081222534, + "logits/rejected": -2.6460297107696533, + "logps/chosen": -325.557373046875, + "logps/rejected": -396.30078125, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18080511689186096, + "rewards/margins": 0.4406723380088806, + "rewards/margins_max": 1.0763943195343018, + "rewards/margins_min": -0.3152617812156677, + "rewards/margins_std": 0.5964141488075256, + "rewards/rejected": -0.6214774250984192, + "step": 3770 + }, + { + "epoch": 0.99, + "grad_norm": 12.194026197378115, + "learning_rate": 1.754344691717591e-10, + "logits/chosen": -2.6871161460876465, + "logits/rejected": -2.6279549598693848, + "logps/chosen": -313.11083984375, + "logps/rejected": -330.28546142578125, + "loss": 0.6094, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.16916698217391968, + "rewards/margins": 0.3526850938796997, + "rewards/margins_max": 1.220383644104004, + "rewards/margins_min": -0.3871513605117798, + "rewards/margins_std": 0.7053273916244507, + "rewards/rejected": -0.5218520760536194, + "step": 3780 + }, + { + "epoch": 0.99, + "grad_norm": 8.574382917705133, + "learning_rate": 1.0029802008096333e-10, + "logits/chosen": -2.723098039627075, + "logits/rejected": -2.7211620807647705, + "logps/chosen": -291.99481201171875, + "logps/rejected": -288.8013000488281, + "loss": 0.6151, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.4443100094795227, + "rewards/margins": 0.2617095410823822, + "rewards/margins_max": 1.1447874307632446, + "rewards/margins_min": -0.5205368995666504, + "rewards/margins_std": 0.7487602233886719, + "rewards/rejected": -0.7060195207595825, + "step": 3790 + }, + { + "epoch": 0.99, + "grad_norm": 11.982128823076994, + "learning_rate": 4.602812418974533e-11, + "logits/chosen": -2.589489459991455, + "logits/rejected": -2.5803260803222656, + "logps/chosen": -287.29522705078125, + "logps/rejected": -249.25637817382812, + "loss": 0.5865, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.15798085927963257, + "rewards/margins": 0.34595808386802673, + "rewards/margins_max": 0.8616958856582642, + "rewards/margins_min": -0.1021503433585167, + "rewards/margins_std": 0.43077605962753296, + "rewards/rejected": -0.5039389133453369, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_logits/chosen": -2.712641477584839, + "eval_logits/rejected": -2.6800715923309326, + "eval_logps/chosen": -309.6683654785156, + "eval_logps/rejected": -322.34332275390625, + "eval_loss": 0.5924710631370544, + "eval_rewards/accuracies": 0.7341269850730896, + "eval_rewards/chosen": -0.25173452496528625, + "eval_rewards/margins": 0.35020482540130615, + "eval_rewards/margins_max": 1.2316042184829712, + "eval_rewards/margins_min": -0.5546290278434753, + "eval_rewards/margins_std": 0.6038333773612976, + "eval_rewards/rejected": -0.6019393801689148, + "eval_runtime": 446.5167, + "eval_samples_per_second": 4.479, + "eval_steps_per_second": 0.141, + "step": 3800 + }, + { + "epoch": 1.0, + "grad_norm": 4.300346608941104, + "learning_rate": 1.2629313018819309e-11, + "logits/chosen": -2.74137020111084, + "logits/rejected": -2.7238645553588867, + "logps/chosen": -339.169677734375, + "logps/rejected": -319.907470703125, + "loss": 0.5959, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2621849477291107, + "rewards/margins": 0.2273492068052292, + "rewards/margins_max": 0.8272867202758789, + "rewards/margins_min": -0.27956342697143555, + "rewards/margins_std": 0.49359187483787537, + "rewards/rejected": -0.4895341992378235, + "step": 3810 + }, + { + "epoch": 1.0, + "grad_norm": 9.115278526315292, + "learning_rate": 1.0437535929996855e-13, + "logits/chosen": -2.672020673751831, + "logits/rejected": -2.67209792137146, + "logps/chosen": -372.24615478515625, + "logps/rejected": -273.6216125488281, + "loss": 0.5833, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3809719681739807, + "rewards/margins": 0.1818961203098297, + "rewards/margins_max": 0.8466414213180542, + "rewards/margins_min": -0.474606454372406, + "rewards/margins_std": 0.603650689125061, + "rewards/rejected": -0.5628681182861328, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.619671463092813, + "train_runtime": 44477.573, + "train_samples_per_second": 1.374, + "train_steps_per_second": 0.086 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}