{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9159905783826223,
  "eval_steps": 500,
  "global_step": 3500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 50.2683219909668,
      "kl": 0.0,
      "learning_rate": 4.998691442030882e-07,
      "logps/chosen": -205.0401611328125,
      "logps/rejected": -172.3661346435547,
      "loss": 0.4375,
      "rewards/chosen": 0.0,
      "rewards/margins": 0.0,
      "rewards/rejected": 0.0,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 52.03371810913086,
      "kl": 0.8273243308067322,
      "learning_rate": 4.997382884061764e-07,
      "logps/chosen": -228.83558654785156,
      "logps/rejected": -244.7559051513672,
      "loss": 0.3796,
      "rewards/chosen": 0.06499359011650085,
      "rewards/margins": 0.07474303990602493,
      "rewards/rejected": -0.009749448858201504,
      "step": 2
    },
    {
      "epoch": 0.0,
      "grad_norm": 47.92995834350586,
      "kl": 0.0,
      "learning_rate": 4.996074326092646e-07,
      "logps/chosen": -222.45486450195312,
      "logps/rejected": -224.06056213378906,
      "loss": 0.2934,
      "rewards/chosen": 0.3142276406288147,
      "rewards/margins": 0.35068318247795105,
      "rewards/rejected": -0.036455538123846054,
      "step": 3
    },
    {
      "epoch": 0.0,
      "grad_norm": 48.66846466064453,
      "kl": 0.03168666362762451,
      "learning_rate": 4.994765768123528e-07,
      "logps/chosen": -215.8352508544922,
      "logps/rejected": -168.82923889160156,
      "loss": 0.4269,
      "rewards/chosen": 0.29763078689575195,
      "rewards/margins": -0.13886231184005737,
      "rewards/rejected": 0.4364930987358093,
      "step": 4
    },
    {
      "epoch": 0.0,
      "grad_norm": 49.69014358520508,
      "kl": 0.0,
      "learning_rate": 4.99345721015441e-07,
      "logps/chosen": -260.2557067871094,
      "logps/rejected": -195.94869995117188,
      "loss": 0.3903,
      "rewards/chosen": -0.442703515291214,
      "rewards/margins": 0.013102918863296509,
      "rewards/rejected": -0.4558064341545105,
      "step": 5
    },
    {
      "epoch": 0.0,
      "grad_norm": 59.5657958984375,
      "kl": 0.0,
      "learning_rate": 4.992148652185292e-07,
      "logps/chosen": -278.39373779296875,
      "logps/rejected": -198.203125,
      "loss": 0.562,
      "rewards/chosen": -0.8296091556549072,
      "rewards/margins": -0.6596475839614868,
      "rewards/rejected": -0.1699615716934204,
      "step": 6
    },
    {
      "epoch": 0.0,
      "grad_norm": 43.889366149902344,
      "kl": 0.0,
      "learning_rate": 4.990840094216174e-07,
      "logps/chosen": -270.32000732421875,
      "logps/rejected": -178.87966918945312,
      "loss": 0.494,
      "rewards/chosen": -0.641089677810669,
      "rewards/margins": -1.1119567155838013,
      "rewards/rejected": 0.47086700797080994,
      "step": 7
    },
    {
      "epoch": 0.0,
      "grad_norm": 47.326114654541016,
      "kl": 0.1817803978919983,
      "learning_rate": 4.989531536247055e-07,
      "logps/chosen": -211.3541259765625,
      "logps/rejected": -253.94935607910156,
      "loss": 0.4547,
      "rewards/chosen": -0.5205121040344238,
      "rewards/margins": -0.384090781211853,
      "rewards/rejected": -0.136421337723732,
      "step": 8
    },
    {
      "epoch": 0.0,
      "grad_norm": 48.22457504272461,
      "kl": 0.0,
      "learning_rate": 4.988222978277937e-07,
      "logps/chosen": -273.5563659667969,
      "logps/rejected": -226.06747436523438,
      "loss": 0.3257,
      "rewards/chosen": -0.0483764223754406,
      "rewards/margins": 0.6438623666763306,
      "rewards/rejected": -0.6922388076782227,
      "step": 9
    },
    {
      "epoch": 0.0,
      "grad_norm": 42.13737869262695,
      "kl": 0.0,
      "learning_rate": 4.986914420308819e-07,
      "logps/chosen": -192.82940673828125,
      "logps/rejected": -182.11260986328125,
      "loss": 0.4647,
      "rewards/chosen": -0.48483985662460327,
      "rewards/margins": 0.11868029832839966,
      "rewards/rejected": -0.6035201549530029,
      "step": 10
    },
    {
      "epoch": 0.0,
      "grad_norm": 46.52305603027344,
      "kl": 0.0,
      "learning_rate": 4.985605862339701e-07,
      "logps/chosen": -257.1351318359375,
      "logps/rejected": -213.33688354492188,
      "loss": 0.3875,
      "rewards/chosen": 0.08887295424938202,
      "rewards/margins": 0.19311153888702393,
      "rewards/rejected": -0.1042385920882225,
      "step": 11
    },
    {
      "epoch": 0.0,
      "grad_norm": 46.00929260253906,
      "kl": 0.0,
      "learning_rate": 4.984297304370583e-07,
      "logps/chosen": -206.80709838867188,
      "logps/rejected": -302.8717956542969,
      "loss": 0.3693,
      "rewards/chosen": -0.29487642645835876,
      "rewards/margins": 0.7638503313064575,
      "rewards/rejected": -1.0587267875671387,
      "step": 12
    },
    {
      "epoch": 0.0,
      "grad_norm": 66.953369140625,
      "kl": 0.0,
      "learning_rate": 4.982988746401465e-07,
      "logps/chosen": -281.1011047363281,
      "logps/rejected": -272.95831298828125,
      "loss": 0.4793,
      "rewards/chosen": -1.2275511026382446,
      "rewards/margins": 0.7767127752304077,
      "rewards/rejected": -2.0042638778686523,
      "step": 13
    },
    {
      "epoch": 0.0,
      "grad_norm": 42.68067932128906,
      "kl": 0.0,
      "learning_rate": 4.981680188432347e-07,
      "logps/chosen": -197.5496826171875,
      "logps/rejected": -233.1874542236328,
      "loss": 0.3359,
      "rewards/chosen": -0.7852997183799744,
      "rewards/margins": -0.11471927165985107,
      "rewards/rejected": -0.6705804467201233,
      "step": 14
    },
    {
      "epoch": 0.0,
      "grad_norm": 36.0564079284668,
      "kl": 0.0,
      "learning_rate": 4.980371630463229e-07,
      "logps/chosen": -159.7126922607422,
      "logps/rejected": -230.08297729492188,
      "loss": 0.5013,
      "rewards/chosen": -0.8807596564292908,
      "rewards/margins": 0.22696465253829956,
      "rewards/rejected": -1.1077243089675903,
      "step": 15
    },
    {
      "epoch": 0.0,
      "grad_norm": 42.2777214050293,
      "kl": 0.0,
      "learning_rate": 4.979063072494111e-07,
      "logps/chosen": -285.4176330566406,
      "logps/rejected": -212.86843872070312,
      "loss": 0.3413,
      "rewards/chosen": -0.02461055852472782,
      "rewards/margins": 1.1725351810455322,
      "rewards/rejected": -1.197145700454712,
      "step": 16
    },
    {
      "epoch": 0.0,
      "grad_norm": 37.475677490234375,
      "kl": 0.0,
      "learning_rate": 4.977754514524993e-07,
      "logps/chosen": -277.5311279296875,
      "logps/rejected": -273.7735290527344,
      "loss": 0.3365,
      "rewards/chosen": -0.8537225127220154,
      "rewards/margins": 0.5020245909690857,
      "rewards/rejected": -1.355747103691101,
      "step": 17
    },
    {
      "epoch": 0.0,
      "grad_norm": 50.058441162109375,
      "kl": 0.0,
      "learning_rate": 4.976445956555875e-07,
      "logps/chosen": -255.5559539794922,
      "logps/rejected": -251.7951202392578,
      "loss": 0.3898,
      "rewards/chosen": -1.131912112236023,
      "rewards/margins": -0.5449322462081909,
      "rewards/rejected": -0.586979866027832,
      "step": 18
    },
    {
      "epoch": 0.0,
      "grad_norm": 48.28940200805664,
      "kl": 0.0,
      "learning_rate": 4.975137398586757e-07,
      "logps/chosen": -169.98117065429688,
      "logps/rejected": -234.54136657714844,
      "loss": 0.4461,
      "rewards/chosen": -0.8914527893066406,
      "rewards/margins": 0.0006075501441955566,
      "rewards/rejected": -0.8920603394508362,
      "step": 19
    },
    {
      "epoch": 0.01,
      "grad_norm": 52.11145782470703,
      "kl": 0.0,
      "learning_rate": 4.973828840617639e-07,
      "logps/chosen": -265.4768371582031,
      "logps/rejected": -242.55845642089844,
      "loss": 0.4579,
      "rewards/chosen": -1.322069764137268,
      "rewards/margins": -0.0047348737716674805,
      "rewards/rejected": -1.3173348903656006,
      "step": 20
    },
    {
      "epoch": 0.01,
      "grad_norm": 37.193660736083984,
      "kl": 0.0,
      "learning_rate": 4.972520282648521e-07,
      "logps/chosen": -219.7395477294922,
      "logps/rejected": -177.62509155273438,
      "loss": 0.3403,
      "rewards/chosen": 0.3125017583370209,
      "rewards/margins": 0.7497882843017578,
      "rewards/rejected": -0.4372865557670593,
      "step": 21
    },
    {
      "epoch": 0.01,
      "grad_norm": 47.88676452636719,
      "kl": 0.0,
      "learning_rate": 4.971211724679403e-07,
      "logps/chosen": -342.21783447265625,
      "logps/rejected": -224.13197326660156,
      "loss": 0.4,
      "rewards/chosen": -0.6000944375991821,
      "rewards/margins": 1.1481363773345947,
      "rewards/rejected": -1.7482308149337769,
      "step": 22
    },
    {
      "epoch": 0.01,
      "grad_norm": 35.711021423339844,
      "kl": 0.0,
      "learning_rate": 4.969903166710285e-07,
      "logps/chosen": -149.5780487060547,
      "logps/rejected": -152.07003784179688,
      "loss": 0.3721,
      "rewards/chosen": -0.5182877779006958,
      "rewards/margins": 0.29111266136169434,
      "rewards/rejected": -0.8094004392623901,
      "step": 23
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.70458221435547,
      "kl": 0.0,
      "learning_rate": 4.968594608741167e-07,
      "logps/chosen": -249.3180389404297,
      "logps/rejected": -186.05471801757812,
      "loss": 0.3619,
      "rewards/chosen": -0.8323618769645691,
      "rewards/margins": 0.7159193158149719,
      "rewards/rejected": -1.548281192779541,
      "step": 24
    },
    {
      "epoch": 0.01,
      "grad_norm": 42.917049407958984,
      "kl": 0.0,
      "learning_rate": 4.967286050772049e-07,
      "logps/chosen": -177.19561767578125,
      "logps/rejected": -248.12649536132812,
      "loss": 0.4387,
      "rewards/chosen": -0.6950159668922424,
      "rewards/margins": 0.9509089589118958,
      "rewards/rejected": -1.6459249258041382,
      "step": 25
    },
    {
      "epoch": 0.01,
      "grad_norm": 39.620574951171875,
      "kl": 0.0,
      "learning_rate": 4.965977492802931e-07,
      "logps/chosen": -315.2325439453125,
      "logps/rejected": -218.47903442382812,
      "loss": 0.3679,
      "rewards/chosen": -1.3541030883789062,
      "rewards/margins": 0.7431886196136475,
      "rewards/rejected": -2.0972917079925537,
      "step": 26
    },
    {
      "epoch": 0.01,
      "grad_norm": 37.08391571044922,
      "kl": 0.0,
      "learning_rate": 4.964668934833813e-07,
      "logps/chosen": -230.52078247070312,
      "logps/rejected": -221.7032470703125,
      "loss": 0.3973,
      "rewards/chosen": -1.261672019958496,
      "rewards/margins": 0.273656964302063,
      "rewards/rejected": -1.535328984260559,
      "step": 27
    },
    {
      "epoch": 0.01,
      "grad_norm": 41.10235595703125,
      "kl": 0.0,
      "learning_rate": 4.963360376864695e-07,
      "logps/chosen": -191.61048889160156,
      "logps/rejected": -193.62025451660156,
      "loss": 0.3261,
      "rewards/chosen": -1.396644949913025,
      "rewards/margins": -0.22787630558013916,
      "rewards/rejected": -1.1687686443328857,
      "step": 28
    },
    {
      "epoch": 0.01,
      "grad_norm": 42.94221115112305,
      "kl": 0.0,
      "learning_rate": 4.962051818895577e-07,
      "logps/chosen": -275.1850891113281,
      "logps/rejected": -223.00143432617188,
      "loss": 0.3074,
      "rewards/chosen": -0.8943480253219604,
      "rewards/margins": 0.9237412214279175,
      "rewards/rejected": -1.818089246749878,
      "step": 29
    },
    {
      "epoch": 0.01,
      "grad_norm": 55.69268035888672,
      "kl": 0.0,
      "learning_rate": 4.960743260926459e-07,
      "logps/chosen": -266.8174133300781,
      "logps/rejected": -200.5430908203125,
      "loss": 0.3224,
      "rewards/chosen": -0.08692857623100281,
      "rewards/margins": 1.410178542137146,
      "rewards/rejected": -1.4971071481704712,
      "step": 30
    },
    {
      "epoch": 0.01,
      "grad_norm": 36.69532012939453,
      "kl": 0.0,
      "learning_rate": 4.959434702957341e-07,
      "logps/chosen": -239.9783935546875,
      "logps/rejected": -285.19903564453125,
      "loss": 0.2192,
      "rewards/chosen": -0.2260018140077591,
      "rewards/margins": 1.6547338962554932,
      "rewards/rejected": -1.8807357549667358,
      "step": 31
    },
    {
      "epoch": 0.01,
      "grad_norm": 42.48405838012695,
      "kl": 0.0,
      "learning_rate": 4.958126144988223e-07,
      "logps/chosen": -277.462890625,
      "logps/rejected": -230.81143188476562,
      "loss": 0.4113,
      "rewards/chosen": -1.1050209999084473,
      "rewards/margins": 1.3367903232574463,
      "rewards/rejected": -2.4418113231658936,
      "step": 32
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.719364166259766,
      "kl": 0.0,
      "learning_rate": 4.956817587019104e-07,
      "logps/chosen": -243.24012756347656,
      "logps/rejected": -265.06121826171875,
      "loss": 0.3527,
      "rewards/chosen": -1.113293170928955,
      "rewards/margins": 0.796079158782959,
      "rewards/rejected": -1.909372329711914,
      "step": 33
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.56840133666992,
      "kl": 0.0,
      "learning_rate": 4.955509029049986e-07,
      "logps/chosen": -357.2673034667969,
      "logps/rejected": -271.1274108886719,
      "loss": 0.3782,
      "rewards/chosen": -1.7618191242218018,
      "rewards/margins": -0.4248075485229492,
      "rewards/rejected": -1.3370115756988525,
      "step": 34
    },
    {
      "epoch": 0.01,
      "grad_norm": 45.011009216308594,
      "kl": 0.0,
      "learning_rate": 4.954200471080868e-07,
      "logps/chosen": -202.9520721435547,
      "logps/rejected": -186.841796875,
      "loss": 0.426,
      "rewards/chosen": -0.8729759454727173,
      "rewards/margins": 0.19871211051940918,
      "rewards/rejected": -1.0716880559921265,
      "step": 35
    },
    {
      "epoch": 0.01,
      "grad_norm": 39.886924743652344,
      "kl": 0.0,
      "learning_rate": 4.95289191311175e-07,
      "logps/chosen": -165.83241271972656,
      "logps/rejected": -280.1267395019531,
      "loss": 0.2954,
      "rewards/chosen": -0.43662160634994507,
      "rewards/margins": 1.597978115081787,
      "rewards/rejected": -2.034599781036377,
      "step": 36
    },
    {
      "epoch": 0.01,
      "grad_norm": 26.526390075683594,
      "kl": 0.0,
      "learning_rate": 4.951583355142632e-07,
      "logps/chosen": -159.53414916992188,
      "logps/rejected": -185.03199768066406,
      "loss": 0.4893,
      "rewards/chosen": -1.8748741149902344,
      "rewards/margins": 0.013196945190429688,
      "rewards/rejected": -1.888071060180664,
      "step": 37
    },
    {
      "epoch": 0.01,
      "grad_norm": 25.534835815429688,
      "kl": 0.0,
      "learning_rate": 4.950274797173514e-07,
      "logps/chosen": -200.60745239257812,
      "logps/rejected": -214.85702514648438,
      "loss": 0.4536,
      "rewards/chosen": -2.5081725120544434,
      "rewards/margins": -0.036855220794677734,
      "rewards/rejected": -2.4713172912597656,
      "step": 38
    },
    {
      "epoch": 0.01,
      "grad_norm": 37.80105209350586,
      "kl": 0.0,
      "learning_rate": 4.948966239204396e-07,
      "logps/chosen": -280.5340576171875,
      "logps/rejected": -287.7911682128906,
      "loss": 0.4128,
      "rewards/chosen": -1.4261023998260498,
      "rewards/margins": 1.8715763092041016,
      "rewards/rejected": -3.2976787090301514,
      "step": 39
    },
    {
      "epoch": 0.01,
      "grad_norm": 43.570186614990234,
      "kl": 0.0,
      "learning_rate": 4.947657681235278e-07,
      "logps/chosen": -215.7322998046875,
      "logps/rejected": -245.70297241210938,
      "loss": 0.3551,
      "rewards/chosen": -0.5773531794548035,
      "rewards/margins": 1.1560332775115967,
      "rewards/rejected": -1.7333863973617554,
      "step": 40
    },
    {
      "epoch": 0.01,
      "grad_norm": 37.78433609008789,
      "kl": 0.0,
      "learning_rate": 4.94634912326616e-07,
      "logps/chosen": -233.8341522216797,
      "logps/rejected": -292.6675109863281,
      "loss": 0.2997,
      "rewards/chosen": -1.021287202835083,
      "rewards/margins": 0.9503535032272339,
      "rewards/rejected": -1.971640706062317,
      "step": 41
    },
    {
      "epoch": 0.01,
      "grad_norm": 36.56157302856445,
      "kl": 0.0,
      "learning_rate": 4.945040565297042e-07,
      "logps/chosen": -257.9923095703125,
      "logps/rejected": -228.11431884765625,
      "loss": 0.4875,
      "rewards/chosen": -1.3468284606933594,
      "rewards/margins": 0.05934798717498779,
      "rewards/rejected": -1.4061764478683472,
      "step": 42
    },
    {
      "epoch": 0.01,
      "grad_norm": 30.533437728881836,
      "kl": 0.0,
      "learning_rate": 4.943732007327924e-07,
      "logps/chosen": -215.95953369140625,
      "logps/rejected": -383.3542175292969,
      "loss": 0.3071,
      "rewards/chosen": -0.46231940388679504,
      "rewards/margins": 3.4177749156951904,
      "rewards/rejected": -3.880094289779663,
      "step": 43
    },
    {
      "epoch": 0.01,
      "grad_norm": 32.82918930053711,
      "kl": 0.0,
      "learning_rate": 4.942423449358806e-07,
      "logps/chosen": -203.03924560546875,
      "logps/rejected": -230.98194885253906,
      "loss": 0.2845,
      "rewards/chosen": 0.07298585772514343,
      "rewards/margins": 2.675164222717285,
      "rewards/rejected": -2.6021783351898193,
      "step": 44
    },
    {
      "epoch": 0.01,
      "grad_norm": 29.841644287109375,
      "kl": 0.0,
      "learning_rate": 4.941114891389688e-07,
      "logps/chosen": -197.1206817626953,
      "logps/rejected": -178.6546173095703,
      "loss": 0.3349,
      "rewards/chosen": -1.011289119720459,
      "rewards/margins": 1.0256037712097168,
      "rewards/rejected": -2.036892890930176,
      "step": 45
    },
    {
      "epoch": 0.01,
      "grad_norm": 39.413970947265625,
      "kl": 0.0,
      "learning_rate": 4.93980633342057e-07,
      "logps/chosen": -243.208984375,
      "logps/rejected": -170.91363525390625,
      "loss": 0.3565,
      "rewards/chosen": -1.0538300275802612,
      "rewards/margins": 0.08940017223358154,
      "rewards/rejected": -1.1432301998138428,
      "step": 46
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.43966293334961,
      "kl": 0.0,
      "learning_rate": 4.938497775451452e-07,
      "logps/chosen": -243.11795043945312,
      "logps/rejected": -250.5272674560547,
      "loss": 0.3056,
      "rewards/chosen": -1.732240080833435,
      "rewards/margins": 1.5034791231155396,
      "rewards/rejected": -3.2357192039489746,
      "step": 47
    },
    {
      "epoch": 0.01,
      "grad_norm": 48.8133544921875,
      "kl": 0.0,
      "learning_rate": 4.937189217482334e-07,
      "logps/chosen": -225.61911010742188,
      "logps/rejected": -303.3346252441406,
      "loss": 0.4549,
      "rewards/chosen": -1.1798169612884521,
      "rewards/margins": 0.7035973072052002,
      "rewards/rejected": -1.8834142684936523,
      "step": 48
    },
    {
      "epoch": 0.01,
      "grad_norm": 28.380380630493164,
      "kl": 0.0,
      "learning_rate": 4.935880659513216e-07,
      "logps/chosen": -164.39273071289062,
      "logps/rejected": -314.02972412109375,
      "loss": 0.4319,
      "rewards/chosen": -1.009232521057129,
      "rewards/margins": 0.519694447517395,
      "rewards/rejected": -1.528926968574524,
      "step": 49
    },
    {
      "epoch": 0.01,
      "grad_norm": 37.58095169067383,
      "kl": 0.0,
      "learning_rate": 4.934572101544098e-07,
      "logps/chosen": -348.113525390625,
      "logps/rejected": -210.80386352539062,
      "loss": 0.4788,
      "rewards/chosen": -3.091629981994629,
      "rewards/margins": -2.1929235458374023,
      "rewards/rejected": -0.8987063765525818,
      "step": 50
    },
    {
      "epoch": 0.01,
      "grad_norm": 36.6107063293457,
      "kl": 0.0,
      "learning_rate": 4.933263543574981e-07,
      "logps/chosen": -212.70262145996094,
      "logps/rejected": -291.3959045410156,
      "loss": 0.4458,
      "rewards/chosen": -1.1933636665344238,
      "rewards/margins": 0.39327001571655273,
      "rewards/rejected": -1.5866336822509766,
      "step": 51
    },
    {
      "epoch": 0.01,
      "grad_norm": 40.384883880615234,
      "kl": 0.0,
      "learning_rate": 4.931954985605863e-07,
      "logps/chosen": -257.5487976074219,
      "logps/rejected": -211.3709259033203,
      "loss": 0.2835,
      "rewards/chosen": -1.0930713415145874,
      "rewards/margins": 0.26484453678131104,
      "rewards/rejected": -1.3579158782958984,
      "step": 52
    },
    {
      "epoch": 0.01,
      "grad_norm": 35.08112716674805,
      "kl": 0.0,
      "learning_rate": 4.930646427636745e-07,
      "logps/chosen": -231.74722290039062,
      "logps/rejected": -246.1368865966797,
      "loss": 0.4926,
      "rewards/chosen": -1.5169533491134644,
      "rewards/margins": 0.9005342721939087,
      "rewards/rejected": -2.417487621307373,
      "step": 53
    },
    {
      "epoch": 0.01,
      "grad_norm": 39.13615036010742,
      "kl": 0.0,
      "learning_rate": 4.929337869667627e-07,
      "logps/chosen": -233.39125061035156,
      "logps/rejected": -182.48666381835938,
      "loss": 0.4699,
      "rewards/chosen": -1.9513063430786133,
      "rewards/margins": -0.586869478225708,
      "rewards/rejected": -1.3644368648529053,
      "step": 54
    },
    {
      "epoch": 0.01,
      "grad_norm": 43.6704216003418,
      "kl": 0.0,
      "learning_rate": 4.928029311698508e-07,
      "logps/chosen": -302.2059020996094,
      "logps/rejected": -216.83807373046875,
      "loss": 0.4721,
      "rewards/chosen": -1.3140746355056763,
      "rewards/margins": -0.25487399101257324,
      "rewards/rejected": -1.059200644493103,
      "step": 55
    },
    {
      "epoch": 0.01,
      "grad_norm": 44.78774642944336,
      "kl": 0.0,
      "learning_rate": 4.92672075372939e-07,
      "logps/chosen": -310.4451599121094,
      "logps/rejected": -285.1246337890625,
      "loss": 0.2802,
      "rewards/chosen": 0.11061844974756241,
      "rewards/margins": 2.974390983581543,
      "rewards/rejected": -2.8637726306915283,
      "step": 56
    },
    {
      "epoch": 0.01,
      "grad_norm": 46.488983154296875,
      "kl": 0.0,
      "learning_rate": 4.925412195760272e-07,
      "logps/chosen": -257.123291015625,
      "logps/rejected": -258.96588134765625,
      "loss": 0.4438,
      "rewards/chosen": -1.130954384803772,
      "rewards/margins": 2.214172840118408,
      "rewards/rejected": -3.3451271057128906,
      "step": 57
    },
    {
      "epoch": 0.02,
      "grad_norm": 38.61354064941406,
      "kl": 0.0,
      "learning_rate": 4.924103637791154e-07,
      "logps/chosen": -175.5467071533203,
      "logps/rejected": -285.81341552734375,
      "loss": 0.4081,
      "rewards/chosen": -1.0343022346496582,
      "rewards/margins": 0.7621434926986694,
      "rewards/rejected": -1.7964457273483276,
      "step": 58
    },
    {
      "epoch": 0.02,
      "grad_norm": 32.77366256713867,
      "kl": 0.0,
      "learning_rate": 4.922795079822035e-07,
      "logps/chosen": -166.06712341308594,
      "logps/rejected": -210.75096130371094,
      "loss": 0.4244,
      "rewards/chosen": -0.37483546137809753,
      "rewards/margins": 1.1932029724121094,
      "rewards/rejected": -1.5680384635925293,
      "step": 59
    },
    {
      "epoch": 0.02,
      "grad_norm": 42.49043273925781,
      "kl": 0.0,
      "learning_rate": 4.921486521852917e-07,
      "logps/chosen": -252.02198791503906,
      "logps/rejected": -188.50100708007812,
      "loss": 0.3225,
      "rewards/chosen": 0.3132118582725525,
      "rewards/margins": 1.1008962392807007,
      "rewards/rejected": -0.7876843810081482,
      "step": 60
    },
    {
      "epoch": 0.02,
      "grad_norm": 32.550514221191406,
      "kl": 0.0,
      "learning_rate": 4.920177963883799e-07,
      "logps/chosen": -186.67495727539062,
      "logps/rejected": -221.72752380371094,
      "loss": 0.3053,
      "rewards/chosen": -0.44850480556488037,
      "rewards/margins": 2.5049924850463867,
      "rewards/rejected": -2.9534971714019775,
      "step": 61
    },
    {
      "epoch": 0.02,
      "grad_norm": 44.155418395996094,
      "kl": 0.0,
      "learning_rate": 4.918869405914681e-07,
      "logps/chosen": -225.32420349121094,
      "logps/rejected": -250.5755615234375,
      "loss": 0.4224,
      "rewards/chosen": -0.37531524896621704,
      "rewards/margins": 1.9962267875671387,
      "rewards/rejected": -2.371541976928711,
      "step": 62
    },
    {
      "epoch": 0.02,
      "grad_norm": 47.24485778808594,
      "kl": 0.0,
      "learning_rate": 4.917560847945563e-07,
      "logps/chosen": -195.40579223632812,
      "logps/rejected": -278.23345947265625,
      "loss": 0.3477,
      "rewards/chosen": -0.42761945724487305,
      "rewards/margins": 2.3172523975372314,
      "rewards/rejected": -2.7448718547821045,
      "step": 63
    },
    {
      "epoch": 0.02,
      "grad_norm": 38.27678680419922,
      "kl": 0.0,
      "learning_rate": 4.916252289976446e-07,
      "logps/chosen": -220.49884033203125,
      "logps/rejected": -188.00048828125,
      "loss": 0.4889,
      "rewards/chosen": -0.5155230760574341,
      "rewards/margins": 0.9867794513702393,
      "rewards/rejected": -1.5023025274276733,
      "step": 64
    },
    {
      "epoch": 0.02,
      "grad_norm": 33.94312286376953,
      "kl": 0.0,
      "learning_rate": 4.914943732007328e-07,
      "logps/chosen": -196.26255798339844,
      "logps/rejected": -173.22503662109375,
      "loss": 0.3842,
      "rewards/chosen": -1.0869052410125732,
      "rewards/margins": 0.193229079246521,
      "rewards/rejected": -1.2801343202590942,
      "step": 65
    },
    {
      "epoch": 0.02,
      "grad_norm": 41.270484924316406,
      "kl": 0.0,
      "learning_rate": 4.91363517403821e-07,
      "logps/chosen": -162.62060546875,
      "logps/rejected": -208.2407684326172,
      "loss": 0.4074,
      "rewards/chosen": -1.0006041526794434,
      "rewards/margins": 0.45082569122314453,
      "rewards/rejected": -1.451429843902588,
      "step": 66
    },
    {
      "epoch": 0.02,
      "grad_norm": 31.421226501464844,
      "kl": 0.0,
      "learning_rate": 4.912326616069092e-07,
      "logps/chosen": -266.94854736328125,
      "logps/rejected": -159.35922241210938,
      "loss": 0.4062,
      "rewards/chosen": -0.47417137026786804,
      "rewards/margins": 1.2088725566864014,
      "rewards/rejected": -1.6830439567565918,
      "step": 67
    },
    {
      "epoch": 0.02,
      "grad_norm": 40.290950775146484,
      "kl": 0.0,
      "learning_rate": 4.911018058099974e-07,
      "logps/chosen": -294.2968444824219,
      "logps/rejected": -238.9677276611328,
      "loss": 0.4053,
      "rewards/chosen": -0.8080188035964966,
      "rewards/margins": 0.8553787469863892,
      "rewards/rejected": -1.6633975505828857,
      "step": 68
    },
    {
      "epoch": 0.02,
      "grad_norm": 37.789405822753906,
      "kl": 0.0,
      "learning_rate": 4.909709500130856e-07,
      "logps/chosen": -287.0626220703125,
      "logps/rejected": -279.5071716308594,
      "loss": 0.2481,
      "rewards/chosen": -0.2926979064941406,
      "rewards/margins": 2.139275074005127,
      "rewards/rejected": -2.4319729804992676,
      "step": 69
    },
    {
      "epoch": 0.02,
      "grad_norm": 36.02606964111328,
      "kl": 0.0,
      "learning_rate": 4.908400942161737e-07,
      "logps/chosen": -175.38587951660156,
      "logps/rejected": -172.9152069091797,
      "loss": 0.3982,
      "rewards/chosen": -0.6662830710411072,
      "rewards/margins": 0.7709174752235413,
      "rewards/rejected": -1.4372005462646484,
      "step": 70
    },
    {
      "epoch": 0.02,
      "grad_norm": 27.810359954833984,
      "kl": 0.0,
      "learning_rate": 4.907092384192619e-07,
      "logps/chosen": -220.51766967773438,
      "logps/rejected": -250.37957763671875,
      "loss": 0.2556,
      "rewards/chosen": -0.9891327619552612,
      "rewards/margins": 1.5172969102859497,
      "rewards/rejected": -2.506429672241211,
      "step": 71
    },
    {
      "epoch": 0.02,
      "grad_norm": 36.54177474975586,
      "kl": 0.0,
      "learning_rate": 4.905783826223501e-07,
      "logps/chosen": -196.7530975341797,
      "logps/rejected": -243.111328125,
      "loss": 0.2657,
      "rewards/chosen": -0.8695046305656433,
      "rewards/margins": 2.0881636142730713,
      "rewards/rejected": -2.9576683044433594,
      "step": 72
    },
    {
      "epoch": 0.02,
      "grad_norm": 32.64535903930664,
      "kl": 0.0,
      "learning_rate": 4.904475268254383e-07,
      "logps/chosen": -227.05311584472656,
      "logps/rejected": -179.5013885498047,
      "loss": 0.3447,
      "rewards/chosen": -1.2679059505462646,
      "rewards/margins": -0.13299989700317383,
      "rewards/rejected": -1.1349060535430908,
      "step": 73
    },
    {
      "epoch": 0.02,
      "grad_norm": 34.087646484375,
      "kl": 0.0,
      "learning_rate": 4.903166710285265e-07,
      "logps/chosen": -245.4705810546875,
      "logps/rejected": -211.25796508789062,
      "loss": 0.1794,
      "rewards/chosen": 0.6509444713592529,
      "rewards/margins": 2.1154017448425293,
      "rewards/rejected": -1.4644572734832764,
      "step": 74
    },
    {
      "epoch": 0.02,
      "grad_norm": 38.3884391784668,
      "kl": 0.0,
      "learning_rate": 4.901858152316147e-07,
      "logps/chosen": -182.15093994140625,
      "logps/rejected": -184.2251739501953,
      "loss": 0.3708,
      "rewards/chosen": -0.33757108449935913,
      "rewards/margins": 1.606074571609497,
      "rewards/rejected": -1.943645715713501,
      "step": 75
    },
    {
      "epoch": 0.02,
      "grad_norm": 32.64462661743164,
      "kl": 0.0,
      "learning_rate": 4.900549594347029e-07,
      "logps/chosen": -212.72073364257812,
      "logps/rejected": -197.57626342773438,
      "loss": 0.2817,
      "rewards/chosen": 0.027129173278808594,
      "rewards/margins": 2.553429126739502,
      "rewards/rejected": -2.5262999534606934,
      "step": 76
    },
    {
      "epoch": 0.02,
      "grad_norm": 40.509376525878906,
      "kl": 0.0,
      "learning_rate": 4.899241036377912e-07,
      "logps/chosen": -260.6392822265625,
      "logps/rejected": -246.5421600341797,
      "loss": 0.3453,
      "rewards/chosen": -0.5853943228721619,
      "rewards/margins": 1.0417983531951904,
      "rewards/rejected": -1.6271926164627075,
      "step": 77
    },
    {
      "epoch": 0.02,
      "grad_norm": 37.12883758544922,
      "kl": 0.0,
      "learning_rate": 4.897932478408794e-07,
      "logps/chosen": -204.54745483398438,
      "logps/rejected": -235.77371215820312,
      "loss": 0.3932,
      "rewards/chosen": -1.1263680458068848,
      "rewards/margins": 1.0550079345703125,
      "rewards/rejected": -2.1813759803771973,
      "step": 78
    },
    {
      "epoch": 0.02,
      "grad_norm": 29.98268699645996,
      "kl": 0.0,
      "learning_rate": 4.896623920439676e-07,
      "logps/chosen": -274.41754150390625,
      "logps/rejected": -291.3458251953125,
      "loss": 0.3497,
      "rewards/chosen": -2.2485036849975586,
      "rewards/margins": 0.7142941951751709,
      "rewards/rejected": -2.9627978801727295,
      "step": 79
    },
    {
      "epoch": 0.02,
      "grad_norm": 35.18719482421875,
      "kl": 0.0,
      "learning_rate": 4.895315362470558e-07,
      "logps/chosen": -203.51724243164062,
      "logps/rejected": -268.3537292480469,
      "loss": 0.4434,
      "rewards/chosen": -1.129686951637268,
      "rewards/margins": 1.2693904638290405,
      "rewards/rejected": -2.3990774154663086,
      "step": 80
    },
    {
      "epoch": 0.02,
      "grad_norm": 34.704044342041016,
      "kl": 0.0,
      "learning_rate": 4.89400680450144e-07,
      "logps/chosen": -236.12542724609375,
      "logps/rejected": -261.73687744140625,
      "loss": 0.3943,
      "rewards/chosen": -1.494764804840088,
      "rewards/margins": 0.6487529277801514,
      "rewards/rejected": -2.1435177326202393,
      "step": 81
    },
    {
      "epoch": 0.02,
      "grad_norm": 38.02219772338867,
      "kl": 0.0,
      "learning_rate": 4.892698246532322e-07,
      "logps/chosen": -236.21002197265625,
      "logps/rejected": -219.7363739013672,
      "loss": 0.5544,
      "rewards/chosen": -2.262413501739502,
      "rewards/margins": -1.9126307964324951,
      "rewards/rejected": -0.34978270530700684,
      "step": 82
    },
    {
      "epoch": 0.02,
      "grad_norm": 32.88979721069336,
      "kl": 0.0,
      "learning_rate": 4.891389688563204e-07,
      "logps/chosen": -207.0186309814453,
      "logps/rejected": -227.69236755371094,
      "loss": 0.3109,
      "rewards/chosen": -0.8603959679603577,
      "rewards/margins": 1.2926530838012695,
      "rewards/rejected": -2.1530489921569824,
      "step": 83
    },
    {
      "epoch": 0.02,
      "grad_norm": 37.908660888671875,
      "kl": 0.0,
      "learning_rate": 4.890081130594086e-07,
      "logps/chosen": -295.50836181640625,
      "logps/rejected": -281.04058837890625,
      "loss": 0.354,
      "rewards/chosen": 0.09849908947944641,
      "rewards/margins": 3.176734685897827,
      "rewards/rejected": -3.078235626220703,
      "step": 84
    },
    {
      "epoch": 0.02,
      "grad_norm": 37.847747802734375,
      "kl": 0.0,
      "learning_rate": 4.888772572624968e-07,
      "logps/chosen": -174.21282958984375,
      "logps/rejected": -305.0511169433594,
      "loss": 0.2727,
      "rewards/chosen": -0.06367160379886627,
      "rewards/margins": 1.2645058631896973,
      "rewards/rejected": -1.3281774520874023,
      "step": 85
    },
    {
      "epoch": 0.02,
      "grad_norm": 41.107765197753906,
      "kl": 0.0,
      "learning_rate": 4.887464014655848e-07,
      "logps/chosen": -207.59597778320312,
      "logps/rejected": -291.0322265625,
      "loss": 0.3398,
      "rewards/chosen": -0.1212925836443901,
      "rewards/margins": 2.127479076385498,
      "rewards/rejected": -2.2487716674804688,
      "step": 86
    },
    {
      "epoch": 0.02,
      "grad_norm": 37.753013610839844,
      "kl": 0.0,
      "learning_rate": 4.88615545668673e-07,
      "logps/chosen": -243.60118103027344,
      "logps/rejected": -201.14027404785156,
      "loss": 0.4513,
      "rewards/chosen": -0.5243691205978394,
      "rewards/margins": 0.809841513633728,
      "rewards/rejected": -1.3342106342315674,
      "step": 87
    },
    {
      "epoch": 0.02,
      "grad_norm": 30.350231170654297,
      "kl": 0.0,
      "learning_rate": 4.884846898717612e-07,
      "logps/chosen": -210.97064208984375,
      "logps/rejected": -187.00070190429688,
      "loss": 0.4391,
      "rewards/chosen": -2.0821855068206787,
      "rewards/margins": -0.4192476272583008,
      "rewards/rejected": -1.662937879562378,
      "step": 88
    },
    {
      "epoch": 0.02,
      "grad_norm": 36.895530700683594,
      "kl": 0.0,
      "learning_rate": 4.883538340748494e-07,
      "logps/chosen": -213.4839630126953,
      "logps/rejected": -240.64381408691406,
      "loss": 0.3759,
      "rewards/chosen": -0.6569501757621765,
      "rewards/margins": 1.2435503005981445,
      "rewards/rejected": -1.9005005359649658,
      "step": 89
    },
    {
      "epoch": 0.02,
      "grad_norm": 35.79010009765625,
      "kl": 0.0,
      "learning_rate": 4.882229782779377e-07,
      "logps/chosen": -242.63958740234375,
      "logps/rejected": -259.929931640625,
      "loss": 0.4686,
      "rewards/chosen": -1.5431857109069824,
      "rewards/margins": 0.8138833045959473,
      "rewards/rejected": -2.3570690155029297,
      "step": 90
    },
    {
      "epoch": 0.02,
      "grad_norm": 35.99927520751953,
      "kl": 0.0,
      "learning_rate": 4.880921224810259e-07,
      "logps/chosen": -174.24330139160156,
      "logps/rejected": -301.5118713378906,
      "loss": 0.3863,
      "rewards/chosen": -0.5870503187179565,
      "rewards/margins": 1.726894736289978,
      "rewards/rejected": -2.3139450550079346,
      "step": 91
    },
    {
      "epoch": 0.02,
      "grad_norm": 36.06237030029297,
      "kl": 0.0,
      "learning_rate": 4.879612666841141e-07,
      "logps/chosen": -223.64593505859375,
      "logps/rejected": -176.1517333984375,
      "loss": 0.4347,
      "rewards/chosen": -1.5263793468475342,
      "rewards/margins": 0.5033280849456787,
      "rewards/rejected": -2.029707431793213,
      "step": 92
    },
    {
      "epoch": 0.02,
      "grad_norm": 40.38705825805664,
      "kl": 0.0,
      "learning_rate": 4.878304108872023e-07,
      "logps/chosen": -286.7668762207031,
      "logps/rejected": -294.2085876464844,
      "loss": 0.3554,
      "rewards/chosen": -1.9862689971923828,
      "rewards/margins": 0.1652989387512207,
      "rewards/rejected": -2.1515679359436035,
      "step": 93
    },
    {
      "epoch": 0.02,
      "grad_norm": 27.866796493530273,
      "kl": 0.0,
      "learning_rate": 4.876995550902905e-07,
      "logps/chosen": -206.51478576660156,
      "logps/rejected": -229.76593017578125,
      "loss": 0.2507,
      "rewards/chosen": 1.2367534637451172,
      "rewards/margins": 3.6880853176116943,
      "rewards/rejected": -2.451331853866577,
      "step": 94
    },
    {
      "epoch": 0.02,
      "grad_norm": 31.56693458557129,
      "kl": 0.0,
      "learning_rate": 4.875686992933787e-07,
      "logps/chosen": -239.28445434570312,
      "logps/rejected": -266.00836181640625,
      "loss": 0.3358,
      "rewards/chosen": -0.9398892521858215,
      "rewards/margins": 3.100093126296997,
      "rewards/rejected": -4.039982318878174,
      "step": 95
    },
    {
      "epoch": 0.03,
      "grad_norm": 33.1443977355957,
      "kl": 0.0,
      "learning_rate": 4.874378434964669e-07,
      "logps/chosen": -196.32676696777344,
      "logps/rejected": -155.44985961914062,
      "loss": 0.4324,
      "rewards/chosen": -0.7763940691947937,
      "rewards/margins": 1.0392913818359375,
      "rewards/rejected": -1.815685510635376,
      "step": 96
    },
    {
      "epoch": 0.03,
      "grad_norm": 36.72030258178711,
      "kl": 0.0,
      "learning_rate": 4.873069876995551e-07,
      "logps/chosen": -237.5760040283203,
      "logps/rejected": -381.8189392089844,
      "loss": 0.4122,
      "rewards/chosen": -1.7448315620422363,
      "rewards/margins": 0.012748241424560547,
      "rewards/rejected": -1.7575798034667969,
      "step": 97
    },
    {
      "epoch": 0.03,
      "grad_norm": 35.568607330322266,
      "kl": 0.0,
      "learning_rate": 4.871761319026433e-07,
      "logps/chosen": -261.92828369140625,
      "logps/rejected": -200.28616333007812,
      "loss": 0.3368,
      "rewards/chosen": -0.9904589653015137,
      "rewards/margins": 1.3062183856964111,
      "rewards/rejected": -2.296677350997925,
      "step": 98
    },
    {
      "epoch": 0.03,
      "grad_norm": 25.75632095336914,
      "kl": 0.0,
      "learning_rate": 4.870452761057315e-07,
      "logps/chosen": -201.7320556640625,
      "logps/rejected": -262.0142517089844,
      "loss": 0.2943,
      "rewards/chosen": -1.7546271085739136,
      "rewards/margins": 1.8151785135269165,
      "rewards/rejected": -3.56980562210083,
      "step": 99
    },
    {
      "epoch": 0.03,
      "grad_norm": 41.999332427978516,
      "kl": 0.0,
      "learning_rate": 4.869144203088197e-07,
      "logps/chosen": -218.90869140625,
      "logps/rejected": -193.1501007080078,
      "loss": 0.4521,
      "rewards/chosen": -1.5635461807250977,
      "rewards/margins": -0.5969691276550293,
      "rewards/rejected": -0.9665770530700684,
      "step": 100
    },
    {
      "epoch": 0.03,
      "grad_norm": 36.3663330078125,
      "kl": 0.0,
      "learning_rate": 4.867835645119078e-07,
      "logps/chosen": -225.2639923095703,
      "logps/rejected": -218.92816162109375,
      "loss": 0.4209,
      "rewards/chosen": -1.5182456970214844,
      "rewards/margins": 0.6523854732513428,
      "rewards/rejected": -2.170631170272827,
      "step": 101
    },
    {
      "epoch": 0.03,
      "grad_norm": 40.12117385864258,
      "kl": 0.0,
      "learning_rate": 4.86652708714996e-07,
      "logps/chosen": -232.11502075195312,
      "logps/rejected": -297.0523986816406,
      "loss": 0.3277,
      "rewards/chosen": -0.4440801739692688,
      "rewards/margins": 3.198974609375,
      "rewards/rejected": -3.643054723739624,
      "step": 102
    },
    {
      "epoch": 0.03,
      "grad_norm": 34.094383239746094,
      "kl": 0.0,
      "learning_rate": 4.865218529180843e-07,
      "logps/chosen": -227.10867309570312,
      "logps/rejected": -231.43902587890625,
      "loss": 0.3561,
      "rewards/chosen": -0.583474338054657,
      "rewards/margins": 1.4088008403778076,
      "rewards/rejected": -1.9922752380371094,
      "step": 103
    },
    {
      "epoch": 0.03,
      "grad_norm": 33.84486389160156,
      "kl": 0.0,
      "learning_rate": 4.863909971211725e-07,
      "logps/chosen": -245.8929443359375,
      "logps/rejected": -193.90614318847656,
      "loss": 0.3577,
      "rewards/chosen": -0.8680392503738403,
      "rewards/margins": 0.2119290828704834,
      "rewards/rejected": -1.0799683332443237,
      "step": 104
    },
    {
      "epoch": 0.03,
      "grad_norm": 90.90773010253906,
      "kl": 0.0,
      "learning_rate": 4.862601413242607e-07,
      "logps/chosen": -218.3358154296875,
      "logps/rejected": -267.4952697753906,
      "loss": 0.423,
      "rewards/chosen": 1.7071785926818848,
      "rewards/margins": 3.0971498489379883,
      "rewards/rejected": -1.389971137046814,
      "step": 105
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.740966796875,
      "kl": 0.0,
      "learning_rate": 4.861292855273489e-07,
      "logps/chosen": -224.6427764892578,
      "logps/rejected": -267.43890380859375,
      "loss": 0.3854,
      "rewards/chosen": -0.49575966596603394,
      "rewards/margins": 1.3802204132080078,
      "rewards/rejected": -1.875980019569397,
      "step": 106
    },
    {
      "epoch": 0.03,
      "grad_norm": 26.653839111328125,
      "kl": 0.0,
      "learning_rate": 4.859984297304371e-07,
      "logps/chosen": -192.61354064941406,
      "logps/rejected": -188.12330627441406,
      "loss": 0.3971,
      "rewards/chosen": -1.3154840469360352,
      "rewards/margins": 1.1609439849853516,
      "rewards/rejected": -2.4764280319213867,
      "step": 107
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.36748504638672,
      "kl": 0.0,
      "learning_rate": 4.858675739335253e-07,
      "logps/chosen": -288.2717590332031,
      "logps/rejected": -205.67002868652344,
      "loss": 0.364,
      "rewards/chosen": -0.9201427698135376,
      "rewards/margins": 0.7539663314819336,
      "rewards/rejected": -1.6741091012954712,
      "step": 108
    },
    {
      "epoch": 0.03,
      "grad_norm": 26.112871170043945,
      "kl": 0.0,
      "learning_rate": 4.857367181366135e-07,
      "logps/chosen": -173.88441467285156,
      "logps/rejected": -184.56675720214844,
      "loss": 0.3863,
      "rewards/chosen": -1.1070303916931152,
      "rewards/margins": 1.5630695819854736,
      "rewards/rejected": -2.670099973678589,
      "step": 109
    },
    {
      "epoch": 0.03,
      "grad_norm": 46.24738693237305,
      "kl": 0.0,
      "learning_rate": 4.856058623397017e-07,
      "logps/chosen": -290.48577880859375,
      "logps/rejected": -203.6895294189453,
      "loss": 0.3406,
      "rewards/chosen": 0.15263789892196655,
      "rewards/margins": 1.928048849105835,
      "rewards/rejected": -1.7754108905792236,
      "step": 110
    },
    {
      "epoch": 0.03,
      "grad_norm": 74.96126556396484,
      "kl": 0.0,
      "learning_rate": 4.854750065427898e-07,
      "logps/chosen": -198.45909118652344,
      "logps/rejected": -210.08143615722656,
      "loss": 0.3199,
      "rewards/chosen": -0.6983060240745544,
      "rewards/margins": 1.6797571182250977,
      "rewards/rejected": -2.378063201904297,
      "step": 111
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.173887252807617,
      "kl": 0.0,
      "learning_rate": 4.85344150745878e-07,
      "logps/chosen": -202.14859008789062,
      "logps/rejected": -185.81361389160156,
      "loss": 0.3284,
      "rewards/chosen": -1.0504775047302246,
      "rewards/margins": 1.1194241046905518,
      "rewards/rejected": -2.1699016094207764,
      "step": 112
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.457008361816406,
      "kl": 0.0,
      "learning_rate": 4.852132949489662e-07,
      "logps/chosen": -178.17190551757812,
      "logps/rejected": -198.142333984375,
      "loss": 0.2724,
      "rewards/chosen": -0.5825530886650085,
      "rewards/margins": 1.6814563274383545,
      "rewards/rejected": -2.264009475708008,
      "step": 113
    },
    {
      "epoch": 0.03,
      "grad_norm": 36.00678253173828,
      "kl": 0.0,
      "learning_rate": 4.850824391520544e-07,
      "logps/chosen": -226.97251892089844,
      "logps/rejected": -277.8453674316406,
      "loss": 0.4056,
      "rewards/chosen": -1.666909098625183,
      "rewards/margins": 0.5384429693222046,
      "rewards/rejected": -2.2053520679473877,
      "step": 114
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.167097091674805,
      "kl": 0.0,
      "learning_rate": 4.849515833551426e-07,
      "logps/chosen": -221.36300659179688,
      "logps/rejected": -242.39280700683594,
      "loss": 0.4028,
      "rewards/chosen": -1.4840993881225586,
      "rewards/margins": 1.3504664897918701,
      "rewards/rejected": -2.8345658779144287,
      "step": 115
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.748884201049805,
      "kl": 0.0,
      "learning_rate": 4.848207275582308e-07,
      "logps/chosen": -297.3919677734375,
      "logps/rejected": -213.3976287841797,
      "loss": 0.2864,
      "rewards/chosen": -2.3920040130615234,
      "rewards/margins": 0.7508974075317383,
      "rewards/rejected": -3.1429014205932617,
      "step": 116
    },
    {
      "epoch": 0.03,
      "grad_norm": 30.6126766204834,
      "kl": 0.0,
      "learning_rate": 4.84689871761319e-07,
      "logps/chosen": -236.26303100585938,
      "logps/rejected": -177.65513610839844,
      "loss": 0.3932,
      "rewards/chosen": -2.2624423503875732,
      "rewards/margins": 0.27295541763305664,
      "rewards/rejected": -2.53539776802063,
      "step": 117
    },
    {
      "epoch": 0.03,
      "grad_norm": 26.81728744506836,
      "kl": 0.0,
      "learning_rate": 4.845590159644072e-07,
      "logps/chosen": -297.1986389160156,
      "logps/rejected": -206.4488983154297,
      "loss": 0.4104,
      "rewards/chosen": -2.306591510772705,
      "rewards/margins": 0.1528792381286621,
      "rewards/rejected": -2.459470748901367,
      "step": 118
    },
    {
      "epoch": 0.03,
      "grad_norm": 32.799739837646484,
      "kl": 0.0,
      "learning_rate": 4.844281601674954e-07,
      "logps/chosen": -212.15191650390625,
      "logps/rejected": -264.2872314453125,
      "loss": 0.4941,
      "rewards/chosen": -1.7588011026382446,
      "rewards/margins": 1.0008176565170288,
      "rewards/rejected": -2.7596187591552734,
      "step": 119
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.55664825439453,
      "kl": 0.0,
      "learning_rate": 4.842973043705836e-07,
      "logps/chosen": -289.9277648925781,
      "logps/rejected": -317.9549255371094,
      "loss": 0.4072,
      "rewards/chosen": -1.3789793252944946,
      "rewards/margins": 3.475088596343994,
      "rewards/rejected": -4.854067802429199,
      "step": 120
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.860851287841797,
      "kl": 0.0,
      "learning_rate": 4.841664485736718e-07,
      "logps/chosen": -227.15924072265625,
      "logps/rejected": -233.9003143310547,
      "loss": 0.4206,
      "rewards/chosen": -0.4197777509689331,
      "rewards/margins": 2.2443408966064453,
      "rewards/rejected": -2.664118766784668,
      "step": 121
    },
    {
      "epoch": 0.03,
      "grad_norm": 35.52662658691406,
      "kl": 0.0,
      "learning_rate": 4.8403559277676e-07,
      "logps/chosen": -170.0311279296875,
      "logps/rejected": -187.07435607910156,
      "loss": 0.3639,
      "rewards/chosen": -0.22913819551467896,
      "rewards/margins": 2.3094356060028076,
      "rewards/rejected": -2.538573741912842,
      "step": 122
    },
    {
      "epoch": 0.03,
      "grad_norm": 38.98072814941406,
      "kl": 0.0,
      "learning_rate": 4.839047369798482e-07,
      "logps/chosen": -236.20376586914062,
      "logps/rejected": -213.81533813476562,
      "loss": 0.4007,
      "rewards/chosen": -0.15046189725399017,
      "rewards/margins": 1.8937103748321533,
      "rewards/rejected": -2.0441722869873047,
      "step": 123
    },
    {
      "epoch": 0.03,
      "grad_norm": 39.98687744140625,
      "kl": 0.0,
      "learning_rate": 4.837738811829364e-07,
      "logps/chosen": -276.67901611328125,
      "logps/rejected": -261.1310119628906,
      "loss": 0.4084,
      "rewards/chosen": -0.6370207071304321,
      "rewards/margins": 1.557566523551941,
      "rewards/rejected": -2.194587230682373,
      "step": 124
    },
    {
      "epoch": 0.03,
      "grad_norm": 31.81071662902832,
      "kl": 0.0,
      "learning_rate": 4.836430253860246e-07,
      "logps/chosen": -164.74241638183594,
      "logps/rejected": -164.97280883789062,
      "loss": 0.308,
      "rewards/chosen": -0.021863222122192383,
      "rewards/margins": 1.7506403923034668,
      "rewards/rejected": -1.7725036144256592,
      "step": 125
    },
    {
      "epoch": 0.03,
      "grad_norm": 40.468868255615234,
      "kl": 0.0,
      "learning_rate": 4.835121695891128e-07,
      "logps/chosen": -209.9675750732422,
      "logps/rejected": -272.98876953125,
      "loss": 0.3195,
      "rewards/chosen": -1.1054404973983765,
      "rewards/margins": 1.143508791923523,
      "rewards/rejected": -2.2489492893218994,
      "step": 126
    },
    {
      "epoch": 0.03,
      "grad_norm": 32.93410110473633,
      "kl": 0.0,
      "learning_rate": 4.83381313792201e-07,
      "logps/chosen": -191.24378967285156,
      "logps/rejected": -237.31460571289062,
      "loss": 0.419,
      "rewards/chosen": -1.1644923686981201,
      "rewards/margins": 1.5805509090423584,
      "rewards/rejected": -2.7450432777404785,
      "step": 127
    },
    {
      "epoch": 0.03,
      "grad_norm": 33.93423843383789,
      "kl": 0.0,
      "learning_rate": 4.832504579952892e-07,
      "logps/chosen": -238.41317749023438,
      "logps/rejected": -270.1029052734375,
      "loss": 0.3703,
      "rewards/chosen": -1.668627381324768,
      "rewards/margins": -0.14160168170928955,
      "rewards/rejected": -1.5270256996154785,
      "step": 128
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.237674713134766,
      "kl": 0.0,
      "learning_rate": 4.831196021983774e-07,
      "logps/chosen": -253.71865844726562,
      "logps/rejected": -199.205078125,
      "loss": 0.4102,
      "rewards/chosen": -0.7242934703826904,
      "rewards/margins": 1.206939935684204,
      "rewards/rejected": -1.9312334060668945,
      "step": 129
    },
    {
      "epoch": 0.03,
      "grad_norm": 42.5329704284668,
      "kl": 0.0,
      "learning_rate": 4.829887464014656e-07,
      "logps/chosen": -178.42868041992188,
      "logps/rejected": -188.00540161132812,
      "loss": 0.3732,
      "rewards/chosen": -0.8551152348518372,
      "rewards/margins": 1.5641908645629883,
      "rewards/rejected": -2.4193060398101807,
      "step": 130
    },
    {
      "epoch": 0.03,
      "grad_norm": 27.833166122436523,
      "kl": 0.0,
      "learning_rate": 4.828578906045538e-07,
      "logps/chosen": -211.11993408203125,
      "logps/rejected": -161.3157501220703,
      "loss": 0.4244,
      "rewards/chosen": -1.8940626382827759,
      "rewards/margins": 0.8122283220291138,
      "rewards/rejected": -2.7062909603118896,
      "step": 131
    },
    {
      "epoch": 0.03,
      "grad_norm": 41.60429763793945,
      "kl": 0.0,
      "learning_rate": 4.82727034807642e-07,
      "logps/chosen": -271.83526611328125,
      "logps/rejected": -171.28790283203125,
      "loss": 0.3065,
      "rewards/chosen": -0.894410252571106,
      "rewards/margins": 0.685144305229187,
      "rewards/rejected": -1.579554557800293,
      "step": 132
    },
    {
      "epoch": 0.03,
      "grad_norm": 37.991764068603516,
      "kl": 0.0,
      "learning_rate": 4.825961790107302e-07,
      "logps/chosen": -101.1789321899414,
      "logps/rejected": -278.728759765625,
      "loss": 0.2365,
      "rewards/chosen": -0.014444398693740368,
      "rewards/margins": 2.06191086769104,
      "rewards/rejected": -2.076355218887329,
      "step": 133
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.86128234863281,
      "kl": 0.0,
      "learning_rate": 4.824653232138184e-07,
      "logps/chosen": -250.19801330566406,
      "logps/rejected": -214.6094512939453,
      "loss": 0.2672,
      "rewards/chosen": -0.9694932103157043,
      "rewards/margins": 1.6486060619354248,
      "rewards/rejected": -2.6180992126464844,
      "step": 134
    },
    {
      "epoch": 0.04,
      "grad_norm": 40.54289245605469,
      "kl": 0.0,
      "learning_rate": 4.823344674169066e-07,
      "logps/chosen": -158.35687255859375,
      "logps/rejected": -197.9619598388672,
      "loss": 0.3677,
      "rewards/chosen": -1.2970025539398193,
      "rewards/margins": -0.352266788482666,
      "rewards/rejected": -0.9447357654571533,
      "step": 135
    },
    {
      "epoch": 0.04,
      "grad_norm": 32.23467254638672,
      "kl": 0.0,
      "learning_rate": 4.822036116199948e-07,
      "logps/chosen": -172.86297607421875,
      "logps/rejected": -270.7877502441406,
      "loss": 0.2857,
      "rewards/chosen": -0.7820056676864624,
      "rewards/margins": 1.7432712316513062,
      "rewards/rejected": -2.5252768993377686,
      "step": 136
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.8538703918457,
      "kl": 0.0,
      "learning_rate": 4.82072755823083e-07,
      "logps/chosen": -308.17401123046875,
      "logps/rejected": -210.2477264404297,
      "loss": 0.4101,
      "rewards/chosen": -0.6375879049301147,
      "rewards/margins": 0.9788401126861572,
      "rewards/rejected": -1.616428017616272,
      "step": 137
    },
    {
      "epoch": 0.04,
      "grad_norm": 40.68290328979492,
      "kl": 0.0,
      "learning_rate": 4.819419000261711e-07,
      "logps/chosen": -210.66500854492188,
      "logps/rejected": -228.51309204101562,
      "loss": 0.4091,
      "rewards/chosen": -0.766243577003479,
      "rewards/margins": 1.044326663017273,
      "rewards/rejected": -1.810570240020752,
      "step": 138
    },
    {
      "epoch": 0.04,
      "grad_norm": 41.904876708984375,
      "kl": 0.0,
      "learning_rate": 4.818110442292593e-07,
      "logps/chosen": -203.61105346679688,
      "logps/rejected": -234.1663360595703,
      "loss": 0.3681,
      "rewards/chosen": 0.1758965253829956,
      "rewards/margins": 3.3261351585388184,
      "rewards/rejected": -3.150238513946533,
      "step": 139
    },
    {
      "epoch": 0.04,
      "grad_norm": 42.24943542480469,
      "kl": 0.0,
      "learning_rate": 4.816801884323475e-07,
      "logps/chosen": -285.647705078125,
      "logps/rejected": -288.2828369140625,
      "loss": 0.3483,
      "rewards/chosen": -1.2011711597442627,
      "rewards/margins": 1.2541842460632324,
      "rewards/rejected": -2.455355405807495,
      "step": 140
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.05189514160156,
      "kl": 0.0,
      "learning_rate": 4.815493326354357e-07,
      "logps/chosen": -182.27590942382812,
      "logps/rejected": -290.2626037597656,
      "loss": 0.4624,
      "rewards/chosen": -1.4244545698165894,
      "rewards/margins": 0.7048934698104858,
      "rewards/rejected": -2.129348039627075,
      "step": 141
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.5157356262207,
      "kl": 0.0,
      "learning_rate": 4.814184768385239e-07,
      "logps/chosen": -204.3314208984375,
      "logps/rejected": -326.6562805175781,
      "loss": 0.4179,
      "rewards/chosen": -1.0364093780517578,
      "rewards/margins": 1.5139243602752686,
      "rewards/rejected": -2.5503337383270264,
      "step": 142
    },
    {
      "epoch": 0.04,
      "grad_norm": 31.165124893188477,
      "kl": 0.0,
      "learning_rate": 4.812876210416121e-07,
      "logps/chosen": -251.93438720703125,
      "logps/rejected": -233.26759338378906,
      "loss": 0.3301,
      "rewards/chosen": -2.0439867973327637,
      "rewards/margins": 0.13914990425109863,
      "rewards/rejected": -2.1831367015838623,
      "step": 143
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.466365814208984,
      "kl": 0.0,
      "learning_rate": 4.811567652447003e-07,
      "logps/chosen": -233.31072998046875,
      "logps/rejected": -202.12730407714844,
      "loss": 0.4027,
      "rewards/chosen": -2.06561541557312,
      "rewards/margins": -0.20110619068145752,
      "rewards/rejected": -1.8645092248916626,
      "step": 144
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.38949203491211,
      "kl": 0.0,
      "learning_rate": 4.810259094477885e-07,
      "logps/chosen": -256.955322265625,
      "logps/rejected": -203.2244110107422,
      "loss": 0.3354,
      "rewards/chosen": 0.2599179148674011,
      "rewards/margins": 2.934673547744751,
      "rewards/rejected": -2.674755573272705,
      "step": 145
    },
    {
      "epoch": 0.04,
      "grad_norm": 36.793487548828125,
      "kl": 0.0,
      "learning_rate": 4.808950536508767e-07,
      "logps/chosen": -245.50906372070312,
      "logps/rejected": -276.3681640625,
      "loss": 0.3287,
      "rewards/chosen": -0.11979079246520996,
      "rewards/margins": 1.873924732208252,
      "rewards/rejected": -1.993715524673462,
      "step": 146
    },
    {
      "epoch": 0.04,
      "grad_norm": 31.725196838378906,
      "kl": 0.0,
      "learning_rate": 4.807641978539649e-07,
      "logps/chosen": -128.91494750976562,
      "logps/rejected": -231.997314453125,
      "loss": 0.2698,
      "rewards/chosen": -0.5471810698509216,
      "rewards/margins": 1.7721290588378906,
      "rewards/rejected": -2.319310188293457,
      "step": 147
    },
    {
      "epoch": 0.04,
      "grad_norm": 27.87786865234375,
      "kl": 0.0,
      "learning_rate": 4.806333420570531e-07,
      "logps/chosen": -202.31613159179688,
      "logps/rejected": -222.10958862304688,
      "loss": 0.3701,
      "rewards/chosen": -1.3587830066680908,
      "rewards/margins": 0.8089334964752197,
      "rewards/rejected": -2.1677165031433105,
      "step": 148
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.04602813720703,
      "kl": 0.0,
      "learning_rate": 4.805024862601413e-07,
      "logps/chosen": -176.23416137695312,
      "logps/rejected": -229.03103637695312,
      "loss": 0.3393,
      "rewards/chosen": -0.5084764361381531,
      "rewards/margins": 1.4789369106292725,
      "rewards/rejected": -1.9874134063720703,
      "step": 149
    },
    {
      "epoch": 0.04,
      "grad_norm": 22.341838836669922,
      "kl": 0.0,
      "learning_rate": 4.803716304632295e-07,
      "logps/chosen": -167.64193725585938,
      "logps/rejected": -173.7758026123047,
      "loss": 0.269,
      "rewards/chosen": -1.1564908027648926,
      "rewards/margins": 1.4417181015014648,
      "rewards/rejected": -2.5982089042663574,
      "step": 150
    },
    {
      "epoch": 0.04,
      "grad_norm": 48.48427963256836,
      "kl": 0.0,
      "learning_rate": 4.802407746663177e-07,
      "logps/chosen": -279.48785400390625,
      "logps/rejected": -292.7326965332031,
      "loss": 0.4666,
      "rewards/chosen": -1.6676387786865234,
      "rewards/margins": -1.201262354850769,
      "rewards/rejected": -0.4663764536380768,
      "step": 151
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.24284744262695,
      "kl": 0.0,
      "learning_rate": 4.801099188694059e-07,
      "logps/chosen": -244.73452758789062,
      "logps/rejected": -240.77969360351562,
      "loss": 0.4686,
      "rewards/chosen": -1.590959072113037,
      "rewards/margins": -0.13520264625549316,
      "rewards/rejected": -1.455756425857544,
      "step": 152
    },
    {
      "epoch": 0.04,
      "grad_norm": 32.1400146484375,
      "kl": 0.0,
      "learning_rate": 4.799790630724941e-07,
      "logps/chosen": -237.9163818359375,
      "logps/rejected": -194.08131408691406,
      "loss": 0.3316,
      "rewards/chosen": 2.124384641647339,
      "rewards/margins": 3.3624322414398193,
      "rewards/rejected": -1.2380475997924805,
      "step": 153
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.19780731201172,
      "kl": 0.0,
      "learning_rate": 4.798482072755823e-07,
      "logps/chosen": -220.74940490722656,
      "logps/rejected": -224.6669921875,
      "loss": 0.3385,
      "rewards/chosen": -1.281145453453064,
      "rewards/margins": 1.0018588304519653,
      "rewards/rejected": -2.2830042839050293,
      "step": 154
    },
    {
      "epoch": 0.04,
      "grad_norm": 31.86383056640625,
      "kl": 0.0,
      "learning_rate": 4.797173514786705e-07,
      "logps/chosen": -186.74932861328125,
      "logps/rejected": -292.90008544921875,
      "loss": 0.456,
      "rewards/chosen": -1.296830415725708,
      "rewards/margins": 0.6673908233642578,
      "rewards/rejected": -1.9642212390899658,
      "step": 155
    },
    {
      "epoch": 0.04,
      "grad_norm": 32.114322662353516,
      "kl": 0.0,
      "learning_rate": 4.795864956817587e-07,
      "logps/chosen": -225.802490234375,
      "logps/rejected": -154.5421142578125,
      "loss": 0.3427,
      "rewards/chosen": -0.9391254186630249,
      "rewards/margins": 1.5108743906021118,
      "rewards/rejected": -2.4499998092651367,
      "step": 156
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.9625358581543,
      "kl": 0.0,
      "learning_rate": 4.794556398848469e-07,
      "logps/chosen": -272.5867004394531,
      "logps/rejected": -266.7203063964844,
      "loss": 0.3097,
      "rewards/chosen": -0.2914942800998688,
      "rewards/margins": 4.458119869232178,
      "rewards/rejected": -4.749614238739014,
      "step": 157
    },
    {
      "epoch": 0.04,
      "grad_norm": 27.236101150512695,
      "kl": 0.0,
      "learning_rate": 4.793247840879351e-07,
      "logps/chosen": -207.18653869628906,
      "logps/rejected": -290.4095153808594,
      "loss": 0.3525,
      "rewards/chosen": -0.14065659046173096,
      "rewards/margins": 3.9441447257995605,
      "rewards/rejected": -4.084801197052002,
      "step": 158
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.254356384277344,
      "kl": 0.0,
      "learning_rate": 4.791939282910233e-07,
      "logps/chosen": -271.6333312988281,
      "logps/rejected": -261.6518859863281,
      "loss": 0.4046,
      "rewards/chosen": -1.3647454977035522,
      "rewards/margins": 0.2699841260910034,
      "rewards/rejected": -1.6347296237945557,
      "step": 159
    },
    {
      "epoch": 0.04,
      "grad_norm": 36.57579040527344,
      "kl": 0.0,
      "learning_rate": 4.790630724941115e-07,
      "logps/chosen": -338.86138916015625,
      "logps/rejected": -216.30711364746094,
      "loss": 0.3796,
      "rewards/chosen": -1.2024385929107666,
      "rewards/margins": 0.8041627407073975,
      "rewards/rejected": -2.006601333618164,
      "step": 160
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.14208221435547,
      "kl": 0.0,
      "learning_rate": 4.789322166971997e-07,
      "logps/chosen": -175.1085205078125,
      "logps/rejected": -367.12030029296875,
      "loss": 0.2882,
      "rewards/chosen": -1.5256532430648804,
      "rewards/margins": 1.4195586442947388,
      "rewards/rejected": -2.945211887359619,
      "step": 161
    },
    {
      "epoch": 0.04,
      "grad_norm": 33.05416488647461,
      "kl": 0.0,
      "learning_rate": 4.788013609002879e-07,
      "logps/chosen": -209.37460327148438,
      "logps/rejected": -218.49258422851562,
      "loss": 0.5348,
      "rewards/chosen": -1.7475395202636719,
      "rewards/margins": -0.1888653039932251,
      "rewards/rejected": -1.5586742162704468,
      "step": 162
    },
    {
      "epoch": 0.04,
      "grad_norm": 37.163150787353516,
      "kl": 0.0,
      "learning_rate": 4.78670505103376e-07,
      "logps/chosen": -293.8970642089844,
      "logps/rejected": -269.2586364746094,
      "loss": 0.4305,
      "rewards/chosen": -1.452363133430481,
      "rewards/margins": 1.5342754125595093,
      "rewards/rejected": -2.9866385459899902,
      "step": 163
    },
    {
      "epoch": 0.04,
      "grad_norm": 32.23062515258789,
      "kl": 0.0,
      "learning_rate": 4.785396493064642e-07,
      "logps/chosen": -221.77999877929688,
      "logps/rejected": -198.4766082763672,
      "loss": 0.3642,
      "rewards/chosen": -1.34794020652771,
      "rewards/margins": 1.2967596054077148,
      "rewards/rejected": -2.644699811935425,
      "step": 164
    },
    {
      "epoch": 0.04,
      "grad_norm": 36.29683303833008,
      "kl": 0.0,
      "learning_rate": 4.784087935095524e-07,
      "logps/chosen": -239.8020477294922,
      "logps/rejected": -234.60704040527344,
      "loss": 0.2911,
      "rewards/chosen": -0.17822468280792236,
      "rewards/margins": 2.4091439247131348,
      "rewards/rejected": -2.5873684883117676,
      "step": 165
    },
    {
      "epoch": 0.04,
      "grad_norm": 37.25835037231445,
      "kl": 0.0,
      "learning_rate": 4.782779377126406e-07,
      "logps/chosen": -226.92759704589844,
      "logps/rejected": -230.7248992919922,
      "loss": 0.4445,
      "rewards/chosen": -1.0927642583847046,
      "rewards/margins": 1.1659952402114868,
      "rewards/rejected": -2.2587594985961914,
      "step": 166
    },
    {
      "epoch": 0.04,
      "grad_norm": 34.106014251708984,
      "kl": 0.0,
      "learning_rate": 4.781470819157288e-07,
      "logps/chosen": -246.8311004638672,
      "logps/rejected": -246.91729736328125,
      "loss": 0.3259,
      "rewards/chosen": -0.07399371266365051,
      "rewards/margins": 3.7297139167785645,
      "rewards/rejected": -3.8037075996398926,
      "step": 167
    },
    {
      "epoch": 0.04,
      "grad_norm": 28.763517379760742,
      "kl": 0.0,
      "learning_rate": 4.78016226118817e-07,
      "logps/chosen": -212.7134552001953,
      "logps/rejected": -273.555419921875,
      "loss": 0.2403,
      "rewards/chosen": -0.3746832311153412,
      "rewards/margins": 2.443885564804077,
      "rewards/rejected": -2.818568706512451,
      "step": 168
    },
    {
      "epoch": 0.04,
      "grad_norm": 31.084125518798828,
      "kl": 0.0,
      "learning_rate": 4.778853703219052e-07,
      "logps/chosen": -267.5098571777344,
      "logps/rejected": -239.85763549804688,
      "loss": 0.1923,
      "rewards/chosen": -1.1324867010116577,
      "rewards/margins": 2.735407829284668,
      "rewards/rejected": -3.867894411087036,
      "step": 169
    },
    {
      "epoch": 0.04,
      "grad_norm": 30.411270141601562,
      "kl": 0.0,
      "learning_rate": 4.777545145249934e-07,
      "logps/chosen": -195.84251403808594,
      "logps/rejected": -127.29740905761719,
      "loss": 0.3347,
      "rewards/chosen": -1.0493810176849365,
      "rewards/margins": 0.6141175031661987,
      "rewards/rejected": -1.6634985208511353,
      "step": 170
    },
    {
      "epoch": 0.04,
      "grad_norm": 35.29057312011719,
      "kl": 0.0,
      "learning_rate": 4.776236587280816e-07,
      "logps/chosen": -181.64306640625,
      "logps/rejected": -311.97705078125,
      "loss": 0.3247,
      "rewards/chosen": -1.0975894927978516,
      "rewards/margins": 0.9621212482452393,
      "rewards/rejected": -2.059710741043091,
      "step": 171
    },
    {
      "epoch": 0.05,
      "grad_norm": 34.762691497802734,
      "kl": 0.0,
      "learning_rate": 4.774928029311698e-07,
      "logps/chosen": -233.34786987304688,
      "logps/rejected": -180.6575469970703,
      "loss": 0.3342,
      "rewards/chosen": -1.5650959014892578,
      "rewards/margins": 0.0028657913208007812,
      "rewards/rejected": -1.5679616928100586,
      "step": 172
    },
    {
      "epoch": 0.05,
      "grad_norm": 34.95657730102539,
      "kl": 0.0,
      "learning_rate": 4.77361947134258e-07,
      "logps/chosen": -233.89166259765625,
      "logps/rejected": -164.43466186523438,
      "loss": 0.3293,
      "rewards/chosen": 0.26836246252059937,
      "rewards/margins": 2.1331124305725098,
      "rewards/rejected": -1.8647499084472656,
      "step": 173
    },
    {
      "epoch": 0.05,
      "grad_norm": 35.36391830444336,
      "kl": 0.0,
      "learning_rate": 4.772310913373462e-07,
      "logps/chosen": -254.98135375976562,
      "logps/rejected": -245.09056091308594,
      "loss": 0.3722,
      "rewards/chosen": -0.3194230794906616,
      "rewards/margins": 1.2747198343276978,
      "rewards/rejected": -1.5941429138183594,
      "step": 174
    },
    {
      "epoch": 0.05,
      "grad_norm": 28.04030418395996,
      "kl": 0.0,
      "learning_rate": 4.771002355404344e-07,
      "logps/chosen": -191.9232177734375,
      "logps/rejected": -249.7639617919922,
      "loss": 0.1768,
      "rewards/chosen": -1.6041406393051147,
      "rewards/margins": 0.7407997846603394,
      "rewards/rejected": -2.344940423965454,
      "step": 175
    },
    {
      "epoch": 0.05,
      "grad_norm": 33.58053207397461,
      "kl": 0.0,
      "learning_rate": 4.769693797435226e-07,
      "logps/chosen": -258.4371032714844,
      "logps/rejected": -221.71328735351562,
      "loss": 0.3767,
      "rewards/chosen": 0.0411025732755661,
      "rewards/margins": 2.80287766456604,
      "rewards/rejected": -2.761775016784668,
      "step": 176
    },
    {
      "epoch": 0.05,
      "grad_norm": 44.45076370239258,
      "kl": 0.0,
      "learning_rate": 4.768385239466108e-07,
      "logps/chosen": -262.84375,
      "logps/rejected": -283.4814147949219,
      "loss": 0.284,
      "rewards/chosen": -0.07760030031204224,
      "rewards/margins": 2.31966495513916,
      "rewards/rejected": -2.3972651958465576,
      "step": 177
    },
    {
      "epoch": 0.05,
      "grad_norm": 32.66361999511719,
      "kl": 0.0,
      "learning_rate": 4.76707668149699e-07,
      "logps/chosen": -173.99244689941406,
      "logps/rejected": -231.17002868652344,
      "loss": 0.2292,
      "rewards/chosen": -0.2332194596529007,
      "rewards/margins": 2.0381579399108887,
      "rewards/rejected": -2.2713773250579834,
      "step": 178
    },
    {
      "epoch": 0.05,
      "grad_norm": 29.631681442260742,
      "kl": 0.0,
      "learning_rate": 4.765768123527872e-07,
      "logps/chosen": -215.5139923095703,
      "logps/rejected": -275.6757507324219,
      "loss": 0.3481,
      "rewards/chosen": -0.28280165791511536,
      "rewards/margins": 1.5615084171295166,
      "rewards/rejected": -1.8443100452423096,
      "step": 179
    },
    {
      "epoch": 0.05,
      "grad_norm": 33.25776672363281,
      "kl": 0.0,
      "learning_rate": 4.764459565558754e-07,
      "logps/chosen": -300.66571044921875,
      "logps/rejected": -226.95843505859375,
      "loss": 0.5674,
      "rewards/chosen": -2.056067705154419,
      "rewards/margins": -0.5937771797180176,
      "rewards/rejected": -1.4622905254364014,
      "step": 180
    },
    {
      "epoch": 0.05,
      "grad_norm": 32.5307502746582,
      "kl": 0.0,
      "learning_rate": 4.763151007589636e-07,
      "logps/chosen": -274.2823791503906,
      "logps/rejected": -278.12158203125,
      "loss": 0.364,
      "rewards/chosen": -1.056592345237732,
      "rewards/margins": 1.1514142751693726,
      "rewards/rejected": -2.2080066204071045,
      "step": 181
    },
    {
      "epoch": 0.05,
      "grad_norm": 31.034576416015625,
      "kl": 0.0,
      "learning_rate": 4.7618424496205177e-07,
      "logps/chosen": -320.25201416015625,
      "logps/rejected": -268.442138671875,
      "loss": 0.3914,
      "rewards/chosen": -1.3781906366348267,
      "rewards/margins": 1.2480562925338745,
      "rewards/rejected": -2.626246929168701,
      "step": 182
    },
    {
      "epoch": 0.05,
      "grad_norm": 27.58580207824707,
      "kl": 0.0,
      "learning_rate": 4.7605338916513997e-07,
      "logps/chosen": -178.425537109375,
      "logps/rejected": -249.5200958251953,
      "loss": 0.3172,
      "rewards/chosen": -0.6601690649986267,
      "rewards/margins": 2.0617916584014893,
      "rewards/rejected": -2.7219607830047607,
      "step": 183
    },
    {
      "epoch": 0.05,
      "grad_norm": 31.219865798950195,
      "kl": 0.0,
      "learning_rate": 4.7592253336822816e-07,
      "logps/chosen": -279.55352783203125,
      "logps/rejected": -152.67330932617188,
      "loss": 0.4063,
      "rewards/chosen": -1.6273868083953857,
      "rewards/margins": -0.3303135633468628,
      "rewards/rejected": -1.297073245048523,
      "step": 184
    },
    {
      "epoch": 0.05,
      "grad_norm": 39.95933532714844,
      "kl": 0.0,
      "learning_rate": 4.7579167757131636e-07,
      "logps/chosen": -217.0849151611328,
      "logps/rejected": -275.55670166015625,
      "loss": 0.4124,
      "rewards/chosen": -0.4421677887439728,
      "rewards/margins": 1.6669080257415771,
      "rewards/rejected": -2.1090757846832275,
      "step": 185
    },
    {
      "epoch": 0.05,
      "grad_norm": 29.07082748413086,
      "kl": 0.0,
      "learning_rate": 4.756608217744046e-07,
      "logps/chosen": -126.4207992553711,
      "logps/rejected": -215.87127685546875,
      "loss": 0.3561,
      "rewards/chosen": -1.5301021337509155,
      "rewards/margins": 0.5697237253189087,
      "rewards/rejected": -2.099825859069824,
      "step": 186
    },
    {
      "epoch": 0.05,
      "grad_norm": 36.05703353881836,
      "kl": 0.0,
      "learning_rate": 4.755299659774928e-07,
      "logps/chosen": -300.80914306640625,
      "logps/rejected": -214.6817626953125,
      "loss": 0.5,
      "rewards/chosen": -1.702012062072754,
      "rewards/margins": -0.06618618965148926,
      "rewards/rejected": -1.6358258724212646,
      "step": 187
    },
    {
      "epoch": 0.05,
      "grad_norm": 32.64337921142578,
      "kl": 0.0,
      "learning_rate": 4.75399110180581e-07,
      "logps/chosen": -314.01226806640625,
      "logps/rejected": -260.5047607421875,
      "loss": 0.2495,
      "rewards/chosen": -0.9918256402015686,
      "rewards/margins": 1.699979305267334,
      "rewards/rejected": -2.691804885864258,
      "step": 188
    },
    {
      "epoch": 0.05,
      "grad_norm": 28.98259735107422,
      "kl": 0.0,
      "learning_rate": 4.752682543836692e-07,
      "logps/chosen": -171.03106689453125,
      "logps/rejected": -305.4718933105469,
      "loss": 0.4056,
      "rewards/chosen": -1.0309761762619019,
      "rewards/margins": 2.5201992988586426,
      "rewards/rejected": -3.551175594329834,
      "step": 189
    },
    {
      "epoch": 0.05,
      "grad_norm": 43.78743362426758,
      "kl": 0.0,
      "learning_rate": 4.751373985867574e-07,
      "logps/chosen": -203.0801239013672,
      "logps/rejected": -247.2637939453125,
      "loss": 0.3496,
      "rewards/chosen": 0.1427454948425293,
      "rewards/margins": 2.5608232021331787,
      "rewards/rejected": -2.4180777072906494,
      "step": 190
    },
    {
      "epoch": 0.05,
      "grad_norm": 34.81067657470703,
      "kl": 0.0,
      "learning_rate": 4.750065427898456e-07,
      "logps/chosen": -177.9116973876953,
      "logps/rejected": -285.660400390625,
      "loss": 0.3558,
      "rewards/chosen": 0.284568727016449,
      "rewards/margins": 2.3395156860351562,
      "rewards/rejected": -2.0549468994140625,
      "step": 191
    },
    {
      "epoch": 0.05,
      "grad_norm": 36.1607780456543,
      "kl": 0.0,
      "learning_rate": 4.748756869929338e-07,
      "logps/chosen": -238.54434204101562,
      "logps/rejected": -200.06578063964844,
      "loss": 0.3894,
      "rewards/chosen": -0.9025675654411316,
      "rewards/margins": -0.14597797393798828,
      "rewards/rejected": -0.7565895915031433,
      "step": 192
    },
    {
      "epoch": 0.05,
      "grad_norm": 36.35572814941406,
      "kl": 0.0,
      "learning_rate": 4.74744831196022e-07,
      "logps/chosen": -176.56765747070312,
      "logps/rejected": -160.97991943359375,
      "loss": 0.4684,
      "rewards/chosen": -0.44666993618011475,
      "rewards/margins": 1.3998445272445679,
      "rewards/rejected": -1.8465144634246826,
      "step": 193
    },
    {
      "epoch": 0.05,
      "grad_norm": 38.28248977661133,
      "kl": 0.0,
      "learning_rate": 4.746139753991101e-07,
      "logps/chosen": -267.4242858886719,
      "logps/rejected": -227.41146850585938,
      "loss": 0.3664,
      "rewards/chosen": -1.034665822982788,
      "rewards/margins": 1.0908312797546387,
      "rewards/rejected": -2.1254971027374268,
      "step": 194
    },
    {
      "epoch": 0.05,
      "grad_norm": 41.40991973876953,
      "kl": 0.0,
      "learning_rate": 4.744831196021983e-07,
      "logps/chosen": -215.8043212890625,
      "logps/rejected": -260.5753173828125,
      "loss": 0.4136,
      "rewards/chosen": -0.8841571807861328,
      "rewards/margins": 2.375264883041382,
      "rewards/rejected": -3.2594220638275146,
      "step": 195
    },
    {
      "epoch": 0.05,
      "grad_norm": 30.04712677001953,
      "kl": 0.0,
      "learning_rate": 4.743522638052865e-07,
      "logps/chosen": -247.63121032714844,
      "logps/rejected": -191.33665466308594,
      "loss": 0.4148,
      "rewards/chosen": -2.485023021697998,
      "rewards/margins": 0.23484015464782715,
      "rewards/rejected": -2.719863176345825,
      "step": 196
    },
    {
      "epoch": 0.05,
      "grad_norm": 35.53774642944336,
      "kl": 0.0,
      "learning_rate": 4.742214080083747e-07,
      "logps/chosen": -199.6810302734375,
      "logps/rejected": -191.50827026367188,
      "loss": 0.295,
      "rewards/chosen": -1.0607842206954956,
      "rewards/margins": 1.159951090812683,
      "rewards/rejected": -2.2207353115081787,
      "step": 197
    },
    {
      "epoch": 0.05,
      "grad_norm": 38.35923767089844,
      "kl": 0.0,
      "learning_rate": 4.740905522114629e-07,
      "logps/chosen": -242.72299194335938,
      "logps/rejected": -247.96231079101562,
      "loss": 0.4228,
      "rewards/chosen": -1.3263717889785767,
      "rewards/margins": 1.3898276090621948,
      "rewards/rejected": -2.7161993980407715,
      "step": 198
    },
    {
      "epoch": 0.05,
      "grad_norm": 29.171445846557617,
      "kl": 0.0,
      "learning_rate": 4.7395969641455116e-07,
      "logps/chosen": -260.06341552734375,
      "logps/rejected": -262.1416931152344,
      "loss": 0.3828,
      "rewards/chosen": -2.1225051879882812,
      "rewards/margins": 1.466343641281128,
      "rewards/rejected": -3.588848829269409,
      "step": 199
    },
    {
      "epoch": 0.05,
      "grad_norm": 33.99834060668945,
      "kl": 0.0,
      "learning_rate": 4.7382884061763935e-07,
      "logps/chosen": -204.8150634765625,
      "logps/rejected": -220.98208618164062,
      "loss": 0.3487,
      "rewards/chosen": -0.6152539253234863,
      "rewards/margins": 1.9658851623535156,
      "rewards/rejected": -2.581139087677002,
      "step": 200
    },
    {
      "epoch": 0.05,
      "grad_norm": 23.109434127807617,
      "kl": 0.0,
      "learning_rate": 4.7369798482072755e-07,
      "logps/chosen": -189.38824462890625,
      "logps/rejected": -225.818115234375,
      "loss": 0.3498,
      "rewards/chosen": -1.0156517028808594,
      "rewards/margins": 2.376852035522461,
      "rewards/rejected": -3.3925037384033203,
      "step": 201
    },
    {
      "epoch": 0.05,
      "grad_norm": 31.507305145263672,
      "kl": 0.0,
      "learning_rate": 4.7356712902381575e-07,
      "logps/chosen": -225.10400390625,
      "logps/rejected": -249.22921752929688,
      "loss": 0.3674,
      "rewards/chosen": -0.7410069108009338,
      "rewards/margins": 1.9564170837402344,
      "rewards/rejected": -2.6974239349365234,
      "step": 202
    },
    {
      "epoch": 0.05,
      "grad_norm": 40.36268615722656,
      "kl": 0.0,
      "learning_rate": 4.7343627322690394e-07,
      "logps/chosen": -241.38609313964844,
      "logps/rejected": -219.32449340820312,
      "loss": 0.3755,
      "rewards/chosen": 1.6069724559783936,
      "rewards/margins": 2.9934029579162598,
      "rewards/rejected": -1.3864305019378662,
      "step": 203
    },
    {
      "epoch": 0.05,
      "grad_norm": 31.71283531188965,
      "kl": 0.0,
      "learning_rate": 4.7330541742999214e-07,
      "logps/chosen": -245.4090576171875,
      "logps/rejected": -264.3318786621094,
      "loss": 0.3452,
      "rewards/chosen": -1.086578130722046,
      "rewards/margins": 0.6778701543807983,
      "rewards/rejected": -1.7644482851028442,
      "step": 204
    },
    {
      "epoch": 0.05,
      "grad_norm": 34.97480010986328,
      "kl": 0.0,
      "learning_rate": 4.7317456163308034e-07,
      "logps/chosen": -284.29791259765625,
      "logps/rejected": -244.2501678466797,
      "loss": 0.2654,
      "rewards/chosen": -0.4497016370296478,
      "rewards/margins": 2.9477717876434326,
      "rewards/rejected": -3.3974733352661133,
      "step": 205
    },
    {
      "epoch": 0.05,
      "grad_norm": 42.56742858886719,
      "kl": 0.0,
      "learning_rate": 4.7304370583616853e-07,
      "logps/chosen": -275.45928955078125,
      "logps/rejected": -223.07778930664062,
      "loss": 0.3474,
      "rewards/chosen": -0.8359074592590332,
      "rewards/margins": 1.7483019828796387,
      "rewards/rejected": -2.584209442138672,
      "step": 206
    },
    {
      "epoch": 0.05,
      "grad_norm": 33.80686569213867,
      "kl": 0.0,
      "learning_rate": 4.7291285003925673e-07,
      "logps/chosen": -180.66078186035156,
      "logps/rejected": -202.80429077148438,
      "loss": 0.308,
      "rewards/chosen": -1.281614899635315,
      "rewards/margins": 0.863860011100769,
      "rewards/rejected": -2.145474910736084,
      "step": 207
    },
    {
      "epoch": 0.05,
      "grad_norm": 37.145477294921875,
      "kl": 0.0,
      "learning_rate": 4.727819942423449e-07,
      "logps/chosen": -290.77020263671875,
      "logps/rejected": -283.8406066894531,
      "loss": 0.4507,
      "rewards/chosen": -1.2572423219680786,
      "rewards/margins": 2.4235076904296875,
      "rewards/rejected": -3.6807498931884766,
      "step": 208
    },
    {
      "epoch": 0.05,
      "grad_norm": 35.7796745300293,
      "kl": 0.0,
      "learning_rate": 4.7265113844543307e-07,
      "logps/chosen": -219.49148559570312,
      "logps/rejected": -117.24064636230469,
      "loss": 0.444,
      "rewards/chosen": -0.850780189037323,
      "rewards/margins": 0.35081928968429565,
      "rewards/rejected": -1.2015994787216187,
      "step": 209
    },
    {
      "epoch": 0.05,
      "grad_norm": 26.550434112548828,
      "kl": 0.0,
      "learning_rate": 4.7252028264852126e-07,
      "logps/chosen": -162.2927703857422,
      "logps/rejected": -262.95635986328125,
      "loss": 0.3816,
      "rewards/chosen": -0.9790937304496765,
      "rewards/margins": 1.541642189025879,
      "rewards/rejected": -2.5207359790802,
      "step": 210
    },
    {
      "epoch": 0.06,
      "grad_norm": 38.59910583496094,
      "kl": 0.0,
      "learning_rate": 4.7238942685160946e-07,
      "logps/chosen": -220.10491943359375,
      "logps/rejected": -193.5897216796875,
      "loss": 0.2675,
      "rewards/chosen": 0.22388845682144165,
      "rewards/margins": 1.840318202972412,
      "rewards/rejected": -1.6164298057556152,
      "step": 211
    },
    {
      "epoch": 0.06,
      "grad_norm": 31.72176742553711,
      "kl": 0.0,
      "learning_rate": 4.722585710546977e-07,
      "logps/chosen": -158.26223754882812,
      "logps/rejected": -349.89923095703125,
      "loss": 0.4544,
      "rewards/chosen": -0.953671932220459,
      "rewards/margins": 5.621851444244385,
      "rewards/rejected": -6.575523376464844,
      "step": 212
    },
    {
      "epoch": 0.06,
      "grad_norm": 35.59261703491211,
      "kl": 0.0,
      "learning_rate": 4.721277152577859e-07,
      "logps/chosen": -188.9051513671875,
      "logps/rejected": -229.09368896484375,
      "loss": 0.3985,
      "rewards/chosen": -0.7103174924850464,
      "rewards/margins": 1.4962338209152222,
      "rewards/rejected": -2.2065513134002686,
      "step": 213
    },
    {
      "epoch": 0.06,
      "grad_norm": 29.88261604309082,
      "kl": 0.0,
      "learning_rate": 4.719968594608741e-07,
      "logps/chosen": -258.1292419433594,
      "logps/rejected": -285.9716491699219,
      "loss": 0.2742,
      "rewards/chosen": -0.8898869752883911,
      "rewards/margins": 4.2793169021606445,
      "rewards/rejected": -5.169203758239746,
      "step": 214
    },
    {
      "epoch": 0.06,
      "grad_norm": 31.082521438598633,
      "kl": 0.0,
      "learning_rate": 4.718660036639623e-07,
      "logps/chosen": -235.5315704345703,
      "logps/rejected": -142.43453979492188,
      "loss": 0.3042,
      "rewards/chosen": -1.472436785697937,
      "rewards/margins": 0.5655766725540161,
      "rewards/rejected": -2.038013458251953,
      "step": 215
    },
    {
      "epoch": 0.06,
      "grad_norm": 45.72218704223633,
      "kl": 0.0,
      "learning_rate": 4.717351478670505e-07,
      "logps/chosen": -277.6252136230469,
      "logps/rejected": -190.9403839111328,
      "loss": 0.3446,
      "rewards/chosen": 0.9356330037117004,
      "rewards/margins": 2.0501766204833984,
      "rewards/rejected": -1.1145436763763428,
      "step": 216
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.96467971801758,
      "kl": 0.0,
      "learning_rate": 4.716042920701387e-07,
      "logps/chosen": -194.5904541015625,
      "logps/rejected": -251.4320068359375,
      "loss": 0.4222,
      "rewards/chosen": -0.46582919359207153,
      "rewards/margins": 1.6109564304351807,
      "rewards/rejected": -2.0767855644226074,
      "step": 217
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.587127685546875,
      "kl": 0.0,
      "learning_rate": 4.714734362732269e-07,
      "logps/chosen": -250.0608673095703,
      "logps/rejected": -217.1116943359375,
      "loss": 0.2782,
      "rewards/chosen": -0.5743464231491089,
      "rewards/margins": 1.952079176902771,
      "rewards/rejected": -2.52642560005188,
      "step": 218
    },
    {
      "epoch": 0.06,
      "grad_norm": 27.2506046295166,
      "kl": 0.0,
      "learning_rate": 4.713425804763151e-07,
      "logps/chosen": -150.00576782226562,
      "logps/rejected": -322.8671875,
      "loss": 0.1698,
      "rewards/chosen": -0.5786823034286499,
      "rewards/margins": 3.880950450897217,
      "rewards/rejected": -4.459632873535156,
      "step": 219
    },
    {
      "epoch": 0.06,
      "grad_norm": 31.029760360717773,
      "kl": 0.0,
      "learning_rate": 4.712117246794033e-07,
      "logps/chosen": -250.29478454589844,
      "logps/rejected": -235.32687377929688,
      "loss": 0.3649,
      "rewards/chosen": -1.7568782567977905,
      "rewards/margins": 1.4462846517562866,
      "rewards/rejected": -3.203162908554077,
      "step": 220
    },
    {
      "epoch": 0.06,
      "grad_norm": 38.26043701171875,
      "kl": 0.0,
      "learning_rate": 4.7108086888249147e-07,
      "logps/chosen": -279.1617431640625,
      "logps/rejected": -174.76123046875,
      "loss": 0.4814,
      "rewards/chosen": -1.5011340379714966,
      "rewards/margins": 0.26141786575317383,
      "rewards/rejected": -1.7625519037246704,
      "step": 221
    },
    {
      "epoch": 0.06,
      "grad_norm": 35.2736701965332,
      "kl": 0.0,
      "learning_rate": 4.7095001308557967e-07,
      "logps/chosen": -215.34039306640625,
      "logps/rejected": -221.15383911132812,
      "loss": 0.3409,
      "rewards/chosen": -0.45132410526275635,
      "rewards/margins": 2.0330100059509277,
      "rewards/rejected": -2.4843342304229736,
      "step": 222
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.65085983276367,
      "kl": 0.0,
      "learning_rate": 4.7081915728866786e-07,
      "logps/chosen": -194.25527954101562,
      "logps/rejected": -124.3009033203125,
      "loss": 0.4022,
      "rewards/chosen": -0.9982702732086182,
      "rewards/margins": 0.494891881942749,
      "rewards/rejected": -1.4931621551513672,
      "step": 223
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.66006851196289,
      "kl": 0.0,
      "learning_rate": 4.706883014917561e-07,
      "logps/chosen": -213.27227783203125,
      "logps/rejected": -195.74505615234375,
      "loss": 0.3352,
      "rewards/chosen": -0.09169921278953552,
      "rewards/margins": 1.9126981496810913,
      "rewards/rejected": -2.004397392272949,
      "step": 224
    },
    {
      "epoch": 0.06,
      "grad_norm": 37.03841018676758,
      "kl": 0.0,
      "learning_rate": 4.7055744569484426e-07,
      "logps/chosen": -174.44583129882812,
      "logps/rejected": -203.36178588867188,
      "loss": 0.4792,
      "rewards/chosen": -0.9792169332504272,
      "rewards/margins": 0.39964139461517334,
      "rewards/rejected": -1.3788583278656006,
      "step": 225
    },
    {
      "epoch": 0.06,
      "grad_norm": 41.96936798095703,
      "kl": 0.0,
      "learning_rate": 4.7042658989793245e-07,
      "logps/chosen": -222.01852416992188,
      "logps/rejected": -235.48773193359375,
      "loss": 0.4073,
      "rewards/chosen": -1.3425058126449585,
      "rewards/margins": 0.3740893602371216,
      "rewards/rejected": -1.71659517288208,
      "step": 226
    },
    {
      "epoch": 0.06,
      "grad_norm": 40.41291427612305,
      "kl": 0.0,
      "learning_rate": 4.7029573410102065e-07,
      "logps/chosen": -274.5633544921875,
      "logps/rejected": -268.20489501953125,
      "loss": 0.4272,
      "rewards/chosen": -1.1389542818069458,
      "rewards/margins": 2.3568062782287598,
      "rewards/rejected": -3.495760440826416,
      "step": 227
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.06272506713867,
      "kl": 0.0,
      "learning_rate": 4.7016487830410885e-07,
      "logps/chosen": -245.97525024414062,
      "logps/rejected": -242.12374877929688,
      "loss": 0.3408,
      "rewards/chosen": -1.1917027235031128,
      "rewards/margins": 1.347025752067566,
      "rewards/rejected": -2.5387284755706787,
      "step": 228
    },
    {
      "epoch": 0.06,
      "grad_norm": 42.17586898803711,
      "kl": 0.0,
      "learning_rate": 4.7003402250719704e-07,
      "logps/chosen": -197.28846740722656,
      "logps/rejected": -220.36756896972656,
      "loss": 0.4867,
      "rewards/chosen": -1.028113603591919,
      "rewards/margins": 1.651493787765503,
      "rewards/rejected": -2.679607391357422,
      "step": 229
    },
    {
      "epoch": 0.06,
      "grad_norm": 44.75341033935547,
      "kl": 0.0,
      "learning_rate": 4.6990316671028524e-07,
      "logps/chosen": -224.07839965820312,
      "logps/rejected": -217.51248168945312,
      "loss": 0.3321,
      "rewards/chosen": -0.5282418131828308,
      "rewards/margins": 2.796116590499878,
      "rewards/rejected": -3.3243584632873535,
      "step": 230
    },
    {
      "epoch": 0.06,
      "grad_norm": 37.11387634277344,
      "kl": 0.0,
      "learning_rate": 4.6977231091337343e-07,
      "logps/chosen": -137.24411010742188,
      "logps/rejected": -313.54193115234375,
      "loss": 0.359,
      "rewards/chosen": -0.643952488899231,
      "rewards/margins": 2.7075109481811523,
      "rewards/rejected": -3.3514633178710938,
      "step": 231
    },
    {
      "epoch": 0.06,
      "grad_norm": 34.26787185668945,
      "kl": 0.0,
      "learning_rate": 4.6964145511646163e-07,
      "logps/chosen": -167.1756134033203,
      "logps/rejected": -229.2905731201172,
      "loss": 0.2363,
      "rewards/chosen": -0.5329712629318237,
      "rewards/margins": 1.356623649597168,
      "rewards/rejected": -1.8895949125289917,
      "step": 232
    },
    {
      "epoch": 0.06,
      "grad_norm": 35.61827850341797,
      "kl": 0.0,
      "learning_rate": 4.695105993195498e-07,
      "logps/chosen": -197.7938690185547,
      "logps/rejected": -295.2776794433594,
      "loss": 0.3338,
      "rewards/chosen": -0.19512000679969788,
      "rewards/margins": 2.5928189754486084,
      "rewards/rejected": -2.7879390716552734,
      "step": 233
    },
    {
      "epoch": 0.06,
      "grad_norm": 49.267677307128906,
      "kl": 0.0,
      "learning_rate": 4.69379743522638e-07,
      "logps/chosen": -235.7493896484375,
      "logps/rejected": -297.271240234375,
      "loss": 0.4593,
      "rewards/chosen": -0.2626681625843048,
      "rewards/margins": 1.3399256467819214,
      "rewards/rejected": -1.6025937795639038,
      "step": 234
    },
    {
      "epoch": 0.06,
      "grad_norm": 31.048612594604492,
      "kl": 0.0,
      "learning_rate": 4.692488877257262e-07,
      "logps/chosen": -232.46897888183594,
      "logps/rejected": -274.14178466796875,
      "loss": 0.3967,
      "rewards/chosen": -1.576374888420105,
      "rewards/margins": 1.132400631904602,
      "rewards/rejected": -2.708775520324707,
      "step": 235
    },
    {
      "epoch": 0.06,
      "grad_norm": 34.81768035888672,
      "kl": 0.0,
      "learning_rate": 4.691180319288144e-07,
      "logps/chosen": -186.15216064453125,
      "logps/rejected": -307.45843505859375,
      "loss": 0.322,
      "rewards/chosen": -0.2905099391937256,
      "rewards/margins": 1.9975306987762451,
      "rewards/rejected": -2.2880406379699707,
      "step": 236
    },
    {
      "epoch": 0.06,
      "grad_norm": 44.079856872558594,
      "kl": 0.0,
      "learning_rate": 4.6898717613190266e-07,
      "logps/chosen": -294.75274658203125,
      "logps/rejected": -263.3562316894531,
      "loss": 0.3592,
      "rewards/chosen": -0.2211398184299469,
      "rewards/margins": 2.107311487197876,
      "rewards/rejected": -2.32845139503479,
      "step": 237
    },
    {
      "epoch": 0.06,
      "grad_norm": 32.26349639892578,
      "kl": 0.0,
      "learning_rate": 4.6885632033499086e-07,
      "logps/chosen": -204.82138061523438,
      "logps/rejected": -248.318603515625,
      "loss": 0.3495,
      "rewards/chosen": 0.03919875621795654,
      "rewards/margins": 2.928051471710205,
      "rewards/rejected": -2.888852834701538,
      "step": 238
    },
    {
      "epoch": 0.06,
      "grad_norm": 35.30009078979492,
      "kl": 0.0,
      "learning_rate": 4.6872546453807906e-07,
      "logps/chosen": -306.24169921875,
      "logps/rejected": -213.28111267089844,
      "loss": 0.3976,
      "rewards/chosen": -1.4108914136886597,
      "rewards/margins": 0.8999141454696655,
      "rewards/rejected": -2.310805559158325,
      "step": 239
    },
    {
      "epoch": 0.06,
      "grad_norm": 35.18433380126953,
      "kl": 0.0,
      "learning_rate": 4.685946087411672e-07,
      "logps/chosen": -258.08056640625,
      "logps/rejected": -275.6287841796875,
      "loss": 0.4349,
      "rewards/chosen": -0.9918340444564819,
      "rewards/margins": 2.3593192100524902,
      "rewards/rejected": -3.3511531352996826,
      "step": 240
    },
    {
      "epoch": 0.06,
      "grad_norm": 24.199462890625,
      "kl": 0.0,
      "learning_rate": 4.684637529442554e-07,
      "logps/chosen": -202.14227294921875,
      "logps/rejected": -251.01712036132812,
      "loss": 0.2988,
      "rewards/chosen": -2.8060295581817627,
      "rewards/margins": 1.014716625213623,
      "rewards/rejected": -3.8207461833953857,
      "step": 241
    },
    {
      "epoch": 0.06,
      "grad_norm": 27.472558975219727,
      "kl": 0.0,
      "learning_rate": 4.683328971473436e-07,
      "logps/chosen": -178.1534881591797,
      "logps/rejected": -194.3978729248047,
      "loss": 0.3999,
      "rewards/chosen": -1.4756015539169312,
      "rewards/margins": 1.0923720598220825,
      "rewards/rejected": -2.5679736137390137,
      "step": 242
    },
    {
      "epoch": 0.06,
      "grad_norm": 28.934558868408203,
      "kl": 0.0,
      "learning_rate": 4.682020413504318e-07,
      "logps/chosen": -232.8424835205078,
      "logps/rejected": -196.27610778808594,
      "loss": 0.25,
      "rewards/chosen": 0.7410671710968018,
      "rewards/margins": 3.822826862335205,
      "rewards/rejected": -3.0817596912384033,
      "step": 243
    },
    {
      "epoch": 0.06,
      "grad_norm": 38.57919692993164,
      "kl": 0.0,
      "learning_rate": 4.6807118555352e-07,
      "logps/chosen": -210.18600463867188,
      "logps/rejected": -185.69248962402344,
      "loss": 0.3098,
      "rewards/chosen": -1.65409255027771,
      "rewards/margins": 1.2091917991638184,
      "rewards/rejected": -2.8632843494415283,
      "step": 244
    },
    {
      "epoch": 0.06,
      "grad_norm": 40.655635833740234,
      "kl": 0.0,
      "learning_rate": 4.679403297566082e-07,
      "logps/chosen": -206.66592407226562,
      "logps/rejected": -244.49119567871094,
      "loss": 0.3466,
      "rewards/chosen": -0.9859999418258667,
      "rewards/margins": 1.3829981088638306,
      "rewards/rejected": -2.3689980506896973,
      "step": 245
    },
    {
      "epoch": 0.06,
      "grad_norm": 37.547794342041016,
      "kl": 0.0,
      "learning_rate": 4.678094739596964e-07,
      "logps/chosen": -256.795654296875,
      "logps/rejected": -254.77978515625,
      "loss": 0.3306,
      "rewards/chosen": -1.4593631029129028,
      "rewards/margins": 2.0132813453674316,
      "rewards/rejected": -3.472644567489624,
      "step": 246
    },
    {
      "epoch": 0.06,
      "grad_norm": 36.778751373291016,
      "kl": 0.0,
      "learning_rate": 4.6767861816278457e-07,
      "logps/chosen": -260.1071472167969,
      "logps/rejected": -200.48402404785156,
      "loss": 0.2881,
      "rewards/chosen": 1.2593364715576172,
      "rewards/margins": 3.518388271331787,
      "rewards/rejected": -2.25905179977417,
      "step": 247
    },
    {
      "epoch": 0.06,
      "grad_norm": 38.92163848876953,
      "kl": 0.0,
      "learning_rate": 4.6754776236587277e-07,
      "logps/chosen": -234.36021423339844,
      "logps/rejected": -215.75294494628906,
      "loss": 0.3333,
      "rewards/chosen": 0.5298810601234436,
      "rewards/margins": 2.223097324371338,
      "rewards/rejected": -1.693216323852539,
      "step": 248
    },
    {
      "epoch": 0.07,
      "grad_norm": 36.91664505004883,
      "kl": 0.0,
      "learning_rate": 4.6741690656896096e-07,
      "logps/chosen": -258.3697204589844,
      "logps/rejected": -216.38294982910156,
      "loss": 0.4925,
      "rewards/chosen": -0.6519094705581665,
      "rewards/margins": 1.3434323072433472,
      "rewards/rejected": -1.9953417778015137,
      "step": 249
    },
    {
      "epoch": 0.07,
      "grad_norm": 31.283388137817383,
      "kl": 0.0,
      "learning_rate": 4.672860507720492e-07,
      "logps/chosen": -216.1317596435547,
      "logps/rejected": -266.24176025390625,
      "loss": 0.361,
      "rewards/chosen": 0.45249366760253906,
      "rewards/margins": 3.7620913982391357,
      "rewards/rejected": -3.3095977306365967,
      "step": 250
    },
    {
      "epoch": 0.07,
      "grad_norm": 39.949859619140625,
      "kl": 0.0,
      "learning_rate": 4.671551949751374e-07,
      "logps/chosen": -291.876708984375,
      "logps/rejected": -256.897216796875,
      "loss": 0.4256,
      "rewards/chosen": -2.2693746089935303,
      "rewards/margins": 0.718336820602417,
      "rewards/rejected": -2.9877114295959473,
      "step": 251
    },
    {
      "epoch": 0.07,
      "grad_norm": 48.09956741333008,
      "kl": 0.0,
      "learning_rate": 4.670243391782256e-07,
      "logps/chosen": -307.2916564941406,
      "logps/rejected": -232.60179138183594,
      "loss": 0.3285,
      "rewards/chosen": -2.1419198513031006,
      "rewards/margins": 0.21775078773498535,
      "rewards/rejected": -2.359670639038086,
      "step": 252
    },
    {
      "epoch": 0.07,
      "grad_norm": 30.89328956604004,
      "kl": 0.0,
      "learning_rate": 4.668934833813138e-07,
      "logps/chosen": -253.2288055419922,
      "logps/rejected": -260.0640869140625,
      "loss": 0.3365,
      "rewards/chosen": -0.9447171092033386,
      "rewards/margins": 3.576873540878296,
      "rewards/rejected": -4.521590709686279,
      "step": 253
    },
    {
      "epoch": 0.07,
      "grad_norm": 47.67763137817383,
      "kl": 0.0,
      "learning_rate": 4.66762627584402e-07,
      "logps/chosen": -176.8143768310547,
      "logps/rejected": -242.53236389160156,
      "loss": 0.2547,
      "rewards/chosen": 1.01246976852417,
      "rewards/margins": 3.594667434692383,
      "rewards/rejected": -2.582197666168213,
      "step": 254
    },
    {
      "epoch": 0.07,
      "grad_norm": 35.259056091308594,
      "kl": 0.0,
      "learning_rate": 4.666317717874902e-07,
      "logps/chosen": -226.5983428955078,
      "logps/rejected": -223.51144409179688,
      "loss": 0.263,
      "rewards/chosen": -0.17412535846233368,
      "rewards/margins": 2.4813520908355713,
      "rewards/rejected": -2.655477523803711,
      "step": 255
    },
    {
      "epoch": 0.07,
      "grad_norm": 41.33015823364258,
      "kl": 0.0,
      "learning_rate": 4.6650091599057834e-07,
      "logps/chosen": -291.62744140625,
      "logps/rejected": -264.74493408203125,
      "loss": 0.366,
      "rewards/chosen": -0.48753511905670166,
      "rewards/margins": 3.4678664207458496,
      "rewards/rejected": -3.955401659011841,
      "step": 256
    },
    {
      "epoch": 0.07,
      "grad_norm": 39.72134017944336,
      "kl": 0.0,
      "learning_rate": 4.6637006019366653e-07,
      "logps/chosen": -302.4512634277344,
      "logps/rejected": -221.8700408935547,
      "loss": 0.305,
      "rewards/chosen": -0.7385875582695007,
      "rewards/margins": 1.6246614456176758,
      "rewards/rejected": -2.3632490634918213,
      "step": 257
    },
    {
      "epoch": 0.07,
      "grad_norm": 39.22993850708008,
      "kl": 0.0,
      "learning_rate": 4.6623920439675473e-07,
      "logps/chosen": -239.31459045410156,
      "logps/rejected": -185.0542449951172,
      "loss": 0.3941,
      "rewards/chosen": -0.697856068611145,
      "rewards/margins": 1.7438751459121704,
      "rewards/rejected": -2.4417312145233154,
      "step": 258
    },
    {
      "epoch": 0.07,
      "grad_norm": 39.030784606933594,
      "kl": 0.0,
      "learning_rate": 4.661083485998429e-07,
      "logps/chosen": -214.9169464111328,
      "logps/rejected": -231.33749389648438,
      "loss": 0.3734,
      "rewards/chosen": -0.9121836423873901,
      "rewards/margins": 2.3024215698242188,
      "rewards/rejected": -3.2146050930023193,
      "step": 259
    },
    {
      "epoch": 0.07,
      "grad_norm": 30.60447120666504,
      "kl": 0.0,
      "learning_rate": 4.659774928029311e-07,
      "logps/chosen": -237.2434539794922,
      "logps/rejected": -217.3273162841797,
      "loss": 0.3293,
      "rewards/chosen": -2.5721724033355713,
      "rewards/margins": 0.5294625759124756,
      "rewards/rejected": -3.101634979248047,
      "step": 260
    },
    {
      "epoch": 0.07,
      "grad_norm": 35.15093231201172,
      "kl": 0.0,
      "learning_rate": 4.658466370060193e-07,
      "logps/chosen": -251.8607635498047,
      "logps/rejected": -244.05323791503906,
      "loss": 0.2917,
      "rewards/chosen": -1.1427861452102661,
      "rewards/margins": 1.8218802213668823,
      "rewards/rejected": -2.9646663665771484,
      "step": 261
    },
    {
      "epoch": 0.07,
      "grad_norm": 42.44389724731445,
      "kl": 0.0,
      "learning_rate": 4.657157812091075e-07,
      "logps/chosen": -237.9396209716797,
      "logps/rejected": -263.0314025878906,
      "loss": 0.42,
      "rewards/chosen": -0.2850094735622406,
      "rewards/margins": 1.6426814794540405,
      "rewards/rejected": -1.9276909828186035,
      "step": 262
    },
    {
      "epoch": 0.07,
      "grad_norm": 37.14183044433594,
      "kl": 0.0,
      "learning_rate": 4.6558492541219576e-07,
      "logps/chosen": -185.787109375,
      "logps/rejected": -261.66461181640625,
      "loss": 0.446,
      "rewards/chosen": -1.9390316009521484,
      "rewards/margins": 0.6079616546630859,
      "rewards/rejected": -2.5469932556152344,
      "step": 263
    },
    {
      "epoch": 0.07,
      "grad_norm": 33.57796859741211,
      "kl": 0.0,
      "learning_rate": 4.6545406961528396e-07,
      "logps/chosen": -157.2796630859375,
      "logps/rejected": -283.6850891113281,
      "loss": 0.2372,
      "rewards/chosen": -0.8357820510864258,
      "rewards/margins": 2.492841958999634,
      "rewards/rejected": -3.3286240100860596,
      "step": 264
    },
    {
      "epoch": 0.07,
      "grad_norm": 40.712528228759766,
      "kl": 0.0,
      "learning_rate": 4.6532321381837215e-07,
      "logps/chosen": -220.23959350585938,
      "logps/rejected": -255.13534545898438,
      "loss": 0.4645,
      "rewards/chosen": -1.641434907913208,
      "rewards/margins": 0.34841763973236084,
      "rewards/rejected": -1.9898525476455688,
      "step": 265
    },
    {
      "epoch": 0.07,
      "grad_norm": 35.535221099853516,
      "kl": 0.0,
      "learning_rate": 4.6519235802146035e-07,
      "logps/chosen": -223.94854736328125,
      "logps/rejected": -246.826416015625,
      "loss": 0.4288,
      "rewards/chosen": -1.153721570968628,
      "rewards/margins": 0.5903757810592651,
      "rewards/rejected": -1.744097352027893,
      "step": 266
    },
    {
      "epoch": 0.07,
      "grad_norm": 42.14219665527344,
      "kl": 0.0,
      "learning_rate": 4.6506150222454855e-07,
      "logps/chosen": -211.50079345703125,
      "logps/rejected": -159.27609252929688,
      "loss": 0.3998,
      "rewards/chosen": -1.4032256603240967,
      "rewards/margins": 0.3597506284713745,
      "rewards/rejected": -1.7629762887954712,
      "step": 267
    },
    {
      "epoch": 0.07,
      "grad_norm": 38.34206771850586,
      "kl": 0.0,
      "learning_rate": 4.6493064642763674e-07,
      "logps/chosen": -269.44757080078125,
      "logps/rejected": -209.7416534423828,
      "loss": 0.4226,
      "rewards/chosen": -0.985856831073761,
      "rewards/margins": 1.1115570068359375,
      "rewards/rejected": -2.0974137783050537,
      "step": 268
    },
    {
      "epoch": 0.07,
      "grad_norm": 37.94346237182617,
      "kl": 0.0,
      "learning_rate": 4.6479979063072494e-07,
      "logps/chosen": -225.378173828125,
      "logps/rejected": -273.8226318359375,
      "loss": 0.3977,
      "rewards/chosen": -1.5155614614486694,
      "rewards/margins": 1.2213491201400757,
      "rewards/rejected": -2.736910581588745,
      "step": 269
    },
    {
      "epoch": 0.07,
      "grad_norm": 32.25834655761719,
      "kl": 0.0,
      "learning_rate": 4.6466893483381313e-07,
      "logps/chosen": -199.16827392578125,
      "logps/rejected": -248.81382751464844,
      "loss": 0.2876,
      "rewards/chosen": -0.5442147254943848,
      "rewards/margins": 2.8547651767730713,
      "rewards/rejected": -3.398979902267456,
      "step": 270
    },
    {
      "epoch": 0.07,
      "grad_norm": 31.128337860107422,
      "kl": 0.0,
      "learning_rate": 4.645380790369013e-07,
      "logps/chosen": -197.1934356689453,
      "logps/rejected": -247.61465454101562,
      "loss": 0.3763,
      "rewards/chosen": -1.405363917350769,
      "rewards/margins": 1.312684178352356,
      "rewards/rejected": -2.718048095703125,
      "step": 271
    },
    {
      "epoch": 0.07,
      "grad_norm": 29.893508911132812,
      "kl": 0.0,
      "learning_rate": 4.6440722323998947e-07,
      "logps/chosen": -136.73426818847656,
      "logps/rejected": -273.2953796386719,
      "loss": 0.2353,
      "rewards/chosen": -0.5470638871192932,
      "rewards/margins": 3.5090365409851074,
      "rewards/rejected": -4.056100368499756,
      "step": 272
    },
    {
      "epoch": 0.07,
      "grad_norm": 41.489437103271484,
      "kl": 0.0,
      "learning_rate": 4.6427636744307767e-07,
      "logps/chosen": -227.7633056640625,
      "logps/rejected": -265.1458740234375,
      "loss": 0.4804,
      "rewards/chosen": -1.4401181936264038,
      "rewards/margins": 1.0054680109024048,
      "rewards/rejected": -2.4455862045288086,
      "step": 273
    },
    {
      "epoch": 0.07,
      "grad_norm": 37.44647216796875,
      "kl": 0.0,
      "learning_rate": 4.6414551164616587e-07,
      "logps/chosen": -255.96173095703125,
      "logps/rejected": -239.48736572265625,
      "loss": 0.3189,
      "rewards/chosen": 0.37182945013046265,
      "rewards/margins": 3.4821300506591797,
      "rewards/rejected": -3.1103005409240723,
      "step": 274
    },
    {
      "epoch": 0.07,
      "grad_norm": 29.41668701171875,
      "kl": 0.0,
      "learning_rate": 4.640146558492541e-07,
      "logps/chosen": -201.92637634277344,
      "logps/rejected": -269.31353759765625,
      "loss": 0.3188,
      "rewards/chosen": -1.8693069219589233,
      "rewards/margins": 2.036485195159912,
      "rewards/rejected": -3.905791997909546,
      "step": 275
    },
    {
      "epoch": 0.07,
      "grad_norm": 38.13005447387695,
      "kl": 0.0,
      "learning_rate": 4.638838000523423e-07,
      "logps/chosen": -295.34039306640625,
      "logps/rejected": -117.98519134521484,
      "loss": 0.4628,
      "rewards/chosen": -1.9872322082519531,
      "rewards/margins": 0.26657724380493164,
      "rewards/rejected": -2.2538094520568848,
      "step": 276
    },
    {
      "epoch": 0.07,
      "grad_norm": 37.00981521606445,
      "kl": 0.0,
      "learning_rate": 4.637529442554305e-07,
      "logps/chosen": -220.0794219970703,
      "logps/rejected": -217.39553833007812,
      "loss": 0.3312,
      "rewards/chosen": -0.0731474757194519,
      "rewards/margins": 2.3764216899871826,
      "rewards/rejected": -2.4495692253112793,
      "step": 277
    },
    {
      "epoch": 0.07,
      "grad_norm": 33.918983459472656,
      "kl": 0.0,
      "learning_rate": 4.636220884585187e-07,
      "logps/chosen": -238.72340393066406,
      "logps/rejected": -212.756103515625,
      "loss": 0.3588,
      "rewards/chosen": -0.8355458378791809,
      "rewards/margins": 1.306795358657837,
      "rewards/rejected": -2.142341136932373,
      "step": 278
    },
    {
      "epoch": 0.07,
      "grad_norm": 30.741132736206055,
      "kl": 0.0,
      "learning_rate": 4.634912326616069e-07,
      "logps/chosen": -169.1579132080078,
      "logps/rejected": -195.23297119140625,
      "loss": 0.2557,
      "rewards/chosen": 0.54413241147995,
      "rewards/margins": 3.2936644554138184,
      "rewards/rejected": -2.7495319843292236,
      "step": 279
    },
    {
      "epoch": 0.07,
      "grad_norm": 27.65318489074707,
      "kl": 0.0,
      "learning_rate": 4.633603768646951e-07,
      "logps/chosen": -157.45840454101562,
      "logps/rejected": -249.44149780273438,
      "loss": 0.3092,
      "rewards/chosen": -1.2606077194213867,
      "rewards/margins": 1.9259283542633057,
      "rewards/rejected": -3.1865360736846924,
      "step": 280
    },
    {
      "epoch": 0.07,
      "grad_norm": 41.376121520996094,
      "kl": 0.0,
      "learning_rate": 4.632295210677833e-07,
      "logps/chosen": -186.80088806152344,
      "logps/rejected": -292.2984924316406,
      "loss": 0.3037,
      "rewards/chosen": -1.9069658517837524,
      "rewards/margins": 2.8274898529052734,
      "rewards/rejected": -4.734455585479736,
      "step": 281
    },
    {
      "epoch": 0.07,
      "grad_norm": 35.813560485839844,
      "kl": 0.0,
      "learning_rate": 4.630986652708715e-07,
      "logps/chosen": -184.72157287597656,
      "logps/rejected": -164.49346923828125,
      "loss": 0.4109,
      "rewards/chosen": -1.1023026704788208,
      "rewards/margins": 1.4965165853500366,
      "rewards/rejected": -2.5988192558288574,
      "step": 282
    },
    {
      "epoch": 0.07,
      "grad_norm": 23.877840042114258,
      "kl": 0.0,
      "learning_rate": 4.629678094739597e-07,
      "logps/chosen": -279.3133850097656,
      "logps/rejected": -282.68597412109375,
      "loss": 0.4464,
      "rewards/chosen": -3.0280158519744873,
      "rewards/margins": 0.8431928157806396,
      "rewards/rejected": -3.871208667755127,
      "step": 283
    },
    {
      "epoch": 0.07,
      "grad_norm": 28.17823028564453,
      "kl": 0.0,
      "learning_rate": 4.628369536770479e-07,
      "logps/chosen": -279.7229919433594,
      "logps/rejected": -268.95501708984375,
      "loss": 0.3018,
      "rewards/chosen": -1.9918068647384644,
      "rewards/margins": 0.9593816995620728,
      "rewards/rejected": -2.951188564300537,
      "step": 284
    },
    {
      "epoch": 0.07,
      "grad_norm": 26.56211280822754,
      "kl": 0.0,
      "learning_rate": 4.627060978801361e-07,
      "logps/chosen": -231.16656494140625,
      "logps/rejected": -253.2298126220703,
      "loss": 0.3365,
      "rewards/chosen": -1.716474175453186,
      "rewards/margins": 0.8960174322128296,
      "rewards/rejected": -2.6124916076660156,
      "step": 285
    },
    {
      "epoch": 0.07,
      "grad_norm": 34.96949005126953,
      "kl": 0.0,
      "learning_rate": 4.6257524208322427e-07,
      "logps/chosen": -228.77427673339844,
      "logps/rejected": -317.2232666015625,
      "loss": 0.3701,
      "rewards/chosen": -1.3939743041992188,
      "rewards/margins": 2.045186758041382,
      "rewards/rejected": -3.4391610622406006,
      "step": 286
    },
    {
      "epoch": 0.08,
      "grad_norm": 31.497352600097656,
      "kl": 0.0,
      "learning_rate": 4.624443862863124e-07,
      "logps/chosen": -260.1445007324219,
      "logps/rejected": -292.47064208984375,
      "loss": 0.394,
      "rewards/chosen": -1.3395678997039795,
      "rewards/margins": 2.3608195781707764,
      "rewards/rejected": -3.700387477874756,
      "step": 287
    },
    {
      "epoch": 0.08,
      "grad_norm": 32.95969772338867,
      "kl": 0.0,
      "learning_rate": 4.6231353048940066e-07,
      "logps/chosen": -177.44320678710938,
      "logps/rejected": -211.9121551513672,
      "loss": 0.3771,
      "rewards/chosen": -1.091148853302002,
      "rewards/margins": 1.9192008972167969,
      "rewards/rejected": -3.010349750518799,
      "step": 288
    },
    {
      "epoch": 0.08,
      "grad_norm": 37.73873519897461,
      "kl": 0.0,
      "learning_rate": 4.6218267469248886e-07,
      "logps/chosen": -193.40924072265625,
      "logps/rejected": -239.34669494628906,
      "loss": 0.3655,
      "rewards/chosen": 0.05758616328239441,
      "rewards/margins": 3.134945869445801,
      "rewards/rejected": -3.077359676361084,
      "step": 289
    },
    {
      "epoch": 0.08,
      "grad_norm": 36.732391357421875,
      "kl": 0.0,
      "learning_rate": 4.6205181889557706e-07,
      "logps/chosen": -208.61558532714844,
      "logps/rejected": -299.9091796875,
      "loss": 0.395,
      "rewards/chosen": -1.2155559062957764,
      "rewards/margins": 3.3333466053009033,
      "rewards/rejected": -4.54890251159668,
      "step": 290
    },
    {
      "epoch": 0.08,
      "grad_norm": 37.14192199707031,
      "kl": 0.0,
      "learning_rate": 4.6192096309866525e-07,
      "logps/chosen": -298.52789306640625,
      "logps/rejected": -256.9224548339844,
      "loss": 0.3217,
      "rewards/chosen": -1.5316705703735352,
      "rewards/margins": 1.1650197505950928,
      "rewards/rejected": -2.696690320968628,
      "step": 291
    },
    {
      "epoch": 0.08,
      "grad_norm": 32.1880989074707,
      "kl": 0.0,
      "learning_rate": 4.6179010730175345e-07,
      "logps/chosen": -230.33580017089844,
      "logps/rejected": -185.30084228515625,
      "loss": 0.5325,
      "rewards/chosen": -2.6992344856262207,
      "rewards/margins": -1.3056819438934326,
      "rewards/rejected": -1.393552541732788,
      "step": 292
    },
    {
      "epoch": 0.08,
      "grad_norm": 29.584102630615234,
      "kl": 0.0,
      "learning_rate": 4.6165925150484164e-07,
      "logps/chosen": -213.70803833007812,
      "logps/rejected": -351.8007507324219,
      "loss": 0.2521,
      "rewards/chosen": 0.7046425938606262,
      "rewards/margins": 5.104078769683838,
      "rewards/rejected": -4.399435997009277,
      "step": 293
    },
    {
      "epoch": 0.08,
      "grad_norm": 38.5718879699707,
      "kl": 0.0,
      "learning_rate": 4.6152839570792984e-07,
      "logps/chosen": -226.90501403808594,
      "logps/rejected": -238.16050720214844,
      "loss": 0.3634,
      "rewards/chosen": -1.4450457096099854,
      "rewards/margins": 1.0846538543701172,
      "rewards/rejected": -2.5296995639801025,
      "step": 294
    },
    {
      "epoch": 0.08,
      "grad_norm": 32.4056396484375,
      "kl": 0.0,
      "learning_rate": 4.6139753991101804e-07,
      "logps/chosen": -232.12557983398438,
      "logps/rejected": -208.5637969970703,
      "loss": 0.4,
      "rewards/chosen": -0.6349735260009766,
      "rewards/margins": 1.985107421875,
      "rewards/rejected": -2.6200809478759766,
      "step": 295
    },
    {
      "epoch": 0.08,
      "grad_norm": 33.10838317871094,
      "kl": 0.0,
      "learning_rate": 4.6126668411410623e-07,
      "logps/chosen": -267.5959777832031,
      "logps/rejected": -214.76614379882812,
      "loss": 0.3235,
      "rewards/chosen": 0.1534360647201538,
      "rewards/margins": 2.617750644683838,
      "rewards/rejected": -2.4643144607543945,
      "step": 296
    },
    {
      "epoch": 0.08,
      "grad_norm": 38.25209426879883,
      "kl": 0.0,
      "learning_rate": 4.6113582831719443e-07,
      "logps/chosen": -195.14288330078125,
      "logps/rejected": -314.3270263671875,
      "loss": 0.4319,
      "rewards/chosen": -1.1439883708953857,
      "rewards/margins": 0.742764949798584,
      "rewards/rejected": -1.8867533206939697,
      "step": 297
    },
    {
      "epoch": 0.08,
      "grad_norm": 36.58635711669922,
      "kl": 0.0,
      "learning_rate": 4.610049725202826e-07,
      "logps/chosen": -229.46063232421875,
      "logps/rejected": -270.51690673828125,
      "loss": 0.4123,
      "rewards/chosen": -1.3162682056427002,
      "rewards/margins": 2.033482313156128,
      "rewards/rejected": -3.349750518798828,
      "step": 298
    },
    {
      "epoch": 0.08,
      "grad_norm": 42.640995025634766,
      "kl": 0.0,
      "learning_rate": 4.608741167233708e-07,
      "logps/chosen": -247.8602294921875,
      "logps/rejected": -213.57020568847656,
      "loss": 0.3484,
      "rewards/chosen": -0.9282722473144531,
      "rewards/margins": 2.3152124881744385,
      "rewards/rejected": -3.2434847354888916,
      "step": 299
    },
    {
      "epoch": 0.08,
      "grad_norm": 30.121410369873047,
      "kl": 0.0,
      "learning_rate": 4.60743260926459e-07,
      "logps/chosen": -322.234375,
      "logps/rejected": -201.81800842285156,
      "loss": 0.3658,
      "rewards/chosen": -4.029051303863525,
      "rewards/margins": -1.586775779724121,
      "rewards/rejected": -2.4422755241394043,
      "step": 300
    },
    {
      "epoch": 0.08,
      "grad_norm": 34.066890716552734,
      "kl": 0.0,
      "learning_rate": 4.6061240512954727e-07,
      "logps/chosen": -241.13233947753906,
      "logps/rejected": -256.83050537109375,
      "loss": 0.4248,
      "rewards/chosen": -0.6289612650871277,
      "rewards/margins": 2.3376858234405518,
      "rewards/rejected": -2.966647148132324,
      "step": 301
    },
    {
      "epoch": 0.08,
      "grad_norm": 40.31227493286133,
      "kl": 0.0,
      "learning_rate": 4.604815493326354e-07,
      "logps/chosen": -160.533447265625,
      "logps/rejected": -278.9567565917969,
      "loss": 0.2845,
      "rewards/chosen": -0.09239254146814346,
      "rewards/margins": 2.371143341064453,
      "rewards/rejected": -2.463535785675049,
      "step": 302
    },
    {
      "epoch": 0.08,
      "grad_norm": 35.84306716918945,
      "kl": 0.0,
      "learning_rate": 4.603506935357236e-07,
      "logps/chosen": -315.9750061035156,
      "logps/rejected": -267.2980041503906,
      "loss": 0.3654,
      "rewards/chosen": -2.6014981269836426,
      "rewards/margins": 0.05855679512023926,
      "rewards/rejected": -2.660054922103882,
      "step": 303
    },
    {
      "epoch": 0.08,
      "grad_norm": 33.047218322753906,
      "kl": 0.0,
      "learning_rate": 4.602198377388118e-07,
      "logps/chosen": -217.36009216308594,
      "logps/rejected": -280.88397216796875,
      "loss": 0.3229,
      "rewards/chosen": -0.9439008235931396,
      "rewards/margins": 1.590653896331787,
      "rewards/rejected": -2.5345547199249268,
      "step": 304
    },
    {
      "epoch": 0.08,
      "grad_norm": 28.75385093688965,
      "kl": 0.0,
      "learning_rate": 4.600889819419e-07,
      "logps/chosen": -239.74850463867188,
      "logps/rejected": -273.96075439453125,
      "loss": 0.4472,
      "rewards/chosen": -1.6880508661270142,
      "rewards/margins": 0.40789759159088135,
      "rewards/rejected": -2.0959484577178955,
      "step": 305
    },
    {
      "epoch": 0.08,
      "grad_norm": 32.19318389892578,
      "kl": 0.0,
      "learning_rate": 4.599581261449882e-07,
      "logps/chosen": -210.97512817382812,
      "logps/rejected": -190.31158447265625,
      "loss": 0.3105,
      "rewards/chosen": -0.14496548473834991,
      "rewards/margins": 3.4035897254943848,
      "rewards/rejected": -3.5485551357269287,
      "step": 306
    },
    {
      "epoch": 0.08,
      "grad_norm": 35.92880630493164,
      "kl": 0.0,
      "learning_rate": 4.598272703480764e-07,
      "logps/chosen": -287.72381591796875,
      "logps/rejected": -233.5065155029297,
      "loss": 0.3544,
      "rewards/chosen": -0.6701926589012146,
      "rewards/margins": 2.354236602783203,
      "rewards/rejected": -3.0244293212890625,
      "step": 307
    },
    {
      "epoch": 0.08,
      "grad_norm": 40.42152404785156,
      "kl": 0.0,
      "learning_rate": 4.596964145511646e-07,
      "logps/chosen": -209.75469970703125,
      "logps/rejected": -257.677978515625,
      "loss": 0.3458,
      "rewards/chosen": -1.3090839385986328,
      "rewards/margins": 0.7361466884613037,
      "rewards/rejected": -2.0452306270599365,
      "step": 308
    },
    {
      "epoch": 0.08,
      "grad_norm": 31.875043869018555,
      "kl": 0.0,
      "learning_rate": 4.595655587542528e-07,
      "logps/chosen": -283.4571228027344,
      "logps/rejected": -289.9070739746094,
      "loss": 0.2973,
      "rewards/chosen": -0.22630636394023895,
      "rewards/margins": 2.636915445327759,
      "rewards/rejected": -2.8632218837738037,
      "step": 309
    },
    {
      "epoch": 0.08,
      "grad_norm": 28.033388137817383,
      "kl": 0.0,
      "learning_rate": 4.59434702957341e-07,
      "logps/chosen": -218.15414428710938,
      "logps/rejected": -284.31768798828125,
      "loss": 0.3171,
      "rewards/chosen": -0.10873293876647949,
      "rewards/margins": 3.379168748855591,
      "rewards/rejected": -3.4879016876220703,
      "step": 310
    },
    {
      "epoch": 0.08,
      "grad_norm": 26.282222747802734,
      "kl": 0.0,
      "learning_rate": 4.593038471604292e-07,
      "logps/chosen": -297.50421142578125,
      "logps/rejected": -234.680908203125,
      "loss": 0.3721,
      "rewards/chosen": -1.7684048414230347,
      "rewards/margins": 2.316326141357422,
      "rewards/rejected": -4.084731101989746,
      "step": 311
    },
    {
      "epoch": 0.08,
      "grad_norm": 37.98878479003906,
      "kl": 0.0,
      "learning_rate": 4.5917299136351737e-07,
      "logps/chosen": -224.66392517089844,
      "logps/rejected": -205.10140991210938,
      "loss": 0.4488,
      "rewards/chosen": 0.4431179463863373,
      "rewards/margins": 4.447665214538574,
      "rewards/rejected": -4.004547119140625,
      "step": 312
    },
    {
      "epoch": 0.08,
      "grad_norm": 31.1519775390625,
      "kl": 0.0,
      "learning_rate": 4.590421355666056e-07,
      "logps/chosen": -162.36737060546875,
      "logps/rejected": -209.4821319580078,
      "loss": 0.328,
      "rewards/chosen": -1.0744552612304688,
      "rewards/margins": 0.31392955780029297,
      "rewards/rejected": -1.3883848190307617,
      "step": 313
    },
    {
      "epoch": 0.08,
      "grad_norm": 35.20518112182617,
      "kl": 0.0,
      "learning_rate": 4.589112797696938e-07,
      "logps/chosen": -234.55455017089844,
      "logps/rejected": -276.0071716308594,
      "loss": 0.2797,
      "rewards/chosen": -1.737378716468811,
      "rewards/margins": 0.6882723569869995,
      "rewards/rejected": -2.4256510734558105,
      "step": 314
    },
    {
      "epoch": 0.08,
      "grad_norm": 32.201969146728516,
      "kl": 0.0,
      "learning_rate": 4.58780423972782e-07,
      "logps/chosen": -250.39517211914062,
      "logps/rejected": -225.61228942871094,
      "loss": 0.3912,
      "rewards/chosen": -0.4257909655570984,
      "rewards/margins": 2.58058500289917,
      "rewards/rejected": -3.006376028060913,
      "step": 315
    },
    {
      "epoch": 0.08,
      "grad_norm": 34.16560363769531,
      "kl": 0.0,
      "learning_rate": 4.586495681758702e-07,
      "logps/chosen": -188.47059631347656,
      "logps/rejected": -191.3616943359375,
      "loss": 0.3849,
      "rewards/chosen": -1.0985850095748901,
      "rewards/margins": 0.9038892984390259,
      "rewards/rejected": -2.002474308013916,
      "step": 316
    },
    {
      "epoch": 0.08,
      "grad_norm": 29.20052146911621,
      "kl": 0.0,
      "learning_rate": 4.585187123789584e-07,
      "logps/chosen": -232.6088104248047,
      "logps/rejected": -240.9322509765625,
      "loss": 0.3795,
      "rewards/chosen": -1.5760377645492554,
      "rewards/margins": 1.283504605293274,
      "rewards/rejected": -2.8595423698425293,
      "step": 317
    },
    {
      "epoch": 0.08,
      "grad_norm": 30.147212982177734,
      "kl": 0.0,
      "learning_rate": 4.5838785658204655e-07,
      "logps/chosen": -205.91464233398438,
      "logps/rejected": -161.905029296875,
      "loss": 0.457,
      "rewards/chosen": -2.334472417831421,
      "rewards/margins": 0.6229822635650635,
      "rewards/rejected": -2.9574546813964844,
      "step": 318
    },
    {
      "epoch": 0.08,
      "grad_norm": 30.38490104675293,
      "kl": 0.0,
      "learning_rate": 4.5825700078513474e-07,
      "logps/chosen": -263.05523681640625,
      "logps/rejected": -242.82974243164062,
      "loss": 0.2415,
      "rewards/chosen": -0.4325067400932312,
      "rewards/margins": 2.2001450061798096,
      "rewards/rejected": -2.6326518058776855,
      "step": 319
    },
    {
      "epoch": 0.08,
      "grad_norm": 30.640369415283203,
      "kl": 0.0,
      "learning_rate": 4.5812614498822294e-07,
      "logps/chosen": -220.70742797851562,
      "logps/rejected": -234.42156982421875,
      "loss": 0.328,
      "rewards/chosen": -0.45821380615234375,
      "rewards/margins": 2.211210250854492,
      "rewards/rejected": -2.669424057006836,
      "step": 320
    },
    {
      "epoch": 0.08,
      "grad_norm": 33.323116302490234,
      "kl": 0.0,
      "learning_rate": 4.5799528919131113e-07,
      "logps/chosen": -186.67019653320312,
      "logps/rejected": -245.32626342773438,
      "loss": 0.3918,
      "rewards/chosen": -1.0099728107452393,
      "rewards/margins": 2.810298442840576,
      "rewards/rejected": -3.8202712535858154,
      "step": 321
    },
    {
      "epoch": 0.08,
      "grad_norm": 33.337738037109375,
      "kl": 0.0,
      "learning_rate": 4.5786443339439933e-07,
      "logps/chosen": -204.4862518310547,
      "logps/rejected": -188.58395385742188,
      "loss": 0.3712,
      "rewards/chosen": -0.8825057148933411,
      "rewards/margins": 2.2278504371643066,
      "rewards/rejected": -3.110356092453003,
      "step": 322
    },
    {
      "epoch": 0.08,
      "grad_norm": 28.831174850463867,
      "kl": 0.0,
      "learning_rate": 4.5773357759748753e-07,
      "logps/chosen": -147.2012481689453,
      "logps/rejected": -315.11572265625,
      "loss": 0.1919,
      "rewards/chosen": -1.441099762916565,
      "rewards/margins": 1.3434785604476929,
      "rewards/rejected": -2.784578323364258,
      "step": 323
    },
    {
      "epoch": 0.08,
      "grad_norm": 36.66320037841797,
      "kl": 0.0,
      "learning_rate": 4.576027218005757e-07,
      "logps/chosen": -251.1447296142578,
      "logps/rejected": -287.647705078125,
      "loss": 0.3665,
      "rewards/chosen": -0.43537652492523193,
      "rewards/margins": 3.960326671600342,
      "rewards/rejected": -4.395703315734863,
      "step": 324
    },
    {
      "epoch": 0.09,
      "grad_norm": 32.31867599487305,
      "kl": 0.0,
      "learning_rate": 4.574718660036639e-07,
      "logps/chosen": -271.9960021972656,
      "logps/rejected": -210.555908203125,
      "loss": 0.4788,
      "rewards/chosen": -2.4970388412475586,
      "rewards/margins": 0.04955768585205078,
      "rewards/rejected": -2.5465965270996094,
      "step": 325
    },
    {
      "epoch": 0.09,
      "grad_norm": 24.782032012939453,
      "kl": 0.0,
      "learning_rate": 4.5734101020675217e-07,
      "logps/chosen": -270.2687683105469,
      "logps/rejected": -261.0841369628906,
      "loss": 0.2655,
      "rewards/chosen": -0.7793827056884766,
      "rewards/margins": 4.47045373916626,
      "rewards/rejected": -5.249836444854736,
      "step": 326
    },
    {
      "epoch": 0.09,
      "grad_norm": 34.565975189208984,
      "kl": 0.0,
      "learning_rate": 4.5721015440984036e-07,
      "logps/chosen": -173.59933471679688,
      "logps/rejected": -167.2440185546875,
      "loss": 0.2976,
      "rewards/chosen": 0.5397694110870361,
      "rewards/margins": 2.3671255111694336,
      "rewards/rejected": -1.827355980873108,
      "step": 327
    },
    {
      "epoch": 0.09,
      "grad_norm": 28.94892692565918,
      "kl": 0.0,
      "learning_rate": 4.5707929861292856e-07,
      "logps/chosen": -195.4495849609375,
      "logps/rejected": -331.1904296875,
      "loss": 0.2857,
      "rewards/chosen": -1.0347931385040283,
      "rewards/margins": 0.9696147441864014,
      "rewards/rejected": -2.0044078826904297,
      "step": 328
    },
    {
      "epoch": 0.09,
      "grad_norm": 32.52096939086914,
      "kl": 0.0,
      "learning_rate": 4.5694844281601676e-07,
      "logps/chosen": -258.057373046875,
      "logps/rejected": -178.46829223632812,
      "loss": 0.3334,
      "rewards/chosen": -0.9900558590888977,
      "rewards/margins": 1.796595811843872,
      "rewards/rejected": -2.786651611328125,
      "step": 329
    },
    {
      "epoch": 0.09,
      "grad_norm": 27.296085357666016,
      "kl": 0.0,
      "learning_rate": 4.5681758701910495e-07,
      "logps/chosen": -184.41143798828125,
      "logps/rejected": -201.610107421875,
      "loss": 0.2267,
      "rewards/chosen": 0.0977037250995636,
      "rewards/margins": 4.80125617980957,
      "rewards/rejected": -4.70355224609375,
      "step": 330
    },
    {
      "epoch": 0.09,
      "grad_norm": 41.09619140625,
      "kl": 0.0,
      "learning_rate": 4.5668673122219315e-07,
      "logps/chosen": -250.09512329101562,
      "logps/rejected": -241.91793823242188,
      "loss": 0.235,
      "rewards/chosen": -0.5410808324813843,
      "rewards/margins": 3.400998115539551,
      "rewards/rejected": -3.9420788288116455,
      "step": 331
    },
    {
      "epoch": 0.09,
      "grad_norm": 42.17990493774414,
      "kl": 0.0,
      "learning_rate": 4.5655587542528134e-07,
      "logps/chosen": -156.83604431152344,
      "logps/rejected": -210.15084838867188,
      "loss": 0.3969,
      "rewards/chosen": -0.7945634126663208,
      "rewards/margins": 0.6295374631881714,
      "rewards/rejected": -1.4241008758544922,
      "step": 332
    },
    {
      "epoch": 0.09,
      "grad_norm": 27.20249366760254,
      "kl": 0.0,
      "learning_rate": 4.564250196283695e-07,
      "logps/chosen": -219.873291015625,
      "logps/rejected": -245.4140167236328,
      "loss": 0.417,
      "rewards/chosen": -1.2123730182647705,
      "rewards/margins": 1.475210428237915,
      "rewards/rejected": -2.6875834465026855,
      "step": 333
    },
    {
      "epoch": 0.09,
      "grad_norm": 36.41046142578125,
      "kl": 0.0,
      "learning_rate": 4.562941638314577e-07,
      "logps/chosen": -280.1678771972656,
      "logps/rejected": -204.20095825195312,
      "loss": 0.3973,
      "rewards/chosen": -0.5638540983200073,
      "rewards/margins": 1.0525778532028198,
      "rewards/rejected": -1.6164319515228271,
      "step": 334
    },
    {
      "epoch": 0.09,
      "grad_norm": 35.49837875366211,
      "kl": 0.0,
      "learning_rate": 4.561633080345459e-07,
      "logps/chosen": -146.32089233398438,
      "logps/rejected": -191.40664672851562,
      "loss": 0.3823,
      "rewards/chosen": -0.7487538456916809,
      "rewards/margins": 1.607696294784546,
      "rewards/rejected": -2.356450080871582,
      "step": 335
    },
    {
      "epoch": 0.09,
      "grad_norm": 30.863536834716797,
      "kl": 0.0,
      "learning_rate": 4.560324522376341e-07,
      "logps/chosen": -187.48995971679688,
      "logps/rejected": -209.26446533203125,
      "loss": 0.3971,
      "rewards/chosen": 0.22723525762557983,
      "rewards/margins": 2.7782607078552246,
      "rewards/rejected": -2.551025390625,
      "step": 336
    },
    {
      "epoch": 0.09,
      "grad_norm": 37.7666015625,
      "kl": 0.0,
      "learning_rate": 4.5590159644072227e-07,
      "logps/chosen": -248.80059814453125,
      "logps/rejected": -278.6592712402344,
      "loss": 0.4711,
      "rewards/chosen": -1.5165441036224365,
      "rewards/margins": 1.5899741649627686,
      "rewards/rejected": -3.106518268585205,
      "step": 337
    },
    {
      "epoch": 0.09,
      "grad_norm": 37.20514678955078,
      "kl": 0.0,
      "learning_rate": 4.5577074064381047e-07,
      "logps/chosen": -194.8486328125,
      "logps/rejected": -197.070068359375,
      "loss": 0.2708,
      "rewards/chosen": -0.3378767967224121,
      "rewards/margins": 4.038641452789307,
      "rewards/rejected": -4.376518249511719,
      "step": 338
    },
    {
      "epoch": 0.09,
      "grad_norm": 32.2724723815918,
      "kl": 0.0,
      "learning_rate": 4.556398848468987e-07,
      "logps/chosen": -183.97116088867188,
      "logps/rejected": -131.2269287109375,
      "loss": 0.4721,
      "rewards/chosen": -0.4851449131965637,
      "rewards/margins": 1.3216569423675537,
      "rewards/rejected": -1.8068017959594727,
      "step": 339
    },
    {
      "epoch": 0.09,
      "grad_norm": 41.788490295410156,
      "kl": 0.0,
      "learning_rate": 4.555090290499869e-07,
      "logps/chosen": -183.49282836914062,
      "logps/rejected": -221.72186279296875,
      "loss": 0.442,
      "rewards/chosen": -0.2105513960123062,
      "rewards/margins": 1.0875836610794067,
      "rewards/rejected": -1.2981350421905518,
      "step": 340
    },
    {
      "epoch": 0.09,
      "grad_norm": 45.68136978149414,
      "kl": 0.0,
      "learning_rate": 4.553781732530751e-07,
      "logps/chosen": -244.565185546875,
      "logps/rejected": -161.9296417236328,
      "loss": 0.3436,
      "rewards/chosen": -1.4009125232696533,
      "rewards/margins": 0.33695781230926514,
      "rewards/rejected": -1.7378703355789185,
      "step": 341
    },
    {
      "epoch": 0.09,
      "grad_norm": 32.6483039855957,
      "kl": 0.0,
      "learning_rate": 4.552473174561633e-07,
      "logps/chosen": -231.45835876464844,
      "logps/rejected": -231.79014587402344,
      "loss": 0.3056,
      "rewards/chosen": -0.8326898813247681,
      "rewards/margins": 2.3973751068115234,
      "rewards/rejected": -3.230065107345581,
      "step": 342
    },
    {
      "epoch": 0.09,
      "grad_norm": 35.41828536987305,
      "kl": 0.0,
      "learning_rate": 4.551164616592515e-07,
      "logps/chosen": -257.51177978515625,
      "logps/rejected": -246.73941040039062,
      "loss": 0.3756,
      "rewards/chosen": -0.0694907158613205,
      "rewards/margins": 2.133653402328491,
      "rewards/rejected": -2.203144073486328,
      "step": 343
    },
    {
      "epoch": 0.09,
      "grad_norm": 34.879451751708984,
      "kl": 0.0,
      "learning_rate": 4.549856058623397e-07,
      "logps/chosen": -234.83639526367188,
      "logps/rejected": -155.150146484375,
      "loss": 0.3385,
      "rewards/chosen": -0.7099578380584717,
      "rewards/margins": 1.7405083179473877,
      "rewards/rejected": -2.4504661560058594,
      "step": 344
    },
    {
      "epoch": 0.09,
      "grad_norm": 31.389076232910156,
      "kl": 0.0,
      "learning_rate": 4.548547500654279e-07,
      "logps/chosen": -258.70452880859375,
      "logps/rejected": -311.24713134765625,
      "loss": 0.4328,
      "rewards/chosen": -2.0910329818725586,
      "rewards/margins": 0.573084831237793,
      "rewards/rejected": -2.6641178131103516,
      "step": 345
    },
    {
      "epoch": 0.09,
      "grad_norm": 27.579402923583984,
      "kl": 0.0,
      "learning_rate": 4.547238942685161e-07,
      "logps/chosen": -117.16097259521484,
      "logps/rejected": -220.65586853027344,
      "loss": 0.4051,
      "rewards/chosen": -1.0259122848510742,
      "rewards/margins": 2.0292129516601562,
      "rewards/rejected": -3.0551252365112305,
      "step": 346
    },
    {
      "epoch": 0.09,
      "grad_norm": 27.12200164794922,
      "kl": 0.0,
      "learning_rate": 4.545930384716043e-07,
      "logps/chosen": -138.2091827392578,
      "logps/rejected": -207.37220764160156,
      "loss": 0.2588,
      "rewards/chosen": -0.047653257846832275,
      "rewards/margins": 2.87380313873291,
      "rewards/rejected": -2.9214563369750977,
      "step": 347
    },
    {
      "epoch": 0.09,
      "grad_norm": 39.55778121948242,
      "kl": 0.0,
      "learning_rate": 4.544621826746925e-07,
      "logps/chosen": -200.1582489013672,
      "logps/rejected": -231.9014892578125,
      "loss": 0.4636,
      "rewards/chosen": -0.7325445413589478,
      "rewards/margins": 1.465537667274475,
      "rewards/rejected": -2.198082208633423,
      "step": 348
    },
    {
      "epoch": 0.09,
      "grad_norm": 42.7136344909668,
      "kl": 0.0,
      "learning_rate": 4.543313268777806e-07,
      "logps/chosen": -276.27520751953125,
      "logps/rejected": -225.70761108398438,
      "loss": 0.4484,
      "rewards/chosen": -0.7546127438545227,
      "rewards/margins": 2.266116142272949,
      "rewards/rejected": -3.020728826522827,
      "step": 349
    },
    {
      "epoch": 0.09,
      "grad_norm": 29.12485694885254,
      "kl": 0.0,
      "learning_rate": 4.542004710808688e-07,
      "logps/chosen": -203.95103454589844,
      "logps/rejected": -192.92665100097656,
      "loss": 0.257,
      "rewards/chosen": -2.052464008331299,
      "rewards/margins": 0.40848374366760254,
      "rewards/rejected": -2.4609477519989014,
      "step": 350
    },
    {
      "epoch": 0.09,
      "grad_norm": 36.106475830078125,
      "kl": 0.0,
      "learning_rate": 4.54069615283957e-07,
      "logps/chosen": -229.77401733398438,
      "logps/rejected": -179.5250244140625,
      "loss": 0.3838,
      "rewards/chosen": -0.8526365756988525,
      "rewards/margins": 0.7249114513397217,
      "rewards/rejected": -1.5775480270385742,
      "step": 351
    },
    {
      "epoch": 0.09,
      "grad_norm": 28.871374130249023,
      "kl": 0.0,
      "learning_rate": 4.5393875948704527e-07,
      "logps/chosen": -155.2418975830078,
      "logps/rejected": -218.70916748046875,
      "loss": 0.3559,
      "rewards/chosen": -0.5292015075683594,
      "rewards/margins": 1.4907526969909668,
      "rewards/rejected": -2.019954204559326,
      "step": 352
    },
    {
      "epoch": 0.09,
      "grad_norm": 31.84480857849121,
      "kl": 0.0,
      "learning_rate": 4.5380790369013346e-07,
      "logps/chosen": -153.22557067871094,
      "logps/rejected": -258.18243408203125,
      "loss": 0.2501,
      "rewards/chosen": -1.4030628204345703,
      "rewards/margins": 1.9705705642700195,
      "rewards/rejected": -3.37363338470459,
      "step": 353
    },
    {
      "epoch": 0.09,
      "grad_norm": 31.34514617919922,
      "kl": 0.0,
      "learning_rate": 4.5367704789322166e-07,
      "logps/chosen": -146.6463165283203,
      "logps/rejected": -246.03086853027344,
      "loss": 0.3873,
      "rewards/chosen": 0.3020341992378235,
      "rewards/margins": 2.9109833240509033,
      "rewards/rejected": -2.6089491844177246,
      "step": 354
    },
    {
      "epoch": 0.09,
      "grad_norm": 33.196434020996094,
      "kl": 0.0,
      "learning_rate": 4.5354619209630985e-07,
      "logps/chosen": -186.69134521484375,
      "logps/rejected": -210.71072387695312,
      "loss": 0.3469,
      "rewards/chosen": 0.3410475254058838,
      "rewards/margins": 2.4019737243652344,
      "rewards/rejected": -2.0609261989593506,
      "step": 355
    },
    {
      "epoch": 0.09,
      "grad_norm": 27.802734375,
      "kl": 0.0,
      "learning_rate": 4.5341533629939805e-07,
      "logps/chosen": -199.94622802734375,
      "logps/rejected": -146.0865020751953,
      "loss": 0.3936,
      "rewards/chosen": -0.6471868753433228,
      "rewards/margins": 1.7640784978866577,
      "rewards/rejected": -2.4112653732299805,
      "step": 356
    },
    {
      "epoch": 0.09,
      "grad_norm": 38.54668045043945,
      "kl": 0.0,
      "learning_rate": 4.5328448050248625e-07,
      "logps/chosen": -223.62625122070312,
      "logps/rejected": -254.55821228027344,
      "loss": 0.3434,
      "rewards/chosen": -1.1160787343978882,
      "rewards/margins": 1.0182207822799683,
      "rewards/rejected": -2.1342995166778564,
      "step": 357
    },
    {
      "epoch": 0.09,
      "grad_norm": 28.735496520996094,
      "kl": 0.0,
      "learning_rate": 4.5315362470557444e-07,
      "logps/chosen": -211.4855194091797,
      "logps/rejected": -257.02008056640625,
      "loss": 0.2874,
      "rewards/chosen": -0.2564331591129303,
      "rewards/margins": 3.4900126457214355,
      "rewards/rejected": -3.746445894241333,
      "step": 358
    },
    {
      "epoch": 0.09,
      "grad_norm": 35.81536102294922,
      "kl": 0.0,
      "learning_rate": 4.5302276890866264e-07,
      "logps/chosen": -304.2633361816406,
      "logps/rejected": -209.7054901123047,
      "loss": 0.2967,
      "rewards/chosen": -0.46264463663101196,
      "rewards/margins": 3.166010856628418,
      "rewards/rejected": -3.628655433654785,
      "step": 359
    },
    {
      "epoch": 0.09,
      "grad_norm": 40.20705795288086,
      "kl": 0.0,
      "learning_rate": 4.5289191311175084e-07,
      "logps/chosen": -266.1392822265625,
      "logps/rejected": -251.1559600830078,
      "loss": 0.4054,
      "rewards/chosen": -0.13369740545749664,
      "rewards/margins": 1.7442787885665894,
      "rewards/rejected": -1.8779761791229248,
      "step": 360
    },
    {
      "epoch": 0.09,
      "grad_norm": 31.861595153808594,
      "kl": 0.0,
      "learning_rate": 4.5276105731483903e-07,
      "logps/chosen": -224.10833740234375,
      "logps/rejected": -223.89828491210938,
      "loss": 0.2936,
      "rewards/chosen": -0.20376244187355042,
      "rewards/margins": 2.0179710388183594,
      "rewards/rejected": -2.221733570098877,
      "step": 361
    },
    {
      "epoch": 0.09,
      "grad_norm": 34.194580078125,
      "kl": 0.0,
      "learning_rate": 4.5263020151792723e-07,
      "logps/chosen": -229.5283203125,
      "logps/rejected": -227.322998046875,
      "loss": 0.3522,
      "rewards/chosen": 0.7541962265968323,
      "rewards/margins": 2.9511911869049072,
      "rewards/rejected": -2.1969950199127197,
      "step": 362
    },
    {
      "epoch": 0.1,
      "grad_norm": 33.832305908203125,
      "kl": 0.0,
      "learning_rate": 4.524993457210154e-07,
      "logps/chosen": -225.90982055664062,
      "logps/rejected": -228.3668975830078,
      "loss": 0.3117,
      "rewards/chosen": -0.7936959266662598,
      "rewards/margins": 1.0352134704589844,
      "rewards/rejected": -1.8289093971252441,
      "step": 363
    },
    {
      "epoch": 0.1,
      "grad_norm": 29.428434371948242,
      "kl": 0.0,
      "learning_rate": 4.5236848992410357e-07,
      "logps/chosen": -185.18519592285156,
      "logps/rejected": -264.0128173828125,
      "loss": 0.2548,
      "rewards/chosen": -1.0546776056289673,
      "rewards/margins": 2.7360973358154297,
      "rewards/rejected": -3.7907750606536865,
      "step": 364
    },
    {
      "epoch": 0.1,
      "grad_norm": 32.661277770996094,
      "kl": 0.0,
      "learning_rate": 4.522376341271918e-07,
      "logps/chosen": -208.7366180419922,
      "logps/rejected": -296.23974609375,
      "loss": 0.3074,
      "rewards/chosen": 0.3005746304988861,
      "rewards/margins": 2.755589246749878,
      "rewards/rejected": -2.455014705657959,
      "step": 365
    },
    {
      "epoch": 0.1,
      "grad_norm": 30.65373420715332,
      "kl": 0.0,
      "learning_rate": 4.5210677833028e-07,
      "logps/chosen": -259.0675354003906,
      "logps/rejected": -204.466064453125,
      "loss": 0.2904,
      "rewards/chosen": -1.7415310144424438,
      "rewards/margins": 0.5091778039932251,
      "rewards/rejected": -2.250708818435669,
      "step": 366
    },
    {
      "epoch": 0.1,
      "grad_norm": 35.81827926635742,
      "kl": 0.0,
      "learning_rate": 4.519759225333682e-07,
      "logps/chosen": -244.27406311035156,
      "logps/rejected": -152.07162475585938,
      "loss": 0.3159,
      "rewards/chosen": -1.1547815799713135,
      "rewards/margins": 1.677286148071289,
      "rewards/rejected": -2.8320677280426025,
      "step": 367
    },
    {
      "epoch": 0.1,
      "grad_norm": 40.60730743408203,
      "kl": 0.0,
      "learning_rate": 4.518450667364564e-07,
      "logps/chosen": -300.86279296875,
      "logps/rejected": -291.1861572265625,
      "loss": 0.4233,
      "rewards/chosen": 0.04416877031326294,
      "rewards/margins": 3.0914156436920166,
      "rewards/rejected": -3.0472469329833984,
      "step": 368
    },
    {
      "epoch": 0.1,
      "grad_norm": 42.675716400146484,
      "kl": 0.0,
      "learning_rate": 4.517142109395446e-07,
      "logps/chosen": -193.25894165039062,
      "logps/rejected": -163.64535522460938,
      "loss": 0.4682,
      "rewards/chosen": -1.3016144037246704,
      "rewards/margins": 0.8137959241867065,
      "rewards/rejected": -2.115410327911377,
      "step": 369
    },
    {
      "epoch": 0.1,
      "grad_norm": 33.09608840942383,
      "kl": 0.0,
      "learning_rate": 4.515833551426328e-07,
      "logps/chosen": -160.6686553955078,
      "logps/rejected": -247.52890014648438,
      "loss": 0.3394,
      "rewards/chosen": -0.44542351365089417,
      "rewards/margins": 2.8359451293945312,
      "rewards/rejected": -3.2813687324523926,
      "step": 370
    },
    {
      "epoch": 0.1,
      "grad_norm": 49.479881286621094,
      "kl": 0.0,
      "learning_rate": 4.51452499345721e-07,
      "logps/chosen": -273.73419189453125,
      "logps/rejected": -243.27877807617188,
      "loss": 0.4571,
      "rewards/chosen": -1.3881046772003174,
      "rewards/margins": -0.02787947654724121,
      "rewards/rejected": -1.3602252006530762,
      "step": 371
    },
    {
      "epoch": 0.1,
      "grad_norm": 41.906150817871094,
      "kl": 0.0,
      "learning_rate": 4.513216435488092e-07,
      "logps/chosen": -220.07510375976562,
      "logps/rejected": -236.5010528564453,
      "loss": 0.331,
      "rewards/chosen": -0.32473501563072205,
      "rewards/margins": 2.5255327224731445,
      "rewards/rejected": -2.8502676486968994,
      "step": 372
    },
    {
      "epoch": 0.1,
      "grad_norm": 42.59035110473633,
      "kl": 0.0,
      "learning_rate": 4.511907877518974e-07,
      "logps/chosen": -282.63482666015625,
      "logps/rejected": -258.4151306152344,
      "loss": 0.2893,
      "rewards/chosen": -0.18127818405628204,
      "rewards/margins": 1.0610069036483765,
      "rewards/rejected": -1.242285132408142,
      "step": 373
    },
    {
      "epoch": 0.1,
      "grad_norm": 37.68351364135742,
      "kl": 0.0,
      "learning_rate": 4.510599319549856e-07,
      "logps/chosen": -210.8810272216797,
      "logps/rejected": -224.22665405273438,
      "loss": 0.3508,
      "rewards/chosen": -1.3154093027114868,
      "rewards/margins": 2.27657413482666,
      "rewards/rejected": -3.5919833183288574,
      "step": 374
    },
    {
      "epoch": 0.1,
      "grad_norm": 31.09569549560547,
      "kl": 0.0,
      "learning_rate": 4.509290761580738e-07,
      "logps/chosen": -195.7852783203125,
      "logps/rejected": -257.9025573730469,
      "loss": 0.2977,
      "rewards/chosen": -0.7623598575592041,
      "rewards/margins": 3.2512733936309814,
      "rewards/rejected": -4.0136332511901855,
      "step": 375
    },
    {
      "epoch": 0.1,
      "grad_norm": 28.333711624145508,
      "kl": 0.0,
      "learning_rate": 4.5079822036116197e-07,
      "logps/chosen": -151.38967895507812,
      "logps/rejected": -297.0498046875,
      "loss": 0.3272,
      "rewards/chosen": -1.5544800758361816,
      "rewards/margins": 0.2516918182373047,
      "rewards/rejected": -1.8061718940734863,
      "step": 376
    },
    {
      "epoch": 0.1,
      "grad_norm": 36.108795166015625,
      "kl": 0.0,
      "learning_rate": 4.506673645642502e-07,
      "logps/chosen": -227.50051879882812,
      "logps/rejected": -211.9762420654297,
      "loss": 0.4023,
      "rewards/chosen": -0.5598442554473877,
      "rewards/margins": 2.065431833267212,
      "rewards/rejected": -2.6252760887145996,
      "step": 377
    },
    {
      "epoch": 0.1,
      "grad_norm": 33.0745849609375,
      "kl": 0.0,
      "learning_rate": 4.505365087673384e-07,
      "logps/chosen": -213.019287109375,
      "logps/rejected": -207.67184448242188,
      "loss": 0.3544,
      "rewards/chosen": -1.3546139001846313,
      "rewards/margins": 0.8756808042526245,
      "rewards/rejected": -2.230294704437256,
      "step": 378
    },
    {
      "epoch": 0.1,
      "grad_norm": 34.654964447021484,
      "kl": 0.0,
      "learning_rate": 4.504056529704266e-07,
      "logps/chosen": -182.68475341796875,
      "logps/rejected": -255.54612731933594,
      "loss": 0.3742,
      "rewards/chosen": -1.2387803792953491,
      "rewards/margins": 1.8842393159866333,
      "rewards/rejected": -3.1230196952819824,
      "step": 379
    },
    {
      "epoch": 0.1,
      "grad_norm": 32.7558479309082,
      "kl": 0.0,
      "learning_rate": 4.5027479717351476e-07,
      "logps/chosen": -192.14639282226562,
      "logps/rejected": -229.38015747070312,
      "loss": 0.3754,
      "rewards/chosen": -0.07900139689445496,
      "rewards/margins": 2.6180408000946045,
      "rewards/rejected": -2.697042226791382,
      "step": 380
    },
    {
      "epoch": 0.1,
      "grad_norm": 36.6190071105957,
      "kl": 0.0,
      "learning_rate": 4.5014394137660295e-07,
      "logps/chosen": -268.7249755859375,
      "logps/rejected": -251.80455017089844,
      "loss": 0.4801,
      "rewards/chosen": -1.5141268968582153,
      "rewards/margins": 1.2382701635360718,
      "rewards/rejected": -2.752397060394287,
      "step": 381
    },
    {
      "epoch": 0.1,
      "grad_norm": 40.666839599609375,
      "kl": 0.0,
      "learning_rate": 4.5001308557969115e-07,
      "logps/chosen": -243.43234252929688,
      "logps/rejected": -258.41448974609375,
      "loss": 0.3511,
      "rewards/chosen": -0.3709592819213867,
      "rewards/margins": 1.2282294034957886,
      "rewards/rejected": -1.5991886854171753,
      "step": 382
    },
    {
      "epoch": 0.1,
      "grad_norm": 28.4544620513916,
      "kl": 0.0,
      "learning_rate": 4.4988222978277935e-07,
      "logps/chosen": -198.5283660888672,
      "logps/rejected": -217.16868591308594,
      "loss": 0.3838,
      "rewards/chosen": -2.058994770050049,
      "rewards/margins": 0.5409789085388184,
      "rewards/rejected": -2.599973678588867,
      "step": 383
    },
    {
      "epoch": 0.1,
      "grad_norm": 39.92369079589844,
      "kl": 0.0,
      "learning_rate": 4.4975137398586754e-07,
      "logps/chosen": -201.83322143554688,
      "logps/rejected": -245.54714965820312,
      "loss": 0.3817,
      "rewards/chosen": -0.4379090666770935,
      "rewards/margins": 2.6852946281433105,
      "rewards/rejected": -3.123203754425049,
      "step": 384
    },
    {
      "epoch": 0.1,
      "grad_norm": 42.2255744934082,
      "kl": 0.0,
      "learning_rate": 4.4962051818895574e-07,
      "logps/chosen": -183.98609924316406,
      "logps/rejected": -319.1829833984375,
      "loss": 0.4237,
      "rewards/chosen": -0.3203003704547882,
      "rewards/margins": 2.765152931213379,
      "rewards/rejected": -3.0854532718658447,
      "step": 385
    },
    {
      "epoch": 0.1,
      "grad_norm": 29.236879348754883,
      "kl": 0.0,
      "learning_rate": 4.4948966239204393e-07,
      "logps/chosen": -93.41141510009766,
      "logps/rejected": -202.96383666992188,
      "loss": 0.327,
      "rewards/chosen": -0.3800518810749054,
      "rewards/margins": 2.651289701461792,
      "rewards/rejected": -3.031341552734375,
      "step": 386
    },
    {
      "epoch": 0.1,
      "grad_norm": 37.4599609375,
      "kl": 0.0,
      "learning_rate": 4.4935880659513213e-07,
      "logps/chosen": -271.9604797363281,
      "logps/rejected": -222.61170959472656,
      "loss": 0.4275,
      "rewards/chosen": 0.17430379986763,
      "rewards/margins": 2.8483176231384277,
      "rewards/rejected": -2.67401385307312,
      "step": 387
    },
    {
      "epoch": 0.1,
      "grad_norm": 32.062278747558594,
      "kl": 0.0,
      "learning_rate": 4.492279507982203e-07,
      "logps/chosen": -190.10693359375,
      "logps/rejected": -176.67543029785156,
      "loss": 0.2995,
      "rewards/chosen": -0.9995455145835876,
      "rewards/margins": 1.7313144207000732,
      "rewards/rejected": -2.7308599948883057,
      "step": 388
    },
    {
      "epoch": 0.1,
      "grad_norm": 38.55339813232422,
      "kl": 0.0,
      "learning_rate": 4.490970950013085e-07,
      "logps/chosen": -220.9341583251953,
      "logps/rejected": -299.83245849609375,
      "loss": 0.3378,
      "rewards/chosen": -0.39133620262145996,
      "rewards/margins": 2.5305755138397217,
      "rewards/rejected": -2.9219117164611816,
      "step": 389
    },
    {
      "epoch": 0.1,
      "grad_norm": 39.30725860595703,
      "kl": 0.0,
      "learning_rate": 4.4896623920439677e-07,
      "logps/chosen": -182.44326782226562,
      "logps/rejected": -174.75648498535156,
      "loss": 0.3955,
      "rewards/chosen": -0.47413867712020874,
      "rewards/margins": 1.2516155242919922,
      "rewards/rejected": -1.7257542610168457,
      "step": 390
    },
    {
      "epoch": 0.1,
      "grad_norm": 33.8344612121582,
      "kl": 0.0,
      "learning_rate": 4.4883538340748497e-07,
      "logps/chosen": -236.1179656982422,
      "logps/rejected": -250.09539794921875,
      "loss": 0.4352,
      "rewards/chosen": -1.4198930263519287,
      "rewards/margins": 1.684675931930542,
      "rewards/rejected": -3.1045689582824707,
      "step": 391
    },
    {
      "epoch": 0.1,
      "grad_norm": 30.08695411682129,
      "kl": 0.0,
      "learning_rate": 4.4870452761057316e-07,
      "logps/chosen": -186.74351501464844,
      "logps/rejected": -253.93115234375,
      "loss": 0.2261,
      "rewards/chosen": -0.4091317355632782,
      "rewards/margins": 3.6403825283050537,
      "rewards/rejected": -4.049514293670654,
      "step": 392
    },
    {
      "epoch": 0.1,
      "grad_norm": 34.06538772583008,
      "kl": 0.0,
      "learning_rate": 4.4857367181366136e-07,
      "logps/chosen": -193.60354614257812,
      "logps/rejected": -134.43006896972656,
      "loss": 0.3712,
      "rewards/chosen": -0.271693617105484,
      "rewards/margins": 1.1191461086273193,
      "rewards/rejected": -1.390839695930481,
      "step": 393
    },
    {
      "epoch": 0.1,
      "grad_norm": 34.09449768066406,
      "kl": 0.0,
      "learning_rate": 4.4844281601674956e-07,
      "logps/chosen": -218.30441284179688,
      "logps/rejected": -234.31195068359375,
      "loss": 0.2001,
      "rewards/chosen": 0.4233044981956482,
      "rewards/margins": 5.021885871887207,
      "rewards/rejected": -4.598581314086914,
      "step": 394
    },
    {
      "epoch": 0.1,
      "grad_norm": 42.07844924926758,
      "kl": 0.0,
      "learning_rate": 4.483119602198377e-07,
      "logps/chosen": -231.95108032226562,
      "logps/rejected": -268.6423645019531,
      "loss": 0.4383,
      "rewards/chosen": -0.824382483959198,
      "rewards/margins": 2.927910089492798,
      "rewards/rejected": -3.7522926330566406,
      "step": 395
    },
    {
      "epoch": 0.1,
      "grad_norm": 38.50265121459961,
      "kl": 0.0,
      "learning_rate": 4.481811044229259e-07,
      "logps/chosen": -270.509033203125,
      "logps/rejected": -226.2156219482422,
      "loss": 0.2796,
      "rewards/chosen": -0.18926948308944702,
      "rewards/margins": 1.739548921585083,
      "rewards/rejected": -1.9288183450698853,
      "step": 396
    },
    {
      "epoch": 0.1,
      "grad_norm": 31.2486515045166,
      "kl": 0.0,
      "learning_rate": 4.480502486260141e-07,
      "logps/chosen": -109.7884521484375,
      "logps/rejected": -203.29953002929688,
      "loss": 0.4896,
      "rewards/chosen": -0.8157114386558533,
      "rewards/margins": 0.568125307559967,
      "rewards/rejected": -1.3838367462158203,
      "step": 397
    },
    {
      "epoch": 0.1,
      "grad_norm": 31.697792053222656,
      "kl": 0.0,
      "learning_rate": 4.479193928291023e-07,
      "logps/chosen": -174.076171875,
      "logps/rejected": -253.45223999023438,
      "loss": 0.3,
      "rewards/chosen": -0.8394641876220703,
      "rewards/margins": 2.1134212017059326,
      "rewards/rejected": -2.952885389328003,
      "step": 398
    },
    {
      "epoch": 0.1,
      "grad_norm": 28.867341995239258,
      "kl": 0.0,
      "learning_rate": 4.477885370321905e-07,
      "logps/chosen": -168.5074005126953,
      "logps/rejected": -125.79701232910156,
      "loss": 0.3656,
      "rewards/chosen": -0.439168244600296,
      "rewards/margins": 0.6221072673797607,
      "rewards/rejected": -1.0612754821777344,
      "step": 399
    },
    {
      "epoch": 0.1,
      "grad_norm": 35.56822967529297,
      "kl": 0.0,
      "learning_rate": 4.476576812352787e-07,
      "logps/chosen": -166.24661254882812,
      "logps/rejected": -263.20733642578125,
      "loss": 0.3092,
      "rewards/chosen": -0.6019458770751953,
      "rewards/margins": 2.142655611038208,
      "rewards/rejected": -2.7446014881134033,
      "step": 400
    },
    {
      "epoch": 0.1,
      "grad_norm": 34.35905838012695,
      "kl": 0.0,
      "learning_rate": 4.475268254383669e-07,
      "logps/chosen": -185.26710510253906,
      "logps/rejected": -250.02255249023438,
      "loss": 0.2589,
      "rewards/chosen": 0.7835355401039124,
      "rewards/margins": 4.608429431915283,
      "rewards/rejected": -3.8248939514160156,
      "step": 401
    },
    {
      "epoch": 0.11,
      "grad_norm": 31.946916580200195,
      "kl": 0.0,
      "learning_rate": 4.4739596964145507e-07,
      "logps/chosen": -147.8795623779297,
      "logps/rejected": -273.70904541015625,
      "loss": 0.4864,
      "rewards/chosen": -0.9677847027778625,
      "rewards/margins": 1.813450813293457,
      "rewards/rejected": -2.781235456466675,
      "step": 402
    },
    {
      "epoch": 0.11,
      "grad_norm": 34.859928131103516,
      "kl": 0.0,
      "learning_rate": 4.472651138445433e-07,
      "logps/chosen": -235.14657592773438,
      "logps/rejected": -225.0830078125,
      "loss": 0.3405,
      "rewards/chosen": -1.2147836685180664,
      "rewards/margins": 1.1082990169525146,
      "rewards/rejected": -2.323082685470581,
      "step": 403
    },
    {
      "epoch": 0.11,
      "grad_norm": 34.983360290527344,
      "kl": 0.0,
      "learning_rate": 4.471342580476315e-07,
      "logps/chosen": -163.03114318847656,
      "logps/rejected": -290.476318359375,
      "loss": 0.2981,
      "rewards/chosen": -1.457593321800232,
      "rewards/margins": 2.553722381591797,
      "rewards/rejected": -4.011315822601318,
      "step": 404
    },
    {
      "epoch": 0.11,
      "grad_norm": 33.62318801879883,
      "kl": 0.0,
      "learning_rate": 4.470034022507197e-07,
      "logps/chosen": -233.85498046875,
      "logps/rejected": -183.947509765625,
      "loss": 0.4204,
      "rewards/chosen": -0.9742795825004578,
      "rewards/margins": 2.231663227081299,
      "rewards/rejected": -3.2059428691864014,
      "step": 405
    },
    {
      "epoch": 0.11,
      "grad_norm": 45.01471710205078,
      "kl": 0.0,
      "learning_rate": 4.468725464538079e-07,
      "logps/chosen": -193.53396606445312,
      "logps/rejected": -337.14697265625,
      "loss": 0.4105,
      "rewards/chosen": -0.45279431343078613,
      "rewards/margins": 1.2500965595245361,
      "rewards/rejected": -1.7028908729553223,
      "step": 406
    },
    {
      "epoch": 0.11,
      "grad_norm": 28.890682220458984,
      "kl": 0.0,
      "learning_rate": 4.467416906568961e-07,
      "logps/chosen": -208.4068145751953,
      "logps/rejected": -235.36312866210938,
      "loss": 0.3001,
      "rewards/chosen": 0.7755467295646667,
      "rewards/margins": 3.6093943119049072,
      "rewards/rejected": -2.8338475227355957,
      "step": 407
    },
    {
      "epoch": 0.11,
      "grad_norm": 31.09847068786621,
      "kl": 0.0,
      "learning_rate": 4.466108348599843e-07,
      "logps/chosen": -238.82534790039062,
      "logps/rejected": -218.0955352783203,
      "loss": 0.321,
      "rewards/chosen": 0.5067283511161804,
      "rewards/margins": 3.1289937496185303,
      "rewards/rejected": -2.622265338897705,
      "step": 408
    },
    {
      "epoch": 0.11,
      "grad_norm": 30.704862594604492,
      "kl": 0.0,
      "learning_rate": 4.464799790630725e-07,
      "logps/chosen": -318.6136169433594,
      "logps/rejected": -350.36749267578125,
      "loss": 0.3123,
      "rewards/chosen": -1.7736846208572388,
      "rewards/margins": 1.854911208152771,
      "rewards/rejected": -3.6285958290100098,
      "step": 409
    },
    {
      "epoch": 0.11,
      "grad_norm": 38.331790924072266,
      "kl": 0.0,
      "learning_rate": 4.4634912326616064e-07,
      "logps/chosen": -206.4320068359375,
      "logps/rejected": -287.9844970703125,
      "loss": 0.3798,
      "rewards/chosen": -0.3898891806602478,
      "rewards/margins": 3.1153624057769775,
      "rewards/rejected": -3.50525164604187,
      "step": 410
    },
    {
      "epoch": 0.11,
      "grad_norm": 25.918659210205078,
      "kl": 0.0,
      "learning_rate": 4.4621826746924884e-07,
      "logps/chosen": -194.8382568359375,
      "logps/rejected": -218.07589721679688,
      "loss": 0.2852,
      "rewards/chosen": -0.7628296613693237,
      "rewards/margins": 3.037680149078369,
      "rewards/rejected": -3.8005099296569824,
      "step": 411
    },
    {
      "epoch": 0.11,
      "grad_norm": 39.002342224121094,
      "kl": 0.0,
      "learning_rate": 4.4608741167233703e-07,
      "logps/chosen": -172.56983947753906,
      "logps/rejected": -279.04034423828125,
      "loss": 0.4511,
      "rewards/chosen": -0.5036152601242065,
      "rewards/margins": 1.6097949743270874,
      "rewards/rejected": -2.113410234451294,
      "step": 412
    },
    {
      "epoch": 0.11,
      "grad_norm": 26.85070037841797,
      "kl": 0.0,
      "learning_rate": 4.4595655587542523e-07,
      "logps/chosen": -221.30715942382812,
      "logps/rejected": -273.9478759765625,
      "loss": 0.3823,
      "rewards/chosen": 0.18906378746032715,
      "rewards/margins": 3.575185537338257,
      "rewards/rejected": -3.3861217498779297,
      "step": 413
    },
    {
      "epoch": 0.11,
      "grad_norm": 32.324066162109375,
      "kl": 0.0,
      "learning_rate": 4.458257000785134e-07,
      "logps/chosen": -226.74598693847656,
      "logps/rejected": -250.83737182617188,
      "loss": 0.2715,
      "rewards/chosen": -0.3493311405181885,
      "rewards/margins": 3.514986753463745,
      "rewards/rejected": -3.8643178939819336,
      "step": 414
    },
    {
      "epoch": 0.11,
      "grad_norm": 33.729129791259766,
      "kl": 0.0,
      "learning_rate": 4.456948442816016e-07,
      "logps/chosen": -257.4822998046875,
      "logps/rejected": -214.71878051757812,
      "loss": 0.3478,
      "rewards/chosen": -1.7488336563110352,
      "rewards/margins": 2.3019628524780273,
      "rewards/rejected": -4.0507965087890625,
      "step": 415
    },
    {
      "epoch": 0.11,
      "grad_norm": 26.120983123779297,
      "kl": 0.0,
      "learning_rate": 4.4556398848468987e-07,
      "logps/chosen": -184.21572875976562,
      "logps/rejected": -255.57809448242188,
      "loss": 0.321,
      "rewards/chosen": -1.7084475755691528,
      "rewards/margins": 1.36984121799469,
      "rewards/rejected": -3.0782887935638428,
      "step": 416
    },
    {
      "epoch": 0.11,
      "grad_norm": 29.163272857666016,
      "kl": 0.0,
      "learning_rate": 4.4543313268777807e-07,
      "logps/chosen": -147.19149780273438,
      "logps/rejected": -168.89613342285156,
      "loss": 0.3638,
      "rewards/chosen": -0.3925864100456238,
      "rewards/margins": 1.8936991691589355,
      "rewards/rejected": -2.286285638809204,
      "step": 417
    },
    {
      "epoch": 0.11,
      "grad_norm": 33.93145751953125,
      "kl": 0.0,
      "learning_rate": 4.4530227689086626e-07,
      "logps/chosen": -214.961181640625,
      "logps/rejected": -205.99252319335938,
      "loss": 0.3564,
      "rewards/chosen": -0.7181621193885803,
      "rewards/margins": 1.9614551067352295,
      "rewards/rejected": -2.679617166519165,
      "step": 418
    },
    {
      "epoch": 0.11,
      "grad_norm": 30.19272232055664,
      "kl": 0.0,
      "learning_rate": 4.4517142109395446e-07,
      "logps/chosen": -200.70773315429688,
      "logps/rejected": -210.8472442626953,
      "loss": 0.3503,
      "rewards/chosen": -1.6026099920272827,
      "rewards/margins": 0.9804550409317017,
      "rewards/rejected": -2.5830650329589844,
      "step": 419
    },
    {
      "epoch": 0.11,
      "grad_norm": 30.529420852661133,
      "kl": 0.0,
      "learning_rate": 4.4504056529704265e-07,
      "logps/chosen": -209.10699462890625,
      "logps/rejected": -269.783447265625,
      "loss": 0.2915,
      "rewards/chosen": -1.2561055421829224,
      "rewards/margins": 2.0115232467651367,
      "rewards/rejected": -3.2676289081573486,
      "step": 420
    },
    {
      "epoch": 0.11,
      "grad_norm": 29.09016990661621,
      "kl": 0.0,
      "learning_rate": 4.4490970950013085e-07,
      "logps/chosen": -326.3652038574219,
      "logps/rejected": -254.44314575195312,
      "loss": 0.3048,
      "rewards/chosen": -1.2969313859939575,
      "rewards/margins": 2.0072875022888184,
      "rewards/rejected": -3.3042187690734863,
      "step": 421
    },
    {
      "epoch": 0.11,
      "grad_norm": 26.089576721191406,
      "kl": 0.0,
      "learning_rate": 4.4477885370321905e-07,
      "logps/chosen": -287.49560546875,
      "logps/rejected": -273.09234619140625,
      "loss": 0.2429,
      "rewards/chosen": -0.9192136526107788,
      "rewards/margins": 3.4407401084899902,
      "rewards/rejected": -4.359953880310059,
      "step": 422
    },
    {
      "epoch": 0.11,
      "grad_norm": 24.35614013671875,
      "kl": 0.0,
      "learning_rate": 4.4464799790630724e-07,
      "logps/chosen": -302.5399475097656,
      "logps/rejected": -250.1197967529297,
      "loss": 0.1487,
      "rewards/chosen": 0.8381339907646179,
      "rewards/margins": 4.958958625793457,
      "rewards/rejected": -4.120824813842773,
      "step": 423
    },
    {
      "epoch": 0.11,
      "grad_norm": 43.92964553833008,
      "kl": 0.0,
      "learning_rate": 4.4451714210939544e-07,
      "logps/chosen": -217.13169860839844,
      "logps/rejected": -206.9733428955078,
      "loss": 0.374,
      "rewards/chosen": -1.333292007446289,
      "rewards/margins": 4.744856834411621,
      "rewards/rejected": -6.07814884185791,
      "step": 424
    },
    {
      "epoch": 0.11,
      "grad_norm": 41.32809829711914,
      "kl": 0.0,
      "learning_rate": 4.4438628631248363e-07,
      "logps/chosen": -259.3089904785156,
      "logps/rejected": -276.80120849609375,
      "loss": 0.2415,
      "rewards/chosen": -0.16916459798812866,
      "rewards/margins": 3.5735318660736084,
      "rewards/rejected": -3.742696523666382,
      "step": 425
    },
    {
      "epoch": 0.11,
      "grad_norm": 39.2205810546875,
      "kl": 0.0,
      "learning_rate": 4.442554305155718e-07,
      "logps/chosen": -261.5947265625,
      "logps/rejected": -271.5604248046875,
      "loss": 0.426,
      "rewards/chosen": -0.7233175039291382,
      "rewards/margins": 3.2420592308044434,
      "rewards/rejected": -3.965376615524292,
      "step": 426
    },
    {
      "epoch": 0.11,
      "grad_norm": 41.977691650390625,
      "kl": 0.0,
      "learning_rate": 4.4412457471866e-07,
      "logps/chosen": -246.94467163085938,
      "logps/rejected": -245.75967407226562,
      "loss": 0.3938,
      "rewards/chosen": -1.275907039642334,
      "rewards/margins": 2.263812780380249,
      "rewards/rejected": -3.539719820022583,
      "step": 427
    },
    {
      "epoch": 0.11,
      "grad_norm": 29.9937744140625,
      "kl": 0.0,
      "learning_rate": 4.4399371892174817e-07,
      "logps/chosen": -205.16627502441406,
      "logps/rejected": -245.968505859375,
      "loss": 0.3472,
      "rewards/chosen": -1.6494184732437134,
      "rewards/margins": 3.15864896774292,
      "rewards/rejected": -4.808067321777344,
      "step": 428
    },
    {
      "epoch": 0.11,
      "grad_norm": 35.75617980957031,
      "kl": 0.0,
      "learning_rate": 4.438628631248364e-07,
      "logps/chosen": -187.45712280273438,
      "logps/rejected": -228.374755859375,
      "loss": 0.3824,
      "rewards/chosen": -0.853541374206543,
      "rewards/margins": 1.3908743858337402,
      "rewards/rejected": -2.244415760040283,
      "step": 429
    },
    {
      "epoch": 0.11,
      "grad_norm": 30.78190040588379,
      "kl": 0.0,
      "learning_rate": 4.437320073279246e-07,
      "logps/chosen": -180.026123046875,
      "logps/rejected": -260.7657165527344,
      "loss": 0.4076,
      "rewards/chosen": -0.43349024653434753,
      "rewards/margins": 2.4749932289123535,
      "rewards/rejected": -2.9084835052490234,
      "step": 430
    },
    {
      "epoch": 0.11,
      "grad_norm": 39.93263626098633,
      "kl": 0.0,
      "learning_rate": 4.436011515310128e-07,
      "logps/chosen": -221.72950744628906,
      "logps/rejected": -139.82058715820312,
      "loss": 0.3167,
      "rewards/chosen": 0.9334081411361694,
      "rewards/margins": 2.3411035537719727,
      "rewards/rejected": -1.4076952934265137,
      "step": 431
    },
    {
      "epoch": 0.11,
      "grad_norm": 36.58866500854492,
      "kl": 0.0,
      "learning_rate": 4.43470295734101e-07,
      "logps/chosen": -187.53683471679688,
      "logps/rejected": -194.1912841796875,
      "loss": 0.4121,
      "rewards/chosen": -0.8121582269668579,
      "rewards/margins": 1.4370888471603394,
      "rewards/rejected": -2.2492470741271973,
      "step": 432
    },
    {
      "epoch": 0.11,
      "grad_norm": 37.1215705871582,
      "kl": 0.0,
      "learning_rate": 4.433394399371892e-07,
      "logps/chosen": -194.21035766601562,
      "logps/rejected": -261.7844543457031,
      "loss": 0.2948,
      "rewards/chosen": -0.80370032787323,
      "rewards/margins": 2.7978358268737793,
      "rewards/rejected": -3.6015360355377197,
      "step": 433
    },
    {
      "epoch": 0.11,
      "grad_norm": 23.91290283203125,
      "kl": 0.0,
      "learning_rate": 4.432085841402774e-07,
      "logps/chosen": -223.78013610839844,
      "logps/rejected": -208.88961791992188,
      "loss": 0.3526,
      "rewards/chosen": -0.15191972255706787,
      "rewards/margins": 2.919671058654785,
      "rewards/rejected": -3.0715909004211426,
      "step": 434
    },
    {
      "epoch": 0.11,
      "grad_norm": 31.860248565673828,
      "kl": 0.0,
      "learning_rate": 4.430777283433656e-07,
      "logps/chosen": -178.88320922851562,
      "logps/rejected": -286.60174560546875,
      "loss": 0.4119,
      "rewards/chosen": -1.61677086353302,
      "rewards/margins": 2.9354071617126465,
      "rewards/rejected": -4.552177906036377,
      "step": 435
    },
    {
      "epoch": 0.11,
      "grad_norm": 32.55792236328125,
      "kl": 0.0,
      "learning_rate": 4.429468725464538e-07,
      "logps/chosen": -168.29994201660156,
      "logps/rejected": -196.41845703125,
      "loss": 0.3352,
      "rewards/chosen": 1.1356920003890991,
      "rewards/margins": 5.061020851135254,
      "rewards/rejected": -3.9253289699554443,
      "step": 436
    },
    {
      "epoch": 0.11,
      "grad_norm": 35.387332916259766,
      "kl": 0.0,
      "learning_rate": 4.42816016749542e-07,
      "logps/chosen": -148.4381103515625,
      "logps/rejected": -210.69398498535156,
      "loss": 0.382,
      "rewards/chosen": -1.410331130027771,
      "rewards/margins": 0.021602511405944824,
      "rewards/rejected": -1.4319336414337158,
      "step": 437
    },
    {
      "epoch": 0.11,
      "grad_norm": 32.027957916259766,
      "kl": 0.0,
      "learning_rate": 4.426851609526302e-07,
      "logps/chosen": -270.435791015625,
      "logps/rejected": -170.29534912109375,
      "loss": 0.2659,
      "rewards/chosen": -0.9582297801971436,
      "rewards/margins": 1.9723150730133057,
      "rewards/rejected": -2.930544853210449,
      "step": 438
    },
    {
      "epoch": 0.11,
      "grad_norm": 28.259552001953125,
      "kl": 0.0,
      "learning_rate": 4.425543051557184e-07,
      "logps/chosen": -171.87156677246094,
      "logps/rejected": -164.96092224121094,
      "loss": 0.3884,
      "rewards/chosen": -0.9051600098609924,
      "rewards/margins": 1.720262050628662,
      "rewards/rejected": -2.6254220008850098,
      "step": 439
    },
    {
      "epoch": 0.12,
      "grad_norm": 30.440942764282227,
      "kl": 0.0,
      "learning_rate": 4.424234493588066e-07,
      "logps/chosen": -263.3205261230469,
      "logps/rejected": -270.30535888671875,
      "loss": 0.3815,
      "rewards/chosen": -1.8120743036270142,
      "rewards/margins": 1.4591060876846313,
      "rewards/rejected": -3.2711803913116455,
      "step": 440
    },
    {
      "epoch": 0.12,
      "grad_norm": 35.62495040893555,
      "kl": 0.0,
      "learning_rate": 4.422925935618947e-07,
      "logps/chosen": -183.18359375,
      "logps/rejected": -287.8877258300781,
      "loss": 0.401,
      "rewards/chosen": -1.1367744207382202,
      "rewards/margins": 2.48806095123291,
      "rewards/rejected": -3.62483549118042,
      "step": 441
    },
    {
      "epoch": 0.12,
      "grad_norm": 26.652666091918945,
      "kl": 0.0,
      "learning_rate": 4.4216173776498297e-07,
      "logps/chosen": -238.51235961914062,
      "logps/rejected": -200.37075805664062,
      "loss": 0.3568,
      "rewards/chosen": -0.887931227684021,
      "rewards/margins": 1.5551592111587524,
      "rewards/rejected": -2.4430904388427734,
      "step": 442
    },
    {
      "epoch": 0.12,
      "grad_norm": 38.41551208496094,
      "kl": 0.0,
      "learning_rate": 4.4203088196807116e-07,
      "logps/chosen": -159.75192260742188,
      "logps/rejected": -239.3623046875,
      "loss": 0.2291,
      "rewards/chosen": 0.6571463346481323,
      "rewards/margins": 3.578679084777832,
      "rewards/rejected": -2.92153263092041,
      "step": 443
    },
    {
      "epoch": 0.12,
      "grad_norm": 27.979955673217773,
      "kl": 0.0,
      "learning_rate": 4.4190002617115936e-07,
      "logps/chosen": -234.58811950683594,
      "logps/rejected": -179.65280151367188,
      "loss": 0.2937,
      "rewards/chosen": -0.8682149052619934,
      "rewards/margins": 3.7219350337982178,
      "rewards/rejected": -4.590149879455566,
      "step": 444
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.59569549560547,
      "kl": 0.0,
      "learning_rate": 4.4176917037424756e-07,
      "logps/chosen": -220.4178009033203,
      "logps/rejected": -181.2563934326172,
      "loss": 0.5664,
      "rewards/chosen": -1.7921407222747803,
      "rewards/margins": -0.3066593408584595,
      "rewards/rejected": -1.4854813814163208,
      "step": 445
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.9988899230957,
      "kl": 0.0,
      "learning_rate": 4.4163831457733575e-07,
      "logps/chosen": -203.83201599121094,
      "logps/rejected": -372.3682861328125,
      "loss": 0.2888,
      "rewards/chosen": 0.25906336307525635,
      "rewards/margins": 4.292977809906006,
      "rewards/rejected": -4.033914566040039,
      "step": 446
    },
    {
      "epoch": 0.12,
      "grad_norm": 40.23044204711914,
      "kl": 0.0,
      "learning_rate": 4.4150745878042395e-07,
      "logps/chosen": -247.44821166992188,
      "logps/rejected": -221.32139587402344,
      "loss": 0.4277,
      "rewards/chosen": -0.29965513944625854,
      "rewards/margins": 2.7966206073760986,
      "rewards/rejected": -3.096275806427002,
      "step": 447
    },
    {
      "epoch": 0.12,
      "grad_norm": 32.306182861328125,
      "kl": 0.0,
      "learning_rate": 4.4137660298351214e-07,
      "logps/chosen": -174.62615966796875,
      "logps/rejected": -183.30177307128906,
      "loss": 0.4944,
      "rewards/chosen": -0.57502681016922,
      "rewards/margins": 2.9689502716064453,
      "rewards/rejected": -3.5439770221710205,
      "step": 448
    },
    {
      "epoch": 0.12,
      "grad_norm": 29.693349838256836,
      "kl": 0.0,
      "learning_rate": 4.4124574718660034e-07,
      "logps/chosen": -230.96018981933594,
      "logps/rejected": -289.1417236328125,
      "loss": 0.4293,
      "rewards/chosen": -3.256486177444458,
      "rewards/margins": -0.15216612815856934,
      "rewards/rejected": -3.1043200492858887,
      "step": 449
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.37322998046875,
      "kl": 0.0,
      "learning_rate": 4.4111489138968854e-07,
      "logps/chosen": -268.78509521484375,
      "logps/rejected": -179.531982421875,
      "loss": 0.3369,
      "rewards/chosen": -0.8727652430534363,
      "rewards/margins": 2.1034626960754395,
      "rewards/rejected": -2.9762279987335205,
      "step": 450
    },
    {
      "epoch": 0.12,
      "grad_norm": 34.335166931152344,
      "kl": 0.0,
      "learning_rate": 4.4098403559277673e-07,
      "logps/chosen": -316.95068359375,
      "logps/rejected": -154.5581512451172,
      "loss": 0.4674,
      "rewards/chosen": -2.679093360900879,
      "rewards/margins": -1.4148513078689575,
      "rewards/rejected": -1.2642420530319214,
      "step": 451
    },
    {
      "epoch": 0.12,
      "grad_norm": 39.86390686035156,
      "kl": 0.0,
      "learning_rate": 4.4085317979586493e-07,
      "logps/chosen": -219.30465698242188,
      "logps/rejected": -263.12744140625,
      "loss": 0.5171,
      "rewards/chosen": -0.31301993131637573,
      "rewards/margins": 0.6610985398292542,
      "rewards/rejected": -0.9741184711456299,
      "step": 452
    },
    {
      "epoch": 0.12,
      "grad_norm": 30.844038009643555,
      "kl": 0.0,
      "learning_rate": 4.407223239989531e-07,
      "logps/chosen": -225.07394409179688,
      "logps/rejected": -217.85385131835938,
      "loss": 0.3538,
      "rewards/chosen": -0.46413740515708923,
      "rewards/margins": 2.2845165729522705,
      "rewards/rejected": -2.7486538887023926,
      "step": 453
    },
    {
      "epoch": 0.12,
      "grad_norm": 41.72404098510742,
      "kl": 0.0,
      "learning_rate": 4.405914682020414e-07,
      "logps/chosen": -229.8594207763672,
      "logps/rejected": -241.03822326660156,
      "loss": 0.4037,
      "rewards/chosen": -0.8823609948158264,
      "rewards/margins": 0.31540435552597046,
      "rewards/rejected": -1.1977653503417969,
      "step": 454
    },
    {
      "epoch": 0.12,
      "grad_norm": 42.6037483215332,
      "kl": 0.0,
      "learning_rate": 4.4046061240512957e-07,
      "logps/chosen": -300.2820739746094,
      "logps/rejected": -227.77395629882812,
      "loss": 0.3454,
      "rewards/chosen": 0.1626671850681305,
      "rewards/margins": 2.2905962467193604,
      "rewards/rejected": -2.1279289722442627,
      "step": 455
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.86833190917969,
      "kl": 0.0,
      "learning_rate": 4.4032975660821777e-07,
      "logps/chosen": -318.31591796875,
      "logps/rejected": -182.9334716796875,
      "loss": 0.3524,
      "rewards/chosen": 0.35922086238861084,
      "rewards/margins": 3.2377238273620605,
      "rewards/rejected": -2.87850284576416,
      "step": 456
    },
    {
      "epoch": 0.12,
      "grad_norm": 37.31455612182617,
      "kl": 0.0,
      "learning_rate": 4.401989008113059e-07,
      "logps/chosen": -269.36700439453125,
      "logps/rejected": -237.45333862304688,
      "loss": 0.3255,
      "rewards/chosen": -0.11557312309741974,
      "rewards/margins": 1.8929672241210938,
      "rewards/rejected": -2.008540391921997,
      "step": 457
    },
    {
      "epoch": 0.12,
      "grad_norm": 38.61105728149414,
      "kl": 0.0,
      "learning_rate": 4.400680450143941e-07,
      "logps/chosen": -219.1864776611328,
      "logps/rejected": -252.305908203125,
      "loss": 0.3503,
      "rewards/chosen": -1.6341336965560913,
      "rewards/margins": 2.3688693046569824,
      "rewards/rejected": -4.003003120422363,
      "step": 458
    },
    {
      "epoch": 0.12,
      "grad_norm": 37.14291763305664,
      "kl": 0.0,
      "learning_rate": 4.399371892174823e-07,
      "logps/chosen": -265.4627380371094,
      "logps/rejected": -212.84605407714844,
      "loss": 0.2379,
      "rewards/chosen": 0.17642521858215332,
      "rewards/margins": 2.7223758697509766,
      "rewards/rejected": -2.5459506511688232,
      "step": 459
    },
    {
      "epoch": 0.12,
      "grad_norm": 22.45039939880371,
      "kl": 0.0,
      "learning_rate": 4.398063334205705e-07,
      "logps/chosen": -206.451171875,
      "logps/rejected": -218.98048400878906,
      "loss": 0.2924,
      "rewards/chosen": -1.1858196258544922,
      "rewards/margins": 1.884453296661377,
      "rewards/rejected": -3.070272922515869,
      "step": 460
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.07036590576172,
      "kl": 0.0,
      "learning_rate": 4.396754776236587e-07,
      "logps/chosen": -241.53549194335938,
      "logps/rejected": -200.14956665039062,
      "loss": 0.431,
      "rewards/chosen": -0.8274599313735962,
      "rewards/margins": 1.791495442390442,
      "rewards/rejected": -2.618955373764038,
      "step": 461
    },
    {
      "epoch": 0.12,
      "grad_norm": 39.4976806640625,
      "kl": 0.0,
      "learning_rate": 4.395446218267469e-07,
      "logps/chosen": -127.9756088256836,
      "logps/rejected": -189.22503662109375,
      "loss": 0.4124,
      "rewards/chosen": -0.7792689800262451,
      "rewards/margins": 0.9542557001113892,
      "rewards/rejected": -1.7335246801376343,
      "step": 462
    },
    {
      "epoch": 0.12,
      "grad_norm": 37.367706298828125,
      "kl": 0.0,
      "learning_rate": 4.394137660298351e-07,
      "logps/chosen": -269.013916015625,
      "logps/rejected": -239.1348876953125,
      "loss": 0.463,
      "rewards/chosen": -1.267537236213684,
      "rewards/margins": 0.260334849357605,
      "rewards/rejected": -1.527872085571289,
      "step": 463
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.60445022583008,
      "kl": 0.0,
      "learning_rate": 4.392829102329233e-07,
      "logps/chosen": -214.6671142578125,
      "logps/rejected": -296.5412902832031,
      "loss": 0.3431,
      "rewards/chosen": 0.05106091499328613,
      "rewards/margins": 4.666691780090332,
      "rewards/rejected": -4.615631103515625,
      "step": 464
    },
    {
      "epoch": 0.12,
      "grad_norm": 43.4371337890625,
      "kl": 0.0,
      "learning_rate": 4.391520544360115e-07,
      "logps/chosen": -294.7562561035156,
      "logps/rejected": -311.7882995605469,
      "loss": 0.4404,
      "rewards/chosen": -1.535415768623352,
      "rewards/margins": 3.4017748832702637,
      "rewards/rejected": -4.937190532684326,
      "step": 465
    },
    {
      "epoch": 0.12,
      "grad_norm": 31.70909881591797,
      "kl": 0.0,
      "learning_rate": 4.390211986390997e-07,
      "logps/chosen": -289.6376953125,
      "logps/rejected": -218.08839416503906,
      "loss": 0.3983,
      "rewards/chosen": -0.7291394472122192,
      "rewards/margins": 1.2570115327835083,
      "rewards/rejected": -1.9861509799957275,
      "step": 466
    },
    {
      "epoch": 0.12,
      "grad_norm": 36.29698181152344,
      "kl": 0.0,
      "learning_rate": 4.388903428421879e-07,
      "logps/chosen": -302.9260559082031,
      "logps/rejected": -270.5092468261719,
      "loss": 0.3438,
      "rewards/chosen": -1.1891206502914429,
      "rewards/margins": 1.5823391675949097,
      "rewards/rejected": -2.7714598178863525,
      "step": 467
    },
    {
      "epoch": 0.12,
      "grad_norm": 41.817020416259766,
      "kl": 0.0,
      "learning_rate": 4.387594870452761e-07,
      "logps/chosen": -211.2220458984375,
      "logps/rejected": -263.3524169921875,
      "loss": 0.2837,
      "rewards/chosen": 0.021623237058520317,
      "rewards/margins": 3.064910411834717,
      "rewards/rejected": -3.0432872772216797,
      "step": 468
    },
    {
      "epoch": 0.12,
      "grad_norm": 32.100032806396484,
      "kl": 0.0,
      "learning_rate": 4.386286312483643e-07,
      "logps/chosen": -197.6712646484375,
      "logps/rejected": -304.293212890625,
      "loss": 0.3716,
      "rewards/chosen": -0.31620892882347107,
      "rewards/margins": 4.1429762840271,
      "rewards/rejected": -4.4591851234436035,
      "step": 469
    },
    {
      "epoch": 0.12,
      "grad_norm": 35.59612274169922,
      "kl": 0.0,
      "learning_rate": 4.384977754514525e-07,
      "logps/chosen": -198.55593872070312,
      "logps/rejected": -264.3667297363281,
      "loss": 0.2592,
      "rewards/chosen": -0.7703940272331238,
      "rewards/margins": 1.892730474472046,
      "rewards/rejected": -2.6631245613098145,
      "step": 470
    },
    {
      "epoch": 0.12,
      "grad_norm": 39.834415435791016,
      "kl": 0.0,
      "learning_rate": 4.383669196545407e-07,
      "logps/chosen": -242.6627655029297,
      "logps/rejected": -194.5601806640625,
      "loss": 0.2882,
      "rewards/chosen": -0.9662243127822876,
      "rewards/margins": 0.030240893363952637,
      "rewards/rejected": -0.9964652061462402,
      "step": 471
    },
    {
      "epoch": 0.12,
      "grad_norm": 38.55018615722656,
      "kl": 0.0,
      "learning_rate": 4.3823606385762885e-07,
      "logps/chosen": -211.4266357421875,
      "logps/rejected": -157.74371337890625,
      "loss": 0.3696,
      "rewards/chosen": -0.15886731445789337,
      "rewards/margins": 2.9636335372924805,
      "rewards/rejected": -3.1225008964538574,
      "step": 472
    },
    {
      "epoch": 0.12,
      "grad_norm": 36.88655090332031,
      "kl": 0.0,
      "learning_rate": 4.3810520806071705e-07,
      "logps/chosen": -195.3677215576172,
      "logps/rejected": -293.7377624511719,
      "loss": 0.2387,
      "rewards/chosen": 0.6904792189598083,
      "rewards/margins": 3.3669517040252686,
      "rewards/rejected": -2.6764724254608154,
      "step": 473
    },
    {
      "epoch": 0.12,
      "grad_norm": 38.9611701965332,
      "kl": 0.0,
      "learning_rate": 4.3797435226380524e-07,
      "logps/chosen": -243.78555297851562,
      "logps/rejected": -197.21170043945312,
      "loss": 0.4396,
      "rewards/chosen": -0.7475346326828003,
      "rewards/margins": 0.7941581010818481,
      "rewards/rejected": -1.5416927337646484,
      "step": 474
    },
    {
      "epoch": 0.12,
      "grad_norm": 33.91997528076172,
      "kl": 0.0,
      "learning_rate": 4.3784349646689344e-07,
      "logps/chosen": -268.0740661621094,
      "logps/rejected": -150.24636840820312,
      "loss": 0.3428,
      "rewards/chosen": -2.255317211151123,
      "rewards/margins": 0.03300309181213379,
      "rewards/rejected": -2.288320302963257,
      "step": 475
    },
    {
      "epoch": 0.12,
      "grad_norm": 35.875980377197266,
      "kl": 0.0,
      "learning_rate": 4.3771264066998164e-07,
      "logps/chosen": -296.677001953125,
      "logps/rejected": -214.2599334716797,
      "loss": 0.3321,
      "rewards/chosen": -1.3506510257720947,
      "rewards/margins": 1.1312048435211182,
      "rewards/rejected": -2.481855869293213,
      "step": 476
    },
    {
      "epoch": 0.12,
      "grad_norm": 34.1489372253418,
      "kl": 0.0,
      "learning_rate": 4.3758178487306983e-07,
      "logps/chosen": -250.9139404296875,
      "logps/rejected": -231.31741333007812,
      "loss": 0.3651,
      "rewards/chosen": -1.9945584535598755,
      "rewards/margins": 1.617838978767395,
      "rewards/rejected": -3.6123974323272705,
      "step": 477
    },
    {
      "epoch": 0.13,
      "grad_norm": 35.04484939575195,
      "kl": 0.0,
      "learning_rate": 4.3745092907615803e-07,
      "logps/chosen": -236.66690063476562,
      "logps/rejected": -208.755615234375,
      "loss": 0.3611,
      "rewards/chosen": -0.8417435884475708,
      "rewards/margins": 1.9930511713027954,
      "rewards/rejected": -2.834794759750366,
      "step": 478
    },
    {
      "epoch": 0.13,
      "grad_norm": 29.14078140258789,
      "kl": 0.0,
      "learning_rate": 4.373200732792462e-07,
      "logps/chosen": -193.83944702148438,
      "logps/rejected": -175.72816467285156,
      "loss": 0.3968,
      "rewards/chosen": -1.354570746421814,
      "rewards/margins": 1.389928936958313,
      "rewards/rejected": -2.744499683380127,
      "step": 479
    },
    {
      "epoch": 0.13,
      "grad_norm": 37.82932662963867,
      "kl": 0.0,
      "learning_rate": 4.3718921748233447e-07,
      "logps/chosen": -293.6022644042969,
      "logps/rejected": -255.2928009033203,
      "loss": 0.4888,
      "rewards/chosen": -0.7668954133987427,
      "rewards/margins": -0.5312289595603943,
      "rewards/rejected": -0.23566646873950958,
      "step": 480
    },
    {
      "epoch": 0.13,
      "grad_norm": 34.05666732788086,
      "kl": 0.0,
      "learning_rate": 4.3705836168542267e-07,
      "logps/chosen": -210.43983459472656,
      "logps/rejected": -252.8617706298828,
      "loss": 0.3394,
      "rewards/chosen": -0.34639981389045715,
      "rewards/margins": 1.4680942296981812,
      "rewards/rejected": -1.814494013786316,
      "step": 481
    },
    {
      "epoch": 0.13,
      "grad_norm": 26.431236267089844,
      "kl": 0.0,
      "learning_rate": 4.3692750588851086e-07,
      "logps/chosen": -230.23416137695312,
      "logps/rejected": -259.32794189453125,
      "loss": 0.3312,
      "rewards/chosen": 1.6871323585510254,
      "rewards/margins": 4.462740898132324,
      "rewards/rejected": -2.7756083011627197,
      "step": 482
    },
    {
      "epoch": 0.13,
      "grad_norm": 34.93710708618164,
      "kl": 0.0,
      "learning_rate": 4.3679665009159906e-07,
      "logps/chosen": -192.50726318359375,
      "logps/rejected": -204.75120544433594,
      "loss": 0.4178,
      "rewards/chosen": -0.37875622510910034,
      "rewards/margins": 2.6469016075134277,
      "rewards/rejected": -3.025657892227173,
      "step": 483
    },
    {
      "epoch": 0.13,
      "grad_norm": 29.03684425354004,
      "kl": 0.0,
      "learning_rate": 4.3666579429468726e-07,
      "logps/chosen": -230.1165771484375,
      "logps/rejected": -233.3046875,
      "loss": 0.2737,
      "rewards/chosen": -0.02157449722290039,
      "rewards/margins": 3.5609054565429688,
      "rewards/rejected": -3.582479953765869,
      "step": 484
    },
    {
      "epoch": 0.13,
      "grad_norm": 34.31352233886719,
      "kl": 0.0,
      "learning_rate": 4.3653493849777545e-07,
      "logps/chosen": -213.8225555419922,
      "logps/rejected": -216.7666015625,
      "loss": 0.4475,
      "rewards/chosen": -0.675076961517334,
      "rewards/margins": 1.7822785377502441,
      "rewards/rejected": -2.457355499267578,
      "step": 485
    },
    {
      "epoch": 0.13,
      "grad_norm": 32.48976516723633,
      "kl": 0.0,
      "learning_rate": 4.3640408270086365e-07,
      "logps/chosen": -186.95602416992188,
      "logps/rejected": -334.0401306152344,
      "loss": 0.237,
      "rewards/chosen": -1.392081379890442,
      "rewards/margins": 2.3323307037353516,
      "rewards/rejected": -3.724411964416504,
      "step": 486
    },
    {
      "epoch": 0.13,
      "grad_norm": 26.30244255065918,
      "kl": 0.0,
      "learning_rate": 4.3627322690395185e-07,
      "logps/chosen": -195.7552947998047,
      "logps/rejected": -242.45310974121094,
      "loss": 0.2757,
      "rewards/chosen": -0.8110968470573425,
      "rewards/margins": 3.5064804553985596,
      "rewards/rejected": -4.317577362060547,
      "step": 487
    },
    {
      "epoch": 0.13,
      "grad_norm": 43.99164962768555,
      "kl": 0.0,
      "learning_rate": 4.3614237110704e-07,
      "logps/chosen": -155.76412963867188,
      "logps/rejected": -216.58558654785156,
      "loss": 0.316,
      "rewards/chosen": -0.40736696124076843,
      "rewards/margins": 2.6981070041656494,
      "rewards/rejected": -3.1054739952087402,
      "step": 488
    },
    {
      "epoch": 0.13,
      "grad_norm": 32.1126594543457,
      "kl": 0.0,
      "learning_rate": 4.360115153101282e-07,
      "logps/chosen": -205.777099609375,
      "logps/rejected": -174.24252319335938,
      "loss": 0.3616,
      "rewards/chosen": -0.022801468148827553,
      "rewards/margins": 2.2425994873046875,
      "rewards/rejected": -2.2654008865356445,
      "step": 489
    },
    {
      "epoch": 0.13,
      "grad_norm": 33.053733825683594,
      "kl": 0.0,
      "learning_rate": 4.358806595132164e-07,
      "logps/chosen": -283.0188293457031,
      "logps/rejected": -314.4449157714844,
      "loss": 0.2158,
      "rewards/chosen": -0.4068226218223572,
      "rewards/margins": 2.8746418952941895,
      "rewards/rejected": -3.2814645767211914,
      "step": 490
    },
    {
      "epoch": 0.13,
      "grad_norm": 35.346317291259766,
      "kl": 0.0,
      "learning_rate": 4.357498037163046e-07,
      "logps/chosen": -168.42393493652344,
      "logps/rejected": -167.74624633789062,
      "loss": 0.3801,
      "rewards/chosen": -0.9554387331008911,
      "rewards/margins": 1.6682840585708618,
      "rewards/rejected": -2.623722791671753,
      "step": 491
    },
    {
      "epoch": 0.13,
      "grad_norm": 34.33647155761719,
      "kl": 0.0,
      "learning_rate": 4.3561894791939277e-07,
      "logps/chosen": -324.6409606933594,
      "logps/rejected": -205.80564880371094,
      "loss": 0.4075,
      "rewards/chosen": -0.8289295434951782,
      "rewards/margins": 3.134256362915039,
      "rewards/rejected": -3.9631857872009277,
      "step": 492
    },
    {
      "epoch": 0.13,
      "grad_norm": 32.74213409423828,
      "kl": 0.0,
      "learning_rate": 4.35488092122481e-07,
      "logps/chosen": -186.9199981689453,
      "logps/rejected": -213.62106323242188,
      "loss": 0.288,
      "rewards/chosen": -0.1531243622303009,
      "rewards/margins": 2.8898119926452637,
      "rewards/rejected": -3.042936325073242,
      "step": 493
    },
    {
      "epoch": 0.13,
      "grad_norm": 39.94125747680664,
      "kl": 0.0,
      "learning_rate": 4.353572363255692e-07,
      "logps/chosen": -150.59657287597656,
      "logps/rejected": -246.30706787109375,
      "loss": 0.3572,
      "rewards/chosen": -0.5869843363761902,
      "rewards/margins": 2.4806878566741943,
      "rewards/rejected": -3.0676722526550293,
      "step": 494
    },
    {
      "epoch": 0.13,
      "grad_norm": 35.68771743774414,
      "kl": 0.0,
      "learning_rate": 4.352263805286574e-07,
      "logps/chosen": -313.64434814453125,
      "logps/rejected": -268.2545471191406,
      "loss": 0.3196,
      "rewards/chosen": -1.9877912998199463,
      "rewards/margins": 2.831235647201538,
      "rewards/rejected": -4.819026947021484,
      "step": 495
    },
    {
      "epoch": 0.13,
      "grad_norm": 28.748931884765625,
      "kl": 0.0,
      "learning_rate": 4.350955247317456e-07,
      "logps/chosen": -221.89503479003906,
      "logps/rejected": -178.8087158203125,
      "loss": 0.4381,
      "rewards/chosen": -1.9237765073776245,
      "rewards/margins": 0.5343056917190552,
      "rewards/rejected": -2.4580821990966797,
      "step": 496
    },
    {
      "epoch": 0.13,
      "grad_norm": 34.48188400268555,
      "kl": 0.0,
      "learning_rate": 4.349646689348338e-07,
      "logps/chosen": -232.4418182373047,
      "logps/rejected": -245.66510009765625,
      "loss": 0.301,
      "rewards/chosen": -0.1335909068584442,
      "rewards/margins": 3.0244925022125244,
      "rewards/rejected": -3.158083438873291,
      "step": 497
    },
    {
      "epoch": 0.13,
      "grad_norm": 30.280105590820312,
      "kl": 0.0,
      "learning_rate": 4.34833813137922e-07,
      "logps/chosen": -176.8372344970703,
      "logps/rejected": -306.17205810546875,
      "loss": 0.3286,
      "rewards/chosen": -0.7078924179077148,
      "rewards/margins": 5.860451698303223,
      "rewards/rejected": -6.5683441162109375,
      "step": 498
    },
    {
      "epoch": 0.13,
      "grad_norm": 28.61834144592285,
      "kl": 0.0,
      "learning_rate": 4.347029573410102e-07,
      "logps/chosen": -205.4990234375,
      "logps/rejected": -324.9005432128906,
      "loss": 0.2317,
      "rewards/chosen": -0.580607533454895,
      "rewards/margins": 4.204123020172119,
      "rewards/rejected": -4.784730434417725,
      "step": 499
    },
    {
      "epoch": 0.13,
      "grad_norm": 31.07453727722168,
      "kl": 0.0,
      "learning_rate": 4.345721015440984e-07,
      "logps/chosen": -212.39117431640625,
      "logps/rejected": -107.82008361816406,
      "loss": 0.3148,
      "rewards/chosen": 0.3389224112033844,
      "rewards/margins": 2.8236329555511475,
      "rewards/rejected": -2.484710454940796,
      "step": 500
    },
    {
      "epoch": 0.13,
      "grad_norm": 30.91390609741211,
      "kl": 0.0,
      "learning_rate": 4.344412457471866e-07,
      "logps/chosen": -230.955078125,
      "logps/rejected": -310.66424560546875,
      "loss": 0.3263,
      "rewards/chosen": -0.5058167576789856,
      "rewards/margins": 1.7073900699615479,
      "rewards/rejected": -2.2132067680358887,
      "step": 501
    },
    {
      "epoch": 0.13,
      "grad_norm": 33.28854751586914,
      "kl": 0.0,
      "learning_rate": 4.343103899502748e-07,
      "logps/chosen": -167.78994750976562,
      "logps/rejected": -217.27552795410156,
      "loss": 0.2653,
      "rewards/chosen": -0.5831389427185059,
      "rewards/margins": 2.3856923580169678,
      "rewards/rejected": -2.9688313007354736,
      "step": 502
    },
    {
      "epoch": 0.13,
      "grad_norm": 32.2841682434082,
      "kl": 0.0,
      "learning_rate": 4.3417953415336293e-07,
      "logps/chosen": -182.3533935546875,
      "logps/rejected": -282.0680847167969,
      "loss": 0.3076,
      "rewards/chosen": -1.6923186779022217,
      "rewards/margins": 2.6697137355804443,
      "rewards/rejected": -4.362032413482666,
      "step": 503
    },
    {
      "epoch": 0.13,
      "grad_norm": 35.49862289428711,
      "kl": 0.0,
      "learning_rate": 4.340486783564511e-07,
      "logps/chosen": -207.2657470703125,
      "logps/rejected": -210.40371704101562,
      "loss": 0.3002,
      "rewards/chosen": 1.06424880027771,
      "rewards/margins": 3.628180742263794,
      "rewards/rejected": -2.563931941986084,
      "step": 504
    },
    {
      "epoch": 0.13,
      "grad_norm": 25.61597442626953,
      "kl": 0.0,
      "learning_rate": 4.339178225595393e-07,
      "logps/chosen": -242.67938232421875,
      "logps/rejected": -326.7820129394531,
      "loss": 0.2194,
      "rewards/chosen": -1.2337604761123657,
      "rewards/margins": 4.276817798614502,
      "rewards/rejected": -5.510578155517578,
      "step": 505
    },
    {
      "epoch": 0.13,
      "grad_norm": 35.94837951660156,
      "kl": 0.0,
      "learning_rate": 4.3378696676262757e-07,
      "logps/chosen": -195.439697265625,
      "logps/rejected": -228.76486206054688,
      "loss": 0.4639,
      "rewards/chosen": -0.31265154480934143,
      "rewards/margins": 1.3135621547698975,
      "rewards/rejected": -1.6262136697769165,
      "step": 506
    },
    {
      "epoch": 0.13,
      "grad_norm": 28.738576889038086,
      "kl": 0.0,
      "learning_rate": 4.3365611096571577e-07,
      "logps/chosen": -307.04888916015625,
      "logps/rejected": -234.9107208251953,
      "loss": 0.2035,
      "rewards/chosen": -0.4320366680622101,
      "rewards/margins": 3.774815797805786,
      "rewards/rejected": -4.206852436065674,
      "step": 507
    },
    {
      "epoch": 0.13,
      "grad_norm": 38.1010856628418,
      "kl": 0.0,
      "learning_rate": 4.3352525516880396e-07,
      "logps/chosen": -271.90325927734375,
      "logps/rejected": -230.39442443847656,
      "loss": 0.251,
      "rewards/chosen": 0.012623111717402935,
      "rewards/margins": 2.0225515365600586,
      "rewards/rejected": -2.0099284648895264,
      "step": 508
    },
    {
      "epoch": 0.13,
      "grad_norm": 36.99441146850586,
      "kl": 0.0,
      "learning_rate": 4.3339439937189216e-07,
      "logps/chosen": -268.9024353027344,
      "logps/rejected": -292.2628479003906,
      "loss": 0.1964,
      "rewards/chosen": 0.6849376559257507,
      "rewards/margins": 4.367363929748535,
      "rewards/rejected": -3.6824264526367188,
      "step": 509
    },
    {
      "epoch": 0.13,
      "grad_norm": 28.90747833251953,
      "kl": 0.0,
      "learning_rate": 4.3326354357498036e-07,
      "logps/chosen": -326.12493896484375,
      "logps/rejected": -215.9102020263672,
      "loss": 0.1947,
      "rewards/chosen": -0.9350789785385132,
      "rewards/margins": 2.6486573219299316,
      "rewards/rejected": -3.5837364196777344,
      "step": 510
    },
    {
      "epoch": 0.13,
      "grad_norm": 33.20688247680664,
      "kl": 0.0,
      "learning_rate": 4.3313268777806855e-07,
      "logps/chosen": -221.3362579345703,
      "logps/rejected": -250.4815673828125,
      "loss": 0.3717,
      "rewards/chosen": 0.06487315148115158,
      "rewards/margins": 3.2546393871307373,
      "rewards/rejected": -3.1897661685943604,
      "step": 511
    },
    {
      "epoch": 0.13,
      "grad_norm": 41.531349182128906,
      "kl": 0.0,
      "learning_rate": 4.3300183198115675e-07,
      "logps/chosen": -205.99996948242188,
      "logps/rejected": -335.3144226074219,
      "loss": 0.3992,
      "rewards/chosen": 0.4203885793685913,
      "rewards/margins": 3.3626184463500977,
      "rewards/rejected": -2.942229747772217,
      "step": 512
    },
    {
      "epoch": 0.13,
      "grad_norm": 38.602996826171875,
      "kl": 0.0,
      "learning_rate": 4.3287097618424494e-07,
      "logps/chosen": -310.7867126464844,
      "logps/rejected": -257.95892333984375,
      "loss": 0.2275,
      "rewards/chosen": -0.5336145758628845,
      "rewards/margins": 2.8806161880493164,
      "rewards/rejected": -3.4142308235168457,
      "step": 513
    },
    {
      "epoch": 0.13,
      "grad_norm": 27.161235809326172,
      "kl": 0.0,
      "learning_rate": 4.3274012038733314e-07,
      "logps/chosen": -326.2843017578125,
      "logps/rejected": -185.2740936279297,
      "loss": 0.3319,
      "rewards/chosen": -0.76652991771698,
      "rewards/margins": 1.6281522512435913,
      "rewards/rejected": -2.3946821689605713,
      "step": 514
    },
    {
      "epoch": 0.13,
      "grad_norm": 31.2752685546875,
      "kl": 0.0,
      "learning_rate": 4.3260926459042134e-07,
      "logps/chosen": -263.0508728027344,
      "logps/rejected": -212.15830993652344,
      "loss": 0.3423,
      "rewards/chosen": -2.017266035079956,
      "rewards/margins": 0.27599525451660156,
      "rewards/rejected": -2.2932612895965576,
      "step": 515
    },
    {
      "epoch": 0.14,
      "grad_norm": 30.69701385498047,
      "kl": 0.0,
      "learning_rate": 4.3247840879350953e-07,
      "logps/chosen": -248.09197998046875,
      "logps/rejected": -316.41058349609375,
      "loss": 0.1471,
      "rewards/chosen": -0.44200658798217773,
      "rewards/margins": 5.960631847381592,
      "rewards/rejected": -6.4026384353637695,
      "step": 516
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.12960433959961,
      "kl": 0.0,
      "learning_rate": 4.3234755299659773e-07,
      "logps/chosen": -255.32371520996094,
      "logps/rejected": -278.0555419921875,
      "loss": 0.2701,
      "rewards/chosen": -0.7212596535682678,
      "rewards/margins": 2.7007358074188232,
      "rewards/rejected": -3.4219954013824463,
      "step": 517
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.733726501464844,
      "kl": 0.0,
      "learning_rate": 4.32216697199686e-07,
      "logps/chosen": -276.49493408203125,
      "logps/rejected": -223.2489013671875,
      "loss": 0.3113,
      "rewards/chosen": -0.1942574679851532,
      "rewards/margins": 1.6792775392532349,
      "rewards/rejected": -1.8735350370407104,
      "step": 518
    },
    {
      "epoch": 0.14,
      "grad_norm": 38.33927536010742,
      "kl": 0.0,
      "learning_rate": 4.320858414027741e-07,
      "logps/chosen": -331.3040466308594,
      "logps/rejected": -308.75128173828125,
      "loss": 0.3721,
      "rewards/chosen": 0.15606045722961426,
      "rewards/margins": 3.0476534366607666,
      "rewards/rejected": -2.8915929794311523,
      "step": 519
    },
    {
      "epoch": 0.14,
      "grad_norm": 35.864749908447266,
      "kl": 0.0,
      "learning_rate": 4.319549856058623e-07,
      "logps/chosen": -233.13507080078125,
      "logps/rejected": -319.062255859375,
      "loss": 0.2592,
      "rewards/chosen": -1.1843230724334717,
      "rewards/margins": 2.3068020343780518,
      "rewards/rejected": -3.4911251068115234,
      "step": 520
    },
    {
      "epoch": 0.14,
      "grad_norm": 35.09690475463867,
      "kl": 0.0,
      "learning_rate": 4.318241298089505e-07,
      "logps/chosen": -216.5358123779297,
      "logps/rejected": -185.92214965820312,
      "loss": 0.3433,
      "rewards/chosen": -0.31711503863334656,
      "rewards/margins": 2.2386279106140137,
      "rewards/rejected": -2.5557429790496826,
      "step": 521
    },
    {
      "epoch": 0.14,
      "grad_norm": 31.03679656982422,
      "kl": 0.0,
      "learning_rate": 4.316932740120387e-07,
      "logps/chosen": -218.7315673828125,
      "logps/rejected": -237.93017578125,
      "loss": 0.2502,
      "rewards/chosen": 0.30230313539505005,
      "rewards/margins": 3.891742467880249,
      "rewards/rejected": -3.5894393920898438,
      "step": 522
    },
    {
      "epoch": 0.14,
      "grad_norm": 35.12614059448242,
      "kl": 0.0,
      "learning_rate": 4.315624182151269e-07,
      "logps/chosen": -248.89341735839844,
      "logps/rejected": -218.71405029296875,
      "loss": 0.4314,
      "rewards/chosen": -0.9112810492515564,
      "rewards/margins": 1.5321125984191895,
      "rewards/rejected": -2.4433937072753906,
      "step": 523
    },
    {
      "epoch": 0.14,
      "grad_norm": 34.43649673461914,
      "kl": 0.0,
      "learning_rate": 4.314315624182151e-07,
      "logps/chosen": -174.38783264160156,
      "logps/rejected": -224.63009643554688,
      "loss": 0.1099,
      "rewards/chosen": 1.0011378526687622,
      "rewards/margins": 4.093344688415527,
      "rewards/rejected": -3.0922069549560547,
      "step": 524
    },
    {
      "epoch": 0.14,
      "grad_norm": 34.96295166015625,
      "kl": 0.0,
      "learning_rate": 4.313007066213033e-07,
      "logps/chosen": -292.97552490234375,
      "logps/rejected": -306.924072265625,
      "loss": 0.3233,
      "rewards/chosen": -2.473991870880127,
      "rewards/margins": 2.015058994293213,
      "rewards/rejected": -4.48905086517334,
      "step": 525
    },
    {
      "epoch": 0.14,
      "grad_norm": 42.44701385498047,
      "kl": 0.0,
      "learning_rate": 4.311698508243915e-07,
      "logps/chosen": -219.43673706054688,
      "logps/rejected": -317.16217041015625,
      "loss": 0.3468,
      "rewards/chosen": -0.6824592351913452,
      "rewards/margins": 1.6270591020584106,
      "rewards/rejected": -2.309518337249756,
      "step": 526
    },
    {
      "epoch": 0.14,
      "grad_norm": 33.045780181884766,
      "kl": 0.0,
      "learning_rate": 4.310389950274797e-07,
      "logps/chosen": -237.01206970214844,
      "logps/rejected": -284.2718505859375,
      "loss": 0.3741,
      "rewards/chosen": -1.5432114601135254,
      "rewards/margins": 1.7887234687805176,
      "rewards/rejected": -3.331934928894043,
      "step": 527
    },
    {
      "epoch": 0.14,
      "grad_norm": 64.60256958007812,
      "kl": 0.0,
      "learning_rate": 4.309081392305679e-07,
      "logps/chosen": -213.1400909423828,
      "logps/rejected": -363.4927673339844,
      "loss": 0.2004,
      "rewards/chosen": -0.3685699701309204,
      "rewards/margins": 3.1882615089416504,
      "rewards/rejected": -3.5568313598632812,
      "step": 528
    },
    {
      "epoch": 0.14,
      "grad_norm": 45.85017776489258,
      "kl": 0.0,
      "learning_rate": 4.307772834336561e-07,
      "logps/chosen": -292.92279052734375,
      "logps/rejected": -285.4364013671875,
      "loss": 0.3617,
      "rewards/chosen": 0.07168924808502197,
      "rewards/margins": 2.3396801948547363,
      "rewards/rejected": -2.267990827560425,
      "step": 529
    },
    {
      "epoch": 0.14,
      "grad_norm": 29.200170516967773,
      "kl": 0.0,
      "learning_rate": 4.306464276367443e-07,
      "logps/chosen": -171.74261474609375,
      "logps/rejected": -179.36610412597656,
      "loss": 0.3663,
      "rewards/chosen": -0.33816060423851013,
      "rewards/margins": 2.3799824714660645,
      "rewards/rejected": -2.7181429862976074,
      "step": 530
    },
    {
      "epoch": 0.14,
      "grad_norm": 27.944580078125,
      "kl": 0.0,
      "learning_rate": 4.305155718398325e-07,
      "logps/chosen": -166.678466796875,
      "logps/rejected": -235.33338928222656,
      "loss": 0.3208,
      "rewards/chosen": -0.9628866910934448,
      "rewards/margins": 1.1621040105819702,
      "rewards/rejected": -2.124990701675415,
      "step": 531
    },
    {
      "epoch": 0.14,
      "grad_norm": 27.248119354248047,
      "kl": 0.0,
      "learning_rate": 4.303847160429207e-07,
      "logps/chosen": -226.89649963378906,
      "logps/rejected": -243.8878173828125,
      "loss": 0.3707,
      "rewards/chosen": -0.1505812406539917,
      "rewards/margins": 2.931156635284424,
      "rewards/rejected": -3.081737756729126,
      "step": 532
    },
    {
      "epoch": 0.14,
      "grad_norm": 30.732263565063477,
      "kl": 0.0,
      "learning_rate": 4.302538602460089e-07,
      "logps/chosen": -165.86468505859375,
      "logps/rejected": -238.6931915283203,
      "loss": 0.4329,
      "rewards/chosen": -0.7833044528961182,
      "rewards/margins": 1.562227725982666,
      "rewards/rejected": -2.345532178878784,
      "step": 533
    },
    {
      "epoch": 0.14,
      "grad_norm": 24.79857063293457,
      "kl": 0.0,
      "learning_rate": 4.3012300444909706e-07,
      "logps/chosen": -260.51019287109375,
      "logps/rejected": -267.1363830566406,
      "loss": 0.2559,
      "rewards/chosen": 0.522705078125,
      "rewards/margins": 4.974919319152832,
      "rewards/rejected": -4.452214241027832,
      "step": 534
    },
    {
      "epoch": 0.14,
      "grad_norm": 34.72758483886719,
      "kl": 0.0,
      "learning_rate": 4.2999214865218526e-07,
      "logps/chosen": -218.3809814453125,
      "logps/rejected": -179.02288818359375,
      "loss": 0.3901,
      "rewards/chosen": -0.4237063527107239,
      "rewards/margins": 2.7379310131073,
      "rewards/rejected": -3.161637306213379,
      "step": 535
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.278099060058594,
      "kl": 0.0,
      "learning_rate": 4.2986129285527345e-07,
      "logps/chosen": -238.82688903808594,
      "logps/rejected": -274.1971130371094,
      "loss": 0.3413,
      "rewards/chosen": 0.003850996494293213,
      "rewards/margins": 2.9562597274780273,
      "rewards/rejected": -2.952408790588379,
      "step": 536
    },
    {
      "epoch": 0.14,
      "grad_norm": 38.88985824584961,
      "kl": 0.0,
      "learning_rate": 4.2973043705836165e-07,
      "logps/chosen": -348.93212890625,
      "logps/rejected": -186.86294555664062,
      "loss": 0.3678,
      "rewards/chosen": -0.6001288294792175,
      "rewards/margins": 1.5558688640594482,
      "rewards/rejected": -2.1559977531433105,
      "step": 537
    },
    {
      "epoch": 0.14,
      "grad_norm": 43.40470504760742,
      "kl": 0.0,
      "learning_rate": 4.2959958126144985e-07,
      "logps/chosen": -260.8814392089844,
      "logps/rejected": -192.5634307861328,
      "loss": 0.3342,
      "rewards/chosen": -0.5589590072631836,
      "rewards/margins": 3.7600107192993164,
      "rewards/rejected": -4.3189697265625,
      "step": 538
    },
    {
      "epoch": 0.14,
      "grad_norm": 43.13268280029297,
      "kl": 0.0,
      "learning_rate": 4.2946872546453804e-07,
      "logps/chosen": -282.29571533203125,
      "logps/rejected": -256.6288757324219,
      "loss": 0.383,
      "rewards/chosen": 0.4491698741912842,
      "rewards/margins": 2.131098508834839,
      "rewards/rejected": -1.6819286346435547,
      "step": 539
    },
    {
      "epoch": 0.14,
      "grad_norm": 34.14078140258789,
      "kl": 0.0,
      "learning_rate": 4.2933786966762624e-07,
      "logps/chosen": -225.33982849121094,
      "logps/rejected": -292.35760498046875,
      "loss": 0.2406,
      "rewards/chosen": 0.6705055236816406,
      "rewards/margins": 3.4733963012695312,
      "rewards/rejected": -2.8028907775878906,
      "step": 540
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.075401306152344,
      "kl": 0.0,
      "learning_rate": 4.2920701387071443e-07,
      "logps/chosen": -250.06369018554688,
      "logps/rejected": -244.3541717529297,
      "loss": 0.2555,
      "rewards/chosen": 0.8597911596298218,
      "rewards/margins": 3.334747791290283,
      "rewards/rejected": -2.474956512451172,
      "step": 541
    },
    {
      "epoch": 0.14,
      "grad_norm": 26.739641189575195,
      "kl": 0.0,
      "learning_rate": 4.2907615807380263e-07,
      "logps/chosen": -330.94866943359375,
      "logps/rejected": -247.71241760253906,
      "loss": 0.2611,
      "rewards/chosen": 0.851098895072937,
      "rewards/margins": 4.732474327087402,
      "rewards/rejected": -3.881375551223755,
      "step": 542
    },
    {
      "epoch": 0.14,
      "grad_norm": 28.64678955078125,
      "kl": 0.0,
      "learning_rate": 4.289453022768908e-07,
      "logps/chosen": -200.18264770507812,
      "logps/rejected": -195.34963989257812,
      "loss": 0.3216,
      "rewards/chosen": -1.2550278902053833,
      "rewards/margins": 0.44221389293670654,
      "rewards/rejected": -1.6972417831420898,
      "step": 543
    },
    {
      "epoch": 0.14,
      "grad_norm": 33.95494842529297,
      "kl": 0.0,
      "learning_rate": 4.288144464799791e-07,
      "logps/chosen": -177.20236206054688,
      "logps/rejected": -241.26626586914062,
      "loss": 0.37,
      "rewards/chosen": -0.8912516832351685,
      "rewards/margins": 1.3814114332199097,
      "rewards/rejected": -2.272663116455078,
      "step": 544
    },
    {
      "epoch": 0.14,
      "grad_norm": 29.539703369140625,
      "kl": 0.0,
      "learning_rate": 4.2868359068306727e-07,
      "logps/chosen": -168.141845703125,
      "logps/rejected": -123.10795593261719,
      "loss": 0.2149,
      "rewards/chosen": -1.0771909952163696,
      "rewards/margins": 1.4760149717330933,
      "rewards/rejected": -2.553205966949463,
      "step": 545
    },
    {
      "epoch": 0.14,
      "grad_norm": 30.06838607788086,
      "kl": 0.0,
      "learning_rate": 4.2855273488615547e-07,
      "logps/chosen": -226.75885009765625,
      "logps/rejected": -265.8707275390625,
      "loss": 0.3497,
      "rewards/chosen": 0.03218793869018555,
      "rewards/margins": 2.9179019927978516,
      "rewards/rejected": -2.885714054107666,
      "step": 546
    },
    {
      "epoch": 0.14,
      "grad_norm": 24.598175048828125,
      "kl": 0.0,
      "learning_rate": 4.2842187908924366e-07,
      "logps/chosen": -201.21563720703125,
      "logps/rejected": -197.22674560546875,
      "loss": 0.1921,
      "rewards/chosen": -0.34368598461151123,
      "rewards/margins": 2.0713300704956055,
      "rewards/rejected": -2.415015935897827,
      "step": 547
    },
    {
      "epoch": 0.14,
      "grad_norm": 35.25346374511719,
      "kl": 0.0,
      "learning_rate": 4.2829102329233186e-07,
      "logps/chosen": -301.93560791015625,
      "logps/rejected": -147.30148315429688,
      "loss": 0.4078,
      "rewards/chosen": -2.1355183124542236,
      "rewards/margins": -0.27744102478027344,
      "rewards/rejected": -1.8580772876739502,
      "step": 548
    },
    {
      "epoch": 0.14,
      "grad_norm": 33.94302749633789,
      "kl": 0.0,
      "learning_rate": 4.2816016749542006e-07,
      "logps/chosen": -248.69700622558594,
      "logps/rejected": -231.81719970703125,
      "loss": 0.3855,
      "rewards/chosen": 0.03941810131072998,
      "rewards/margins": 2.8565940856933594,
      "rewards/rejected": -2.81717586517334,
      "step": 549
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.44513702392578,
      "kl": 0.0,
      "learning_rate": 4.280293116985082e-07,
      "logps/chosen": -176.3740997314453,
      "logps/rejected": -261.5088195800781,
      "loss": 0.319,
      "rewards/chosen": 0.6193097829818726,
      "rewards/margins": 3.7904415130615234,
      "rewards/rejected": -3.1711316108703613,
      "step": 550
    },
    {
      "epoch": 0.14,
      "grad_norm": 32.98072052001953,
      "kl": 0.0,
      "learning_rate": 4.278984559015964e-07,
      "logps/chosen": -182.24502563476562,
      "logps/rejected": -238.89453125,
      "loss": 0.3299,
      "rewards/chosen": -0.6983445882797241,
      "rewards/margins": 2.0171093940734863,
      "rewards/rejected": -2.7154541015625,
      "step": 551
    },
    {
      "epoch": 0.14,
      "grad_norm": 26.78199005126953,
      "kl": 0.0,
      "learning_rate": 4.277676001046846e-07,
      "logps/chosen": -162.42233276367188,
      "logps/rejected": -246.67506408691406,
      "loss": 0.3525,
      "rewards/chosen": -1.2917745113372803,
      "rewards/margins": 2.2916183471679688,
      "rewards/rejected": -3.583392858505249,
      "step": 552
    },
    {
      "epoch": 0.14,
      "grad_norm": 29.860422134399414,
      "kl": 0.0,
      "learning_rate": 4.276367443077728e-07,
      "logps/chosen": -237.3893280029297,
      "logps/rejected": -250.49899291992188,
      "loss": 0.2725,
      "rewards/chosen": -0.23593808710575104,
      "rewards/margins": 2.402143716812134,
      "rewards/rejected": -2.6380817890167236,
      "step": 553
    },
    {
      "epoch": 0.14,
      "grad_norm": 33.161163330078125,
      "kl": 0.0,
      "learning_rate": 4.27505888510861e-07,
      "logps/chosen": -237.7818145751953,
      "logps/rejected": -227.72335815429688,
      "loss": 0.3539,
      "rewards/chosen": 0.2122727930545807,
      "rewards/margins": 2.9937832355499268,
      "rewards/rejected": -2.781510353088379,
      "step": 554
    },
    {
      "epoch": 0.15,
      "grad_norm": 38.98054122924805,
      "kl": 0.0,
      "learning_rate": 4.273750327139492e-07,
      "logps/chosen": -214.34078979492188,
      "logps/rejected": -263.860595703125,
      "loss": 0.3239,
      "rewards/chosen": -0.39826700091362,
      "rewards/margins": 2.020453691482544,
      "rewards/rejected": -2.4187207221984863,
      "step": 555
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.922691345214844,
      "kl": 0.0,
      "learning_rate": 4.2724417691703743e-07,
      "logps/chosen": -206.01144409179688,
      "logps/rejected": -166.6465606689453,
      "loss": 0.3363,
      "rewards/chosen": -0.8341464996337891,
      "rewards/margins": 1.285893201828003,
      "rewards/rejected": -2.120039701461792,
      "step": 556
    },
    {
      "epoch": 0.15,
      "grad_norm": 30.631797790527344,
      "kl": 0.0,
      "learning_rate": 4.271133211201256e-07,
      "logps/chosen": -158.25732421875,
      "logps/rejected": -213.3656768798828,
      "loss": 0.2741,
      "rewards/chosen": -0.12498453259468079,
      "rewards/margins": 2.9033560752868652,
      "rewards/rejected": -3.0283405780792236,
      "step": 557
    },
    {
      "epoch": 0.15,
      "grad_norm": 24.324567794799805,
      "kl": 0.0,
      "learning_rate": 4.269824653232138e-07,
      "logps/chosen": -217.47898864746094,
      "logps/rejected": -165.88308715820312,
      "loss": 0.3939,
      "rewards/chosen": -1.5370630025863647,
      "rewards/margins": 0.32974278926849365,
      "rewards/rejected": -1.8668057918548584,
      "step": 558
    },
    {
      "epoch": 0.15,
      "grad_norm": 34.045230865478516,
      "kl": 0.0,
      "learning_rate": 4.26851609526302e-07,
      "logps/chosen": -294.7262268066406,
      "logps/rejected": -205.035400390625,
      "loss": 0.4029,
      "rewards/chosen": -1.259482502937317,
      "rewards/margins": 0.6094952821731567,
      "rewards/rejected": -1.8689777851104736,
      "step": 559
    },
    {
      "epoch": 0.15,
      "grad_norm": 31.53570556640625,
      "kl": 0.0,
      "learning_rate": 4.267207537293902e-07,
      "logps/chosen": -179.20379638671875,
      "logps/rejected": -265.7896728515625,
      "loss": 0.4417,
      "rewards/chosen": -0.9781392812728882,
      "rewards/margins": 3.028449535369873,
      "rewards/rejected": -4.006588935852051,
      "step": 560
    },
    {
      "epoch": 0.15,
      "grad_norm": 28.38507652282715,
      "kl": 0.0,
      "learning_rate": 4.265898979324784e-07,
      "logps/chosen": -105.09141540527344,
      "logps/rejected": -330.2618103027344,
      "loss": 0.3743,
      "rewards/chosen": -0.9884614944458008,
      "rewards/margins": 1.17330002784729,
      "rewards/rejected": -2.161761522293091,
      "step": 561
    },
    {
      "epoch": 0.15,
      "grad_norm": 34.71090316772461,
      "kl": 0.0,
      "learning_rate": 4.264590421355666e-07,
      "logps/chosen": -200.91526794433594,
      "logps/rejected": -304.50482177734375,
      "loss": 0.2676,
      "rewards/chosen": 1.216893196105957,
      "rewards/margins": 4.012282371520996,
      "rewards/rejected": -2.795389413833618,
      "step": 562
    },
    {
      "epoch": 0.15,
      "grad_norm": 37.67556381225586,
      "kl": 0.0,
      "learning_rate": 4.263281863386548e-07,
      "logps/chosen": -233.8171844482422,
      "logps/rejected": -209.13409423828125,
      "loss": 0.4123,
      "rewards/chosen": -0.7608001232147217,
      "rewards/margins": 1.143924593925476,
      "rewards/rejected": -1.9047247171401978,
      "step": 563
    },
    {
      "epoch": 0.15,
      "grad_norm": 37.23823547363281,
      "kl": 0.0,
      "learning_rate": 4.26197330541743e-07,
      "logps/chosen": -214.50765991210938,
      "logps/rejected": -472.725830078125,
      "loss": 0.3872,
      "rewards/chosen": -0.8464738726615906,
      "rewards/margins": 4.16305685043335,
      "rewards/rejected": -5.009530544281006,
      "step": 564
    },
    {
      "epoch": 0.15,
      "grad_norm": 35.8170166015625,
      "kl": 0.0,
      "learning_rate": 4.2606647474483114e-07,
      "logps/chosen": -287.5885314941406,
      "logps/rejected": -199.86541748046875,
      "loss": 0.3366,
      "rewards/chosen": -0.5958128571510315,
      "rewards/margins": 1.9663183689117432,
      "rewards/rejected": -2.56213116645813,
      "step": 565
    },
    {
      "epoch": 0.15,
      "grad_norm": 40.15536880493164,
      "kl": 0.0,
      "learning_rate": 4.2593561894791934e-07,
      "logps/chosen": -253.20193481445312,
      "logps/rejected": -249.92214965820312,
      "loss": 0.3344,
      "rewards/chosen": -0.08916880190372467,
      "rewards/margins": 2.356729745864868,
      "rewards/rejected": -2.4458985328674316,
      "step": 566
    },
    {
      "epoch": 0.15,
      "grad_norm": 31.04861068725586,
      "kl": 0.0,
      "learning_rate": 4.2580476315100753e-07,
      "logps/chosen": -196.5449981689453,
      "logps/rejected": -212.40931701660156,
      "loss": 0.33,
      "rewards/chosen": -0.20059321820735931,
      "rewards/margins": 2.8560163974761963,
      "rewards/rejected": -3.056609630584717,
      "step": 567
    },
    {
      "epoch": 0.15,
      "grad_norm": 35.633323669433594,
      "kl": 0.0,
      "learning_rate": 4.2567390735409573e-07,
      "logps/chosen": -246.609619140625,
      "logps/rejected": -201.65283203125,
      "loss": 0.3916,
      "rewards/chosen": -0.4009683132171631,
      "rewards/margins": 2.062058448791504,
      "rewards/rejected": -2.463026762008667,
      "step": 568
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.654693603515625,
      "kl": 0.0,
      "learning_rate": 4.25543051557184e-07,
      "logps/chosen": -264.0102844238281,
      "logps/rejected": -169.67617797851562,
      "loss": 0.314,
      "rewards/chosen": -0.4244236350059509,
      "rewards/margins": 3.7280030250549316,
      "rewards/rejected": -4.152426719665527,
      "step": 569
    },
    {
      "epoch": 0.15,
      "grad_norm": 38.11521911621094,
      "kl": 0.0,
      "learning_rate": 4.254121957602722e-07,
      "logps/chosen": -198.6380157470703,
      "logps/rejected": -192.98634338378906,
      "loss": 0.3554,
      "rewards/chosen": -0.11442930996417999,
      "rewards/margins": 2.008514165878296,
      "rewards/rejected": -2.12294340133667,
      "step": 570
    },
    {
      "epoch": 0.15,
      "grad_norm": 35.716495513916016,
      "kl": 0.0,
      "learning_rate": 4.2528133996336037e-07,
      "logps/chosen": -156.13453674316406,
      "logps/rejected": -272.1989440917969,
      "loss": 0.275,
      "rewards/chosen": -0.06980940699577332,
      "rewards/margins": 2.9614274501800537,
      "rewards/rejected": -3.0312368869781494,
      "step": 571
    },
    {
      "epoch": 0.15,
      "grad_norm": 38.65645980834961,
      "kl": 0.0,
      "learning_rate": 4.2515048416644857e-07,
      "logps/chosen": -178.9793243408203,
      "logps/rejected": -289.0226135253906,
      "loss": 0.3303,
      "rewards/chosen": -0.03785140812397003,
      "rewards/margins": 3.243406295776367,
      "rewards/rejected": -3.2812576293945312,
      "step": 572
    },
    {
      "epoch": 0.15,
      "grad_norm": 36.98851776123047,
      "kl": 0.0,
      "learning_rate": 4.2501962836953676e-07,
      "logps/chosen": -240.0013885498047,
      "logps/rejected": -235.85000610351562,
      "loss": 0.3211,
      "rewards/chosen": 1.605520248413086,
      "rewards/margins": 4.609771251678467,
      "rewards/rejected": -3.004251003265381,
      "step": 573
    },
    {
      "epoch": 0.15,
      "grad_norm": 52.9211311340332,
      "kl": 0.0,
      "learning_rate": 4.2488877257262496e-07,
      "logps/chosen": -170.80899047851562,
      "logps/rejected": -318.06646728515625,
      "loss": 0.3078,
      "rewards/chosen": -0.9244940876960754,
      "rewards/margins": 1.4279897212982178,
      "rewards/rejected": -2.3524837493896484,
      "step": 574
    },
    {
      "epoch": 0.15,
      "grad_norm": 36.183860778808594,
      "kl": 0.0,
      "learning_rate": 4.2475791677571315e-07,
      "logps/chosen": -203.8833770751953,
      "logps/rejected": -260.57208251953125,
      "loss": 0.2326,
      "rewards/chosen": -0.2753835618495941,
      "rewards/margins": 3.029111623764038,
      "rewards/rejected": -3.304495096206665,
      "step": 575
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.7669792175293,
      "kl": 0.0,
      "learning_rate": 4.2462706097880135e-07,
      "logps/chosen": -213.69007873535156,
      "logps/rejected": -252.35267639160156,
      "loss": 0.3478,
      "rewards/chosen": 0.12792816758155823,
      "rewards/margins": 3.7786970138549805,
      "rewards/rejected": -3.650768756866455,
      "step": 576
    },
    {
      "epoch": 0.15,
      "grad_norm": 40.28141784667969,
      "kl": 0.0,
      "learning_rate": 4.2449620518188955e-07,
      "logps/chosen": -275.3423156738281,
      "logps/rejected": -183.16616821289062,
      "loss": 0.3975,
      "rewards/chosen": -0.9379222393035889,
      "rewards/margins": 1.8455045223236084,
      "rewards/rejected": -2.7834267616271973,
      "step": 577
    },
    {
      "epoch": 0.15,
      "grad_norm": 41.42319869995117,
      "kl": 0.0,
      "learning_rate": 4.2436534938497774e-07,
      "logps/chosen": -192.60484313964844,
      "logps/rejected": -239.24285888671875,
      "loss": 0.3576,
      "rewards/chosen": 0.22006237506866455,
      "rewards/margins": 1.8599191904067993,
      "rewards/rejected": -1.6398568153381348,
      "step": 578
    },
    {
      "epoch": 0.15,
      "grad_norm": 22.584609985351562,
      "kl": 0.0,
      "learning_rate": 4.2423449358806594e-07,
      "logps/chosen": -168.68499755859375,
      "logps/rejected": -194.3863983154297,
      "loss": 0.3093,
      "rewards/chosen": 1.0403271913528442,
      "rewards/margins": 3.6925063133239746,
      "rewards/rejected": -2.65217924118042,
      "step": 579
    },
    {
      "epoch": 0.15,
      "grad_norm": 28.20046043395996,
      "kl": 0.0,
      "learning_rate": 4.2410363779115413e-07,
      "logps/chosen": -246.39317321777344,
      "logps/rejected": -256.670654296875,
      "loss": 0.3676,
      "rewards/chosen": 0.45120349526405334,
      "rewards/margins": 2.9600284099578857,
      "rewards/rejected": -2.5088248252868652,
      "step": 580
    },
    {
      "epoch": 0.15,
      "grad_norm": 34.99135971069336,
      "kl": 0.0,
      "learning_rate": 4.239727819942423e-07,
      "logps/chosen": -195.71902465820312,
      "logps/rejected": -272.074951171875,
      "loss": 0.3224,
      "rewards/chosen": -0.10005563497543335,
      "rewards/margins": 3.6908347606658936,
      "rewards/rejected": -3.7908904552459717,
      "step": 581
    },
    {
      "epoch": 0.15,
      "grad_norm": 37.57238006591797,
      "kl": 0.0,
      "learning_rate": 4.2384192619733053e-07,
      "logps/chosen": -322.5052185058594,
      "logps/rejected": -237.87149047851562,
      "loss": 0.4824,
      "rewards/chosen": -0.9541627168655396,
      "rewards/margins": 1.8996022939682007,
      "rewards/rejected": -2.8537650108337402,
      "step": 582
    },
    {
      "epoch": 0.15,
      "grad_norm": 32.19499969482422,
      "kl": 0.0,
      "learning_rate": 4.237110704004187e-07,
      "logps/chosen": -165.03366088867188,
      "logps/rejected": -246.01821899414062,
      "loss": 0.38,
      "rewards/chosen": -0.257803738117218,
      "rewards/margins": 2.8370163440704346,
      "rewards/rejected": -3.094820022583008,
      "step": 583
    },
    {
      "epoch": 0.15,
      "grad_norm": 37.46137237548828,
      "kl": 0.0,
      "learning_rate": 4.235802146035069e-07,
      "logps/chosen": -261.37066650390625,
      "logps/rejected": -227.03553771972656,
      "loss": 0.2971,
      "rewards/chosen": -0.23255617916584015,
      "rewards/margins": 1.408286213874817,
      "rewards/rejected": -1.6408424377441406,
      "step": 584
    },
    {
      "epoch": 0.15,
      "grad_norm": 22.948627471923828,
      "kl": 0.0,
      "learning_rate": 4.234493588065951e-07,
      "logps/chosen": -147.1380615234375,
      "logps/rejected": -239.35177612304688,
      "loss": 0.3552,
      "rewards/chosen": -0.27524644136428833,
      "rewards/margins": 1.9764175415039062,
      "rewards/rejected": -2.25166392326355,
      "step": 585
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.70252990722656,
      "kl": 0.0,
      "learning_rate": 4.233185030096833e-07,
      "logps/chosen": -183.9422149658203,
      "logps/rejected": -268.509033203125,
      "loss": 0.1464,
      "rewards/chosen": -0.05040533095598221,
      "rewards/margins": 6.646665573120117,
      "rewards/rejected": -6.697071075439453,
      "step": 586
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.52576446533203,
      "kl": 0.0,
      "learning_rate": 4.231876472127715e-07,
      "logps/chosen": -228.71929931640625,
      "logps/rejected": -263.7425231933594,
      "loss": 0.3134,
      "rewards/chosen": -1.5203044414520264,
      "rewards/margins": 1.3902077674865723,
      "rewards/rejected": -2.9105122089385986,
      "step": 587
    },
    {
      "epoch": 0.15,
      "grad_norm": 34.12139129638672,
      "kl": 0.0,
      "learning_rate": 4.230567914158597e-07,
      "logps/chosen": -153.0979766845703,
      "logps/rejected": -236.69737243652344,
      "loss": 0.2226,
      "rewards/chosen": 0.7525811791419983,
      "rewards/margins": 3.2662181854248047,
      "rewards/rejected": -2.513637065887451,
      "step": 588
    },
    {
      "epoch": 0.15,
      "grad_norm": 32.09413528442383,
      "kl": 0.0,
      "learning_rate": 4.229259356189479e-07,
      "logps/chosen": -185.1908416748047,
      "logps/rejected": -255.58477783203125,
      "loss": 0.3693,
      "rewards/chosen": -0.6914921402931213,
      "rewards/margins": 1.7158796787261963,
      "rewards/rejected": -2.407371759414673,
      "step": 589
    },
    {
      "epoch": 0.15,
      "grad_norm": 36.44728469848633,
      "kl": 0.0,
      "learning_rate": 4.227950798220361e-07,
      "logps/chosen": -188.67189025878906,
      "logps/rejected": -242.02938842773438,
      "loss": 0.3205,
      "rewards/chosen": 0.6397703886032104,
      "rewards/margins": 5.1770548820495605,
      "rewards/rejected": -4.5372843742370605,
      "step": 590
    },
    {
      "epoch": 0.15,
      "grad_norm": 33.02339172363281,
      "kl": 0.0,
      "learning_rate": 4.226642240251243e-07,
      "logps/chosen": -218.4967803955078,
      "logps/rejected": -230.99813842773438,
      "loss": 0.3533,
      "rewards/chosen": -1.1348559856414795,
      "rewards/margins": 1.3938672542572021,
      "rewards/rejected": -2.5287232398986816,
      "step": 591
    },
    {
      "epoch": 0.15,
      "grad_norm": 38.133026123046875,
      "kl": 0.0,
      "learning_rate": 4.225333682282125e-07,
      "logps/chosen": -347.8610534667969,
      "logps/rejected": -249.60537719726562,
      "loss": 0.2811,
      "rewards/chosen": 1.11420476436615,
      "rewards/margins": 3.734504222869873,
      "rewards/rejected": -2.6202993392944336,
      "step": 592
    },
    {
      "epoch": 0.16,
      "grad_norm": 32.02317428588867,
      "kl": 0.0,
      "learning_rate": 4.224025124313007e-07,
      "logps/chosen": -224.7441864013672,
      "logps/rejected": -204.56478881835938,
      "loss": 0.3995,
      "rewards/chosen": -1.2623833417892456,
      "rewards/margins": 2.1233153343200684,
      "rewards/rejected": -3.3856987953186035,
      "step": 593
    },
    {
      "epoch": 0.16,
      "grad_norm": 37.762245178222656,
      "kl": 0.0,
      "learning_rate": 4.2227165663438893e-07,
      "logps/chosen": -230.5582275390625,
      "logps/rejected": -199.21585083007812,
      "loss": 0.3758,
      "rewards/chosen": -0.6932766437530518,
      "rewards/margins": 0.7945023775100708,
      "rewards/rejected": -1.4877790212631226,
      "step": 594
    },
    {
      "epoch": 0.16,
      "grad_norm": 35.36671829223633,
      "kl": 0.0,
      "learning_rate": 4.2214080083747713e-07,
      "logps/chosen": -280.342041015625,
      "logps/rejected": -314.884765625,
      "loss": 0.3282,
      "rewards/chosen": 0.6207032203674316,
      "rewards/margins": 3.5934977531433105,
      "rewards/rejected": -2.972794532775879,
      "step": 595
    },
    {
      "epoch": 0.16,
      "grad_norm": 33.32785415649414,
      "kl": 0.0,
      "learning_rate": 4.2200994504056527e-07,
      "logps/chosen": -183.12124633789062,
      "logps/rejected": -246.1369171142578,
      "loss": 0.3199,
      "rewards/chosen": -0.36766961216926575,
      "rewards/margins": 2.0127646923065186,
      "rewards/rejected": -2.380434274673462,
      "step": 596
    },
    {
      "epoch": 0.16,
      "grad_norm": 38.22690200805664,
      "kl": 0.0,
      "learning_rate": 4.2187908924365347e-07,
      "logps/chosen": -195.23191833496094,
      "logps/rejected": -224.2406768798828,
      "loss": 0.3949,
      "rewards/chosen": -0.9440802931785583,
      "rewards/margins": 0.899925172328949,
      "rewards/rejected": -1.8440054655075073,
      "step": 597
    },
    {
      "epoch": 0.16,
      "grad_norm": 34.1783332824707,
      "kl": 0.0,
      "learning_rate": 4.2174823344674166e-07,
      "logps/chosen": -123.10539245605469,
      "logps/rejected": -198.38302612304688,
      "loss": 0.2468,
      "rewards/chosen": 0.9094799757003784,
      "rewards/margins": 3.0081400871276855,
      "rewards/rejected": -2.0986599922180176,
      "step": 598
    },
    {
      "epoch": 0.16,
      "grad_norm": 40.53157043457031,
      "kl": 0.0,
      "learning_rate": 4.2161737764982986e-07,
      "logps/chosen": -218.34637451171875,
      "logps/rejected": -308.47027587890625,
      "loss": 0.3564,
      "rewards/chosen": 0.3253116011619568,
      "rewards/margins": 4.2715067863464355,
      "rewards/rejected": -3.946195363998413,
      "step": 599
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.91612434387207,
      "kl": 0.0,
      "learning_rate": 4.2148652185291806e-07,
      "logps/chosen": -251.1618194580078,
      "logps/rejected": -243.45339965820312,
      "loss": 0.3264,
      "rewards/chosen": -1.2941466569900513,
      "rewards/margins": 1.6014741659164429,
      "rewards/rejected": -2.895620822906494,
      "step": 600
    },
    {
      "epoch": 0.16,
      "grad_norm": 37.07390213012695,
      "kl": 0.0,
      "learning_rate": 4.2135566605600625e-07,
      "logps/chosen": -293.2223815917969,
      "logps/rejected": -259.8509826660156,
      "loss": 0.3254,
      "rewards/chosen": -0.37220340967178345,
      "rewards/margins": 4.210824966430664,
      "rewards/rejected": -4.583028316497803,
      "step": 601
    },
    {
      "epoch": 0.16,
      "grad_norm": 33.21635818481445,
      "kl": 0.0,
      "learning_rate": 4.2122481025909445e-07,
      "logps/chosen": -229.32579040527344,
      "logps/rejected": -207.85536193847656,
      "loss": 0.3338,
      "rewards/chosen": -0.4946441054344177,
      "rewards/margins": 1.616485357284546,
      "rewards/rejected": -2.1111295223236084,
      "step": 602
    },
    {
      "epoch": 0.16,
      "grad_norm": 35.14589309692383,
      "kl": 0.0,
      "learning_rate": 4.2109395446218264e-07,
      "logps/chosen": -218.96194458007812,
      "logps/rejected": -198.21871948242188,
      "loss": 0.3888,
      "rewards/chosen": -1.2031952142715454,
      "rewards/margins": 0.5610319375991821,
      "rewards/rejected": -1.7642271518707275,
      "step": 603
    },
    {
      "epoch": 0.16,
      "grad_norm": 32.1645393371582,
      "kl": 0.0,
      "learning_rate": 4.2096309866527084e-07,
      "logps/chosen": -224.2860870361328,
      "logps/rejected": -265.3037109375,
      "loss": 0.2591,
      "rewards/chosen": 0.464304119348526,
      "rewards/margins": 3.8499107360839844,
      "rewards/rejected": -3.385606527328491,
      "step": 604
    },
    {
      "epoch": 0.16,
      "grad_norm": 36.952308654785156,
      "kl": 0.0,
      "learning_rate": 4.2083224286835904e-07,
      "logps/chosen": -258.63189697265625,
      "logps/rejected": -289.3390808105469,
      "loss": 0.3015,
      "rewards/chosen": -1.1682672500610352,
      "rewards/margins": 2.1730809211730957,
      "rewards/rejected": -3.341348171234131,
      "step": 605
    },
    {
      "epoch": 0.16,
      "grad_norm": 27.418455123901367,
      "kl": 0.0,
      "learning_rate": 4.2070138707144723e-07,
      "logps/chosen": -214.0426025390625,
      "logps/rejected": -213.5802764892578,
      "loss": 0.3561,
      "rewards/chosen": -0.10803361982107162,
      "rewards/margins": 1.9988174438476562,
      "rewards/rejected": -2.106851100921631,
      "step": 606
    },
    {
      "epoch": 0.16,
      "grad_norm": 34.8771858215332,
      "kl": 0.0,
      "learning_rate": 4.205705312745355e-07,
      "logps/chosen": -207.3766326904297,
      "logps/rejected": -240.21054077148438,
      "loss": 0.3563,
      "rewards/chosen": -0.24712109565734863,
      "rewards/margins": 3.139512300491333,
      "rewards/rejected": -3.3866333961486816,
      "step": 607
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.268268585205078,
      "kl": 0.0,
      "learning_rate": 4.204396754776237e-07,
      "logps/chosen": -240.3983154296875,
      "logps/rejected": -267.9146728515625,
      "loss": 0.3182,
      "rewards/chosen": -0.6161116361618042,
      "rewards/margins": 3.3313660621643066,
      "rewards/rejected": -3.9474778175354004,
      "step": 608
    },
    {
      "epoch": 0.16,
      "grad_norm": 38.52226257324219,
      "kl": 0.0,
      "learning_rate": 4.203088196807119e-07,
      "logps/chosen": -225.6261749267578,
      "logps/rejected": -227.69583129882812,
      "loss": 0.2613,
      "rewards/chosen": -0.07037395238876343,
      "rewards/margins": 2.7204816341400146,
      "rewards/rejected": -2.790855646133423,
      "step": 609
    },
    {
      "epoch": 0.16,
      "grad_norm": 36.78439712524414,
      "kl": 0.0,
      "learning_rate": 4.2017796388380007e-07,
      "logps/chosen": -194.47763061523438,
      "logps/rejected": -309.6812438964844,
      "loss": 0.3235,
      "rewards/chosen": -1.9775137901306152,
      "rewards/margins": 8.399917602539062,
      "rewards/rejected": -10.377431869506836,
      "step": 610
    },
    {
      "epoch": 0.16,
      "grad_norm": 36.382999420166016,
      "kl": 0.0,
      "learning_rate": 4.2004710808688827e-07,
      "logps/chosen": -271.60491943359375,
      "logps/rejected": -241.863037109375,
      "loss": 0.392,
      "rewards/chosen": -0.5924327373504639,
      "rewards/margins": 2.0697991847991943,
      "rewards/rejected": -2.662231922149658,
      "step": 611
    },
    {
      "epoch": 0.16,
      "grad_norm": 34.29161071777344,
      "kl": 0.0,
      "learning_rate": 4.199162522899764e-07,
      "logps/chosen": -264.40716552734375,
      "logps/rejected": -227.8290252685547,
      "loss": 0.3088,
      "rewards/chosen": -0.9955476522445679,
      "rewards/margins": 1.4795089960098267,
      "rewards/rejected": -2.4750566482543945,
      "step": 612
    },
    {
      "epoch": 0.16,
      "grad_norm": 32.622154235839844,
      "kl": 0.0,
      "learning_rate": 4.197853964930646e-07,
      "logps/chosen": -312.36907958984375,
      "logps/rejected": -288.0682678222656,
      "loss": 0.2823,
      "rewards/chosen": -2.0785109996795654,
      "rewards/margins": 3.3038604259490967,
      "rewards/rejected": -5.382371425628662,
      "step": 613
    },
    {
      "epoch": 0.16,
      "grad_norm": 37.70346450805664,
      "kl": 0.0,
      "learning_rate": 4.196545406961528e-07,
      "logps/chosen": -221.8252716064453,
      "logps/rejected": -356.6756591796875,
      "loss": 0.4246,
      "rewards/chosen": -0.5658776164054871,
      "rewards/margins": 2.9037880897521973,
      "rewards/rejected": -3.469665765762329,
      "step": 614
    },
    {
      "epoch": 0.16,
      "grad_norm": 30.7971248626709,
      "kl": 0.0,
      "learning_rate": 4.19523684899241e-07,
      "logps/chosen": -177.7958221435547,
      "logps/rejected": -268.3973693847656,
      "loss": 0.274,
      "rewards/chosen": 0.7021116018295288,
      "rewards/margins": 3.8640646934509277,
      "rewards/rejected": -3.1619529724121094,
      "step": 615
    },
    {
      "epoch": 0.16,
      "grad_norm": 26.313661575317383,
      "kl": 0.0,
      "learning_rate": 4.193928291023292e-07,
      "logps/chosen": -137.11647033691406,
      "logps/rejected": -263.5867919921875,
      "loss": 0.2909,
      "rewards/chosen": -1.9836171865463257,
      "rewards/margins": 0.7680078744888306,
      "rewards/rejected": -2.7516250610351562,
      "step": 616
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.26655387878418,
      "kl": 0.0,
      "learning_rate": 4.192619733054174e-07,
      "logps/chosen": -233.62945556640625,
      "logps/rejected": -212.066162109375,
      "loss": 0.2317,
      "rewards/chosen": 2.0348241329193115,
      "rewards/margins": 4.331893444061279,
      "rewards/rejected": -2.2970693111419678,
      "step": 617
    },
    {
      "epoch": 0.16,
      "grad_norm": 32.78192138671875,
      "kl": 0.0,
      "learning_rate": 4.191311175085056e-07,
      "logps/chosen": -142.2562713623047,
      "logps/rejected": -229.1559600830078,
      "loss": 0.3904,
      "rewards/chosen": -0.5529773235321045,
      "rewards/margins": 2.3756163120269775,
      "rewards/rejected": -2.928593635559082,
      "step": 618
    },
    {
      "epoch": 0.16,
      "grad_norm": 37.07200241088867,
      "kl": 0.0,
      "learning_rate": 4.190002617115938e-07,
      "logps/chosen": -250.66610717773438,
      "logps/rejected": -307.2555236816406,
      "loss": 0.3443,
      "rewards/chosen": 0.45598897337913513,
      "rewards/margins": 3.6246819496154785,
      "rewards/rejected": -3.1686930656433105,
      "step": 619
    },
    {
      "epoch": 0.16,
      "grad_norm": 34.067134857177734,
      "kl": 0.0,
      "learning_rate": 4.1886940591468203e-07,
      "logps/chosen": -231.55824279785156,
      "logps/rejected": -225.37667846679688,
      "loss": 0.2838,
      "rewards/chosen": -1.5656440258026123,
      "rewards/margins": 1.4117012023925781,
      "rewards/rejected": -2.9773452281951904,
      "step": 620
    },
    {
      "epoch": 0.16,
      "grad_norm": 33.20025634765625,
      "kl": 0.0,
      "learning_rate": 4.1873855011777023e-07,
      "logps/chosen": -286.56365966796875,
      "logps/rejected": -232.8338623046875,
      "loss": 0.3906,
      "rewards/chosen": -1.1976393461227417,
      "rewards/margins": 1.7728246450424194,
      "rewards/rejected": -2.970463991165161,
      "step": 621
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.43844223022461,
      "kl": 0.0,
      "learning_rate": 4.186076943208584e-07,
      "logps/chosen": -161.1515350341797,
      "logps/rejected": -221.69117736816406,
      "loss": 0.232,
      "rewards/chosen": -1.0699456930160522,
      "rewards/margins": 1.7152012586593628,
      "rewards/rejected": -2.785146951675415,
      "step": 622
    },
    {
      "epoch": 0.16,
      "grad_norm": 35.48064041137695,
      "kl": 0.0,
      "learning_rate": 4.184768385239466e-07,
      "logps/chosen": -266.043212890625,
      "logps/rejected": -332.0105895996094,
      "loss": 0.3535,
      "rewards/chosen": -0.6172822117805481,
      "rewards/margins": 2.453559398651123,
      "rewards/rejected": -3.0708415508270264,
      "step": 623
    },
    {
      "epoch": 0.16,
      "grad_norm": 30.414508819580078,
      "kl": 0.0,
      "learning_rate": 4.183459827270348e-07,
      "logps/chosen": -195.5116729736328,
      "logps/rejected": -175.37014770507812,
      "loss": 0.2711,
      "rewards/chosen": -0.0021307978313416243,
      "rewards/margins": 2.2265679836273193,
      "rewards/rejected": -2.22869873046875,
      "step": 624
    },
    {
      "epoch": 0.16,
      "grad_norm": 40.96977996826172,
      "kl": 0.0,
      "learning_rate": 4.18215126930123e-07,
      "logps/chosen": -211.9053497314453,
      "logps/rejected": -200.890869140625,
      "loss": 0.3433,
      "rewards/chosen": -0.9147431254386902,
      "rewards/margins": 1.683835744857788,
      "rewards/rejected": -2.598578929901123,
      "step": 625
    },
    {
      "epoch": 0.16,
      "grad_norm": 32.06422424316406,
      "kl": 0.0,
      "learning_rate": 4.180842711332112e-07,
      "logps/chosen": -168.45120239257812,
      "logps/rejected": -262.6234130859375,
      "loss": 0.4266,
      "rewards/chosen": -1.8037545680999756,
      "rewards/margins": 0.47751498222351074,
      "rewards/rejected": -2.2812695503234863,
      "step": 626
    },
    {
      "epoch": 0.16,
      "grad_norm": 29.26646614074707,
      "kl": 0.0,
      "learning_rate": 4.1795341533629935e-07,
      "logps/chosen": -184.18740844726562,
      "logps/rejected": -222.1627655029297,
      "loss": 0.2588,
      "rewards/chosen": 0.5016359686851501,
      "rewards/margins": 3.441005229949951,
      "rewards/rejected": -2.9393692016601562,
      "step": 627
    },
    {
      "epoch": 0.16,
      "grad_norm": 31.68562126159668,
      "kl": 0.0,
      "learning_rate": 4.1782255953938755e-07,
      "logps/chosen": -177.28375244140625,
      "logps/rejected": -206.3604736328125,
      "loss": 0.442,
      "rewards/chosen": -0.5679258704185486,
      "rewards/margins": 2.414395809173584,
      "rewards/rejected": -2.9823217391967773,
      "step": 628
    },
    {
      "epoch": 0.16,
      "grad_norm": 35.57976150512695,
      "kl": 0.0,
      "learning_rate": 4.1769170374247574e-07,
      "logps/chosen": -192.6014404296875,
      "logps/rejected": -193.0386505126953,
      "loss": 0.2338,
      "rewards/chosen": -0.7253746390342712,
      "rewards/margins": 1.539360761642456,
      "rewards/rejected": -2.264735460281372,
      "step": 629
    },
    {
      "epoch": 0.16,
      "grad_norm": 26.9422607421875,
      "kl": 0.0,
      "learning_rate": 4.1756084794556394e-07,
      "logps/chosen": -192.63441467285156,
      "logps/rejected": -205.19033813476562,
      "loss": 0.3647,
      "rewards/chosen": -1.236472487449646,
      "rewards/margins": 1.0552409887313843,
      "rewards/rejected": -2.2917134761810303,
      "step": 630
    },
    {
      "epoch": 0.17,
      "grad_norm": 34.09742736816406,
      "kl": 0.0,
      "learning_rate": 4.1742999214865214e-07,
      "logps/chosen": -277.0921936035156,
      "logps/rejected": -268.65423583984375,
      "loss": 0.3759,
      "rewards/chosen": -1.4564865827560425,
      "rewards/margins": 1.5203653573989868,
      "rewards/rejected": -2.9768519401550293,
      "step": 631
    },
    {
      "epoch": 0.17,
      "grad_norm": 35.76420211791992,
      "kl": 0.0,
      "learning_rate": 4.1729913635174033e-07,
      "logps/chosen": -220.2989044189453,
      "logps/rejected": -355.736083984375,
      "loss": 0.235,
      "rewards/chosen": 0.18057142198085785,
      "rewards/margins": 4.921010494232178,
      "rewards/rejected": -4.740438938140869,
      "step": 632
    },
    {
      "epoch": 0.17,
      "grad_norm": 24.0476131439209,
      "kl": 0.0,
      "learning_rate": 4.171682805548286e-07,
      "logps/chosen": -152.52206420898438,
      "logps/rejected": -300.75396728515625,
      "loss": 0.2029,
      "rewards/chosen": -0.4941352307796478,
      "rewards/margins": 4.297951698303223,
      "rewards/rejected": -4.792087078094482,
      "step": 633
    },
    {
      "epoch": 0.17,
      "grad_norm": 32.8790397644043,
      "kl": 0.0,
      "learning_rate": 4.170374247579168e-07,
      "logps/chosen": -149.47360229492188,
      "logps/rejected": -233.9814910888672,
      "loss": 0.3435,
      "rewards/chosen": -0.6808183789253235,
      "rewards/margins": 3.682387590408325,
      "rewards/rejected": -4.363205909729004,
      "step": 634
    },
    {
      "epoch": 0.17,
      "grad_norm": 31.50946617126465,
      "kl": 0.0,
      "learning_rate": 4.1690656896100497e-07,
      "logps/chosen": -199.74887084960938,
      "logps/rejected": -254.82545471191406,
      "loss": 0.2709,
      "rewards/chosen": 0.24805386364459991,
      "rewards/margins": 4.510081768035889,
      "rewards/rejected": -4.262027740478516,
      "step": 635
    },
    {
      "epoch": 0.17,
      "grad_norm": 40.11531066894531,
      "kl": 0.0,
      "learning_rate": 4.1677571316409317e-07,
      "logps/chosen": -203.65505981445312,
      "logps/rejected": -272.3819580078125,
      "loss": 0.3712,
      "rewards/chosen": -0.36780452728271484,
      "rewards/margins": 2.217893123626709,
      "rewards/rejected": -2.585697650909424,
      "step": 636
    },
    {
      "epoch": 0.17,
      "grad_norm": 35.829952239990234,
      "kl": 0.0,
      "learning_rate": 4.1664485736718136e-07,
      "logps/chosen": -211.70443725585938,
      "logps/rejected": -145.94801330566406,
      "loss": 0.4441,
      "rewards/chosen": -0.9229274988174438,
      "rewards/margins": 0.6648210287094116,
      "rewards/rejected": -1.5877485275268555,
      "step": 637
    },
    {
      "epoch": 0.17,
      "grad_norm": 33.948787689208984,
      "kl": 0.0,
      "learning_rate": 4.1651400157026956e-07,
      "logps/chosen": -205.24441528320312,
      "logps/rejected": -201.89073181152344,
      "loss": 0.3173,
      "rewards/chosen": 0.8895024061203003,
      "rewards/margins": 3.2825989723205566,
      "rewards/rejected": -2.393096446990967,
      "step": 638
    },
    {
      "epoch": 0.17,
      "grad_norm": 31.641845703125,
      "kl": 0.0,
      "learning_rate": 4.1638314577335776e-07,
      "logps/chosen": -215.01095581054688,
      "logps/rejected": -254.07305908203125,
      "loss": 0.3493,
      "rewards/chosen": -0.3878760039806366,
      "rewards/margins": 2.593442916870117,
      "rewards/rejected": -2.981318950653076,
      "step": 639
    },
    {
      "epoch": 0.17,
      "grad_norm": 27.793249130249023,
      "kl": 0.0,
      "learning_rate": 4.1625228997644595e-07,
      "logps/chosen": -200.00442504882812,
      "logps/rejected": -264.08099365234375,
      "loss": 0.3061,
      "rewards/chosen": -0.18434447050094604,
      "rewards/margins": 3.4469001293182373,
      "rewards/rejected": -3.631244659423828,
      "step": 640
    },
    {
      "epoch": 0.17,
      "grad_norm": 36.78108596801758,
      "kl": 0.0,
      "learning_rate": 4.1612143417953415e-07,
      "logps/chosen": -289.6894226074219,
      "logps/rejected": -291.83001708984375,
      "loss": 0.2781,
      "rewards/chosen": -1.6234275102615356,
      "rewards/margins": 1.1950386762619019,
      "rewards/rejected": -2.8184661865234375,
      "step": 641
    },
    {
      "epoch": 0.17,
      "grad_norm": 34.74748229980469,
      "kl": 0.0,
      "learning_rate": 4.159905783826223e-07,
      "logps/chosen": -244.11862182617188,
      "logps/rejected": -220.2618865966797,
      "loss": 0.4239,
      "rewards/chosen": -1.8187682628631592,
      "rewards/margins": 0.42972373962402344,
      "rewards/rejected": -2.2484920024871826,
      "step": 642
    },
    {
      "epoch": 0.17,
      "grad_norm": 36.53879928588867,
      "kl": 0.0,
      "learning_rate": 4.158597225857105e-07,
      "logps/chosen": -297.8824768066406,
      "logps/rejected": -242.40858459472656,
      "loss": 0.3813,
      "rewards/chosen": -0.17054195702075958,
      "rewards/margins": 3.0910396575927734,
      "rewards/rejected": -3.2615816593170166,
      "step": 643
    },
    {
      "epoch": 0.17,
      "grad_norm": 35.24003219604492,
      "kl": 0.0,
      "learning_rate": 4.157288667887987e-07,
      "logps/chosen": -203.4264373779297,
      "logps/rejected": -251.9635009765625,
      "loss": 0.2558,
      "rewards/chosen": -0.09030131250619888,
      "rewards/margins": 2.920102119445801,
      "rewards/rejected": -3.0104033946990967,
      "step": 644
    },
    {
      "epoch": 0.17,
      "grad_norm": 39.00501251220703,
      "kl": 0.0,
      "learning_rate": 4.155980109918869e-07,
      "logps/chosen": -320.9664306640625,
      "logps/rejected": -198.15609741210938,
      "loss": 0.4067,
      "rewards/chosen": -1.1451497077941895,
      "rewards/margins": 1.429276704788208,
      "rewards/rejected": -2.5744264125823975,
      "step": 645
    },
    {
      "epoch": 0.17,
      "grad_norm": 34.85394287109375,
      "kl": 0.0,
      "learning_rate": 4.1546715519497513e-07,
      "logps/chosen": -257.5217590332031,
      "logps/rejected": -276.9884033203125,
      "loss": 0.2091,
      "rewards/chosen": -0.7587192058563232,
      "rewards/margins": 2.568091630935669,
      "rewards/rejected": -3.326810836791992,
      "step": 646
    },
    {
      "epoch": 0.17,
      "grad_norm": 34.492549896240234,
      "kl": 0.0,
      "learning_rate": 4.153362993980633e-07,
      "logps/chosen": -238.52377319335938,
      "logps/rejected": -237.36590576171875,
      "loss": 0.4542,
      "rewards/chosen": -0.41595375537872314,
      "rewards/margins": 1.1079202890396118,
      "rewards/rejected": -1.523874044418335,
      "step": 647
    },
    {
      "epoch": 0.17,
      "grad_norm": 27.814579010009766,
      "kl": 0.0,
      "learning_rate": 4.152054436011515e-07,
      "logps/chosen": -177.15321350097656,
      "logps/rejected": -198.52593994140625,
      "loss": 0.2217,
      "rewards/chosen": 0.3595758378505707,
      "rewards/margins": 3.194960117340088,
      "rewards/rejected": -2.8353843688964844,
      "step": 648
    },
    {
      "epoch": 0.17,
      "grad_norm": 28.996912002563477,
      "kl": 0.0,
      "learning_rate": 4.150745878042397e-07,
      "logps/chosen": -227.43548583984375,
      "logps/rejected": -220.3048095703125,
      "loss": 0.3507,
      "rewards/chosen": -1.0934927463531494,
      "rewards/margins": 1.8975601196289062,
      "rewards/rejected": -2.9910528659820557,
      "step": 649
    },
    {
      "epoch": 0.17,
      "grad_norm": 36.756919860839844,
      "kl": 0.0,
      "learning_rate": 4.149437320073279e-07,
      "logps/chosen": -231.71620178222656,
      "logps/rejected": -262.20648193359375,
      "loss": 0.3661,
      "rewards/chosen": -0.6306547522544861,
      "rewards/margins": 2.2297322750091553,
      "rewards/rejected": -2.860387086868286,
      "step": 650
    },
    {
      "epoch": 0.17,
      "grad_norm": 36.70586013793945,
      "kl": 0.0,
      "learning_rate": 4.148128762104161e-07,
      "logps/chosen": -317.0033264160156,
      "logps/rejected": -279.13165283203125,
      "loss": 0.2521,
      "rewards/chosen": -0.37969204783439636,
      "rewards/margins": 3.3592135906219482,
      "rewards/rejected": -3.738905668258667,
      "step": 651
    },
    {
      "epoch": 0.17,
      "grad_norm": 46.720123291015625,
      "kl": 0.0,
      "learning_rate": 4.146820204135043e-07,
      "logps/chosen": -292.99957275390625,
      "logps/rejected": -220.20150756835938,
      "loss": 0.3879,
      "rewards/chosen": -0.10919928550720215,
      "rewards/margins": 2.287581205368042,
      "rewards/rejected": -2.396780490875244,
      "step": 652
    },
    {
      "epoch": 0.17,
      "grad_norm": 33.120811462402344,
      "kl": 0.0,
      "learning_rate": 4.145511646165925e-07,
      "logps/chosen": -309.48529052734375,
      "logps/rejected": -180.5233154296875,
      "loss": 0.3888,
      "rewards/chosen": -1.0322825908660889,
      "rewards/margins": 1.6642029285430908,
      "rewards/rejected": -2.6964855194091797,
      "step": 653
    },
    {
      "epoch": 0.17,
      "grad_norm": 30.720664978027344,
      "kl": 0.0,
      "learning_rate": 4.144203088196807e-07,
      "logps/chosen": -141.5933380126953,
      "logps/rejected": -177.95643615722656,
      "loss": 0.322,
      "rewards/chosen": -0.5805988311767578,
      "rewards/margins": 1.3864021301269531,
      "rewards/rejected": -1.967000961303711,
      "step": 654
    },
    {
      "epoch": 0.17,
      "grad_norm": 28.06136703491211,
      "kl": 0.0,
      "learning_rate": 4.142894530227689e-07,
      "logps/chosen": -245.1380615234375,
      "logps/rejected": -213.3980712890625,
      "loss": 0.2355,
      "rewards/chosen": -0.2578246295452118,
      "rewards/margins": 3.7815134525299072,
      "rewards/rejected": -4.039338111877441,
      "step": 655
    },
    {
      "epoch": 0.17,
      "grad_norm": 38.5828971862793,
      "kl": 0.0,
      "learning_rate": 4.141585972258571e-07,
      "logps/chosen": -239.96658325195312,
      "logps/rejected": -230.351318359375,
      "loss": 0.3052,
      "rewards/chosen": -0.9050352573394775,
      "rewards/margins": 1.6403262615203857,
      "rewards/rejected": -2.5453615188598633,
      "step": 656
    },
    {
      "epoch": 0.17,
      "grad_norm": 27.131685256958008,
      "kl": 0.0,
      "learning_rate": 4.140277414289453e-07,
      "logps/chosen": -194.69229125976562,
      "logps/rejected": -242.11758422851562,
      "loss": 0.4452,
      "rewards/chosen": -2.399684429168701,
      "rewards/margins": 1.4530136585235596,
      "rewards/rejected": -3.8526980876922607,
      "step": 657
    },
    {
      "epoch": 0.17,
      "grad_norm": 38.746849060058594,
      "kl": 0.0,
      "learning_rate": 4.1389688563203343e-07,
      "logps/chosen": -147.9233856201172,
      "logps/rejected": -222.4149627685547,
      "loss": 0.3078,
      "rewards/chosen": 1.145562767982483,
      "rewards/margins": 3.6172871589660645,
      "rewards/rejected": -2.471724271774292,
      "step": 658
    },
    {
      "epoch": 0.17,
      "grad_norm": 30.03120994567871,
      "kl": 0.0,
      "learning_rate": 4.137660298351217e-07,
      "logps/chosen": -189.79733276367188,
      "logps/rejected": -279.6260070800781,
      "loss": 0.2722,
      "rewards/chosen": -0.8100014925003052,
      "rewards/margins": 3.497687816619873,
      "rewards/rejected": -4.307689189910889,
      "step": 659
    },
    {
      "epoch": 0.17,
      "grad_norm": 35.37248229980469,
      "kl": 0.0,
      "learning_rate": 4.136351740382099e-07,
      "logps/chosen": -191.43023681640625,
      "logps/rejected": -192.27333068847656,
      "loss": 0.3035,
      "rewards/chosen": -0.273165225982666,
      "rewards/margins": 2.452721118927002,
      "rewards/rejected": -2.725886344909668,
      "step": 660
    },
    {
      "epoch": 0.17,
      "grad_norm": 34.3448600769043,
      "kl": 0.0,
      "learning_rate": 4.1350431824129807e-07,
      "logps/chosen": -280.6360778808594,
      "logps/rejected": -250.7725372314453,
      "loss": 0.3402,
      "rewards/chosen": -1.419081449508667,
      "rewards/margins": 1.5376768112182617,
      "rewards/rejected": -2.9567582607269287,
      "step": 661
    },
    {
      "epoch": 0.17,
      "grad_norm": 29.6294002532959,
      "kl": 0.0,
      "learning_rate": 4.1337346244438627e-07,
      "logps/chosen": -216.47866821289062,
      "logps/rejected": -193.3513946533203,
      "loss": 0.2504,
      "rewards/chosen": 1.5271568298339844,
      "rewards/margins": 4.2868781089782715,
      "rewards/rejected": -2.759721279144287,
      "step": 662
    },
    {
      "epoch": 0.17,
      "grad_norm": 31.874507904052734,
      "kl": 0.0,
      "learning_rate": 4.1324260664747446e-07,
      "logps/chosen": -242.6487274169922,
      "logps/rejected": -256.5516662597656,
      "loss": 0.308,
      "rewards/chosen": -0.4256035387516022,
      "rewards/margins": 2.785071611404419,
      "rewards/rejected": -3.2106752395629883,
      "step": 663
    },
    {
      "epoch": 0.17,
      "grad_norm": 33.38178634643555,
      "kl": 0.0,
      "learning_rate": 4.1311175085056266e-07,
      "logps/chosen": -278.8080139160156,
      "logps/rejected": -300.10455322265625,
      "loss": 0.2865,
      "rewards/chosen": -0.9544926881790161,
      "rewards/margins": 2.5183773040771484,
      "rewards/rejected": -3.472870111465454,
      "step": 664
    },
    {
      "epoch": 0.17,
      "grad_norm": 29.179576873779297,
      "kl": 0.0,
      "learning_rate": 4.1298089505365086e-07,
      "logps/chosen": -169.78851318359375,
      "logps/rejected": -210.986572265625,
      "loss": 0.2921,
      "rewards/chosen": -0.48213818669319153,
      "rewards/margins": 2.5531556606292725,
      "rewards/rejected": -3.0352938175201416,
      "step": 665
    },
    {
      "epoch": 0.17,
      "grad_norm": 29.703609466552734,
      "kl": 0.0,
      "learning_rate": 4.1285003925673905e-07,
      "logps/chosen": -247.66671752929688,
      "logps/rejected": -162.64111328125,
      "loss": 0.3197,
      "rewards/chosen": -0.4724392890930176,
      "rewards/margins": 1.8395106792449951,
      "rewards/rejected": -2.3119499683380127,
      "step": 666
    },
    {
      "epoch": 0.17,
      "grad_norm": 28.3862247467041,
      "kl": 0.0,
      "learning_rate": 4.1271918345982725e-07,
      "logps/chosen": -225.37664794921875,
      "logps/rejected": -327.6197204589844,
      "loss": 0.3564,
      "rewards/chosen": -1.468646764755249,
      "rewards/margins": 1.1545610427856445,
      "rewards/rejected": -2.6232078075408936,
      "step": 667
    },
    {
      "epoch": 0.17,
      "grad_norm": 38.491947174072266,
      "kl": 0.0,
      "learning_rate": 4.1258832766291544e-07,
      "logps/chosen": -182.65261840820312,
      "logps/rejected": -207.8057861328125,
      "loss": 0.4306,
      "rewards/chosen": -0.28253644704818726,
      "rewards/margins": 1.8097026348114014,
      "rewards/rejected": -2.0922391414642334,
      "step": 668
    },
    {
      "epoch": 0.18,
      "grad_norm": 40.138816833496094,
      "kl": 0.0,
      "learning_rate": 4.1245747186600364e-07,
      "logps/chosen": -298.6606140136719,
      "logps/rejected": -283.8145751953125,
      "loss": 0.2849,
      "rewards/chosen": -0.5111945271492004,
      "rewards/margins": 2.231806993484497,
      "rewards/rejected": -2.7430014610290527,
      "step": 669
    },
    {
      "epoch": 0.18,
      "grad_norm": 43.718994140625,
      "kl": 0.0,
      "learning_rate": 4.1232661606909184e-07,
      "logps/chosen": -194.55783081054688,
      "logps/rejected": -236.71018981933594,
      "loss": 0.3596,
      "rewards/chosen": -0.527443528175354,
      "rewards/margins": 2.3515138626098633,
      "rewards/rejected": -2.8789572715759277,
      "step": 670
    },
    {
      "epoch": 0.18,
      "grad_norm": 35.09980392456055,
      "kl": 0.0,
      "learning_rate": 4.121957602721801e-07,
      "logps/chosen": -190.4247589111328,
      "logps/rejected": -208.2495574951172,
      "loss": 0.2523,
      "rewards/chosen": 0.05250234156847,
      "rewards/margins": 1.295493483543396,
      "rewards/rejected": -1.2429910898208618,
      "step": 671
    },
    {
      "epoch": 0.18,
      "grad_norm": 27.459993362426758,
      "kl": 0.0,
      "learning_rate": 4.120649044752683e-07,
      "logps/chosen": -271.6382751464844,
      "logps/rejected": -225.0758514404297,
      "loss": 0.2745,
      "rewards/chosen": -1.4889888763427734,
      "rewards/margins": 1.235353708267212,
      "rewards/rejected": -2.7243425846099854,
      "step": 672
    },
    {
      "epoch": 0.18,
      "grad_norm": 28.141847610473633,
      "kl": 0.0,
      "learning_rate": 4.119340486783564e-07,
      "logps/chosen": -279.026123046875,
      "logps/rejected": -240.21389770507812,
      "loss": 0.2851,
      "rewards/chosen": 0.7028965950012207,
      "rewards/margins": 3.4597043991088867,
      "rewards/rejected": -2.756807804107666,
      "step": 673
    },
    {
      "epoch": 0.18,
      "grad_norm": 28.72968101501465,
      "kl": 0.0,
      "learning_rate": 4.118031928814446e-07,
      "logps/chosen": -163.95077514648438,
      "logps/rejected": -258.7970275878906,
      "loss": 0.4225,
      "rewards/chosen": -0.9373546242713928,
      "rewards/margins": 1.647026538848877,
      "rewards/rejected": -2.584381103515625,
      "step": 674
    },
    {
      "epoch": 0.18,
      "grad_norm": 38.473487854003906,
      "kl": 0.0,
      "learning_rate": 4.116723370845328e-07,
      "logps/chosen": -280.1283264160156,
      "logps/rejected": -214.69662475585938,
      "loss": 0.2826,
      "rewards/chosen": 0.9605988264083862,
      "rewards/margins": 2.3107542991638184,
      "rewards/rejected": -1.3501554727554321,
      "step": 675
    },
    {
      "epoch": 0.18,
      "grad_norm": 35.380428314208984,
      "kl": 0.0,
      "learning_rate": 4.11541481287621e-07,
      "logps/chosen": -258.97076416015625,
      "logps/rejected": -240.42025756835938,
      "loss": 0.3657,
      "rewards/chosen": 0.23240822553634644,
      "rewards/margins": 2.945939540863037,
      "rewards/rejected": -2.713531255722046,
      "step": 676
    },
    {
      "epoch": 0.18,
      "grad_norm": 32.557743072509766,
      "kl": 0.0,
      "learning_rate": 4.114106254907092e-07,
      "logps/chosen": -272.61553955078125,
      "logps/rejected": -235.87657165527344,
      "loss": 0.406,
      "rewards/chosen": -1.2601265907287598,
      "rewards/margins": 1.2022619247436523,
      "rewards/rejected": -2.462388515472412,
      "step": 677
    },
    {
      "epoch": 0.18,
      "grad_norm": 39.19186019897461,
      "kl": 0.0,
      "learning_rate": 4.112797696937974e-07,
      "logps/chosen": -255.88790893554688,
      "logps/rejected": -164.4857940673828,
      "loss": 0.401,
      "rewards/chosen": -0.7437317967414856,
      "rewards/margins": 0.8636577725410461,
      "rewards/rejected": -1.6073895692825317,
      "step": 678
    },
    {
      "epoch": 0.18,
      "grad_norm": 39.593902587890625,
      "kl": 0.0,
      "learning_rate": 4.111489138968856e-07,
      "logps/chosen": -185.75872802734375,
      "logps/rejected": -262.96502685546875,
      "loss": 0.2612,
      "rewards/chosen": 1.2888895273208618,
      "rewards/margins": 3.809917449951172,
      "rewards/rejected": -2.5210278034210205,
      "step": 679
    },
    {
      "epoch": 0.18,
      "grad_norm": 34.13742446899414,
      "kl": 0.0,
      "learning_rate": 4.110180580999738e-07,
      "logps/chosen": -247.64129638671875,
      "logps/rejected": -184.8863067626953,
      "loss": 0.3106,
      "rewards/chosen": 0.7847617864608765,
      "rewards/margins": 3.7181215286254883,
      "rewards/rejected": -2.9333596229553223,
      "step": 680
    },
    {
      "epoch": 0.18,
      "grad_norm": 31.16827392578125,
      "kl": 0.0,
      "learning_rate": 4.10887202303062e-07,
      "logps/chosen": -312.4378967285156,
      "logps/rejected": -152.6389923095703,
      "loss": 0.3165,
      "rewards/chosen": 0.8620835542678833,
      "rewards/margins": 2.3805339336395264,
      "rewards/rejected": -1.518450379371643,
      "step": 681
    },
    {
      "epoch": 0.18,
      "grad_norm": 34.24968338012695,
      "kl": 0.0,
      "learning_rate": 4.107563465061502e-07,
      "logps/chosen": -227.52528381347656,
      "logps/rejected": -248.9477996826172,
      "loss": 0.3509,
      "rewards/chosen": -0.8197565078735352,
      "rewards/margins": 0.4830266237258911,
      "rewards/rejected": -1.3027831315994263,
      "step": 682
    },
    {
      "epoch": 0.18,
      "grad_norm": 33.73033142089844,
      "kl": 0.0,
      "learning_rate": 4.106254907092384e-07,
      "logps/chosen": -190.738037109375,
      "logps/rejected": -196.4342498779297,
      "loss": 0.2827,
      "rewards/chosen": -0.5406025052070618,
      "rewards/margins": 2.1505613327026367,
      "rewards/rejected": -2.6911637783050537,
      "step": 683
    },
    {
      "epoch": 0.18,
      "grad_norm": 27.662273406982422,
      "kl": 0.0,
      "learning_rate": 4.1049463491232663e-07,
      "logps/chosen": -239.35816955566406,
      "logps/rejected": -242.4320068359375,
      "loss": 0.3093,
      "rewards/chosen": -1.608490228652954,
      "rewards/margins": 1.3844609260559082,
      "rewards/rejected": -2.9929511547088623,
      "step": 684
    },
    {
      "epoch": 0.18,
      "grad_norm": 42.05573654174805,
      "kl": 0.0,
      "learning_rate": 4.1036377911541483e-07,
      "logps/chosen": -277.0669860839844,
      "logps/rejected": -207.38720703125,
      "loss": 0.4063,
      "rewards/chosen": -0.9567424654960632,
      "rewards/margins": 0.5377658009529114,
      "rewards/rejected": -1.4945082664489746,
      "step": 685
    },
    {
      "epoch": 0.18,
      "grad_norm": 38.324893951416016,
      "kl": 0.0,
      "learning_rate": 4.10232923318503e-07,
      "logps/chosen": -287.05841064453125,
      "logps/rejected": -240.85494995117188,
      "loss": 0.2541,
      "rewards/chosen": 0.4614275097846985,
      "rewards/margins": 3.325218915939331,
      "rewards/rejected": -2.8637914657592773,
      "step": 686
    },
    {
      "epoch": 0.18,
      "grad_norm": 43.04709243774414,
      "kl": 0.0,
      "learning_rate": 4.101020675215912e-07,
      "logps/chosen": -195.44039916992188,
      "logps/rejected": -273.97906494140625,
      "loss": 0.2414,
      "rewards/chosen": 0.7585437297821045,
      "rewards/margins": 3.664416551589966,
      "rewards/rejected": -2.9058728218078613,
      "step": 687
    },
    {
      "epoch": 0.18,
      "grad_norm": 34.49761199951172,
      "kl": 0.0,
      "learning_rate": 4.099712117246794e-07,
      "logps/chosen": -261.4673767089844,
      "logps/rejected": -224.2903594970703,
      "loss": 0.2617,
      "rewards/chosen": 0.09347956627607346,
      "rewards/margins": 2.2225935459136963,
      "rewards/rejected": -2.1291139125823975,
      "step": 688
    },
    {
      "epoch": 0.18,
      "grad_norm": 32.107749938964844,
      "kl": 0.0,
      "learning_rate": 4.0984035592776756e-07,
      "logps/chosen": -219.35438537597656,
      "logps/rejected": -246.00823974609375,
      "loss": 0.2946,
      "rewards/chosen": -0.08469846844673157,
      "rewards/margins": 3.9561054706573486,
      "rewards/rejected": -4.040803909301758,
      "step": 689
    },
    {
      "epoch": 0.18,
      "grad_norm": 27.49036979675293,
      "kl": 0.0,
      "learning_rate": 4.0970950013085576e-07,
      "logps/chosen": -270.11932373046875,
      "logps/rejected": -237.36947631835938,
      "loss": 0.3557,
      "rewards/chosen": -0.5459706783294678,
      "rewards/margins": 2.900925397872925,
      "rewards/rejected": -3.4468960762023926,
      "step": 690
    },
    {
      "epoch": 0.18,
      "grad_norm": 30.500295639038086,
      "kl": 0.0,
      "learning_rate": 4.0957864433394395e-07,
      "logps/chosen": -253.16883850097656,
      "logps/rejected": -164.48680114746094,
      "loss": 0.3479,
      "rewards/chosen": -0.3456551730632782,
      "rewards/margins": 2.1069955825805664,
      "rewards/rejected": -2.452650785446167,
      "step": 691
    },
    {
      "epoch": 0.18,
      "grad_norm": 43.131839752197266,
      "kl": 0.0,
      "learning_rate": 4.0944778853703215e-07,
      "logps/chosen": -270.4461975097656,
      "logps/rejected": -240.70751953125,
      "loss": 0.337,
      "rewards/chosen": 0.804510235786438,
      "rewards/margins": 2.1393561363220215,
      "rewards/rejected": -1.334845781326294,
      "step": 692
    },
    {
      "epoch": 0.18,
      "grad_norm": 32.07833480834961,
      "kl": 0.0,
      "learning_rate": 4.0931693274012035e-07,
      "logps/chosen": -249.09523010253906,
      "logps/rejected": -230.19061279296875,
      "loss": 0.406,
      "rewards/chosen": -1.9368900060653687,
      "rewards/margins": 1.1023679971694946,
      "rewards/rejected": -3.0392580032348633,
      "step": 693
    },
    {
      "epoch": 0.18,
      "grad_norm": 33.6291618347168,
      "kl": 0.0,
      "learning_rate": 4.0918607694320854e-07,
      "logps/chosen": -245.869140625,
      "logps/rejected": -304.74322509765625,
      "loss": 0.3806,
      "rewards/chosen": -3.02115797996521,
      "rewards/margins": 0.01259756088256836,
      "rewards/rejected": -3.0337555408477783,
      "step": 694
    },
    {
      "epoch": 0.18,
      "grad_norm": 33.45709991455078,
      "kl": 0.0,
      "learning_rate": 4.0905522114629674e-07,
      "logps/chosen": -254.72018432617188,
      "logps/rejected": -244.65679931640625,
      "loss": 0.3315,
      "rewards/chosen": -1.7159273624420166,
      "rewards/margins": 0.9991257190704346,
      "rewards/rejected": -2.715053081512451,
      "step": 695
    },
    {
      "epoch": 0.18,
      "grad_norm": 40.40908432006836,
      "kl": 0.0,
      "learning_rate": 4.0892436534938493e-07,
      "logps/chosen": -275.8647155761719,
      "logps/rejected": -266.954345703125,
      "loss": 0.4751,
      "rewards/chosen": -1.3798705339431763,
      "rewards/margins": 0.4689023494720459,
      "rewards/rejected": -1.8487728834152222,
      "step": 696
    },
    {
      "epoch": 0.18,
      "grad_norm": 30.23027992248535,
      "kl": 0.0,
      "learning_rate": 4.087935095524732e-07,
      "logps/chosen": -178.09228515625,
      "logps/rejected": -278.37213134765625,
      "loss": 0.3044,
      "rewards/chosen": -0.40018245577812195,
      "rewards/margins": 4.596784591674805,
      "rewards/rejected": -4.99696683883667,
      "step": 697
    },
    {
      "epoch": 0.18,
      "grad_norm": 34.83966064453125,
      "kl": 0.0,
      "learning_rate": 4.086626537555614e-07,
      "logps/chosen": -211.6435546875,
      "logps/rejected": -201.00558471679688,
      "loss": 0.1905,
      "rewards/chosen": 0.7989832758903503,
      "rewards/margins": 4.710301876068115,
      "rewards/rejected": -3.91131854057312,
      "step": 698
    },
    {
      "epoch": 0.18,
      "grad_norm": 38.152313232421875,
      "kl": 0.0,
      "learning_rate": 4.085317979586496e-07,
      "logps/chosen": -196.98355102539062,
      "logps/rejected": -302.0910339355469,
      "loss": 0.279,
      "rewards/chosen": -1.0276482105255127,
      "rewards/margins": 2.165983200073242,
      "rewards/rejected": -3.193631410598755,
      "step": 699
    },
    {
      "epoch": 0.18,
      "grad_norm": 29.363718032836914,
      "kl": 0.0,
      "learning_rate": 4.0840094216173777e-07,
      "logps/chosen": -239.1320343017578,
      "logps/rejected": -264.94970703125,
      "loss": 0.2588,
      "rewards/chosen": 0.14816318452358246,
      "rewards/margins": 4.279280662536621,
      "rewards/rejected": -4.131117343902588,
      "step": 700
    },
    {
      "epoch": 0.18,
      "grad_norm": 44.35689926147461,
      "kl": 0.0,
      "learning_rate": 4.0827008636482597e-07,
      "logps/chosen": -221.08709716796875,
      "logps/rejected": -205.1977996826172,
      "loss": 0.3414,
      "rewards/chosen": 0.24511413276195526,
      "rewards/margins": 3.009829521179199,
      "rewards/rejected": -2.7647154331207275,
      "step": 701
    },
    {
      "epoch": 0.18,
      "grad_norm": 38.40766906738281,
      "kl": 0.0,
      "learning_rate": 4.0813923056791416e-07,
      "logps/chosen": -210.72152709960938,
      "logps/rejected": -245.8160858154297,
      "loss": 0.3501,
      "rewards/chosen": 0.41612035036087036,
      "rewards/margins": 3.429255485534668,
      "rewards/rejected": -3.0131351947784424,
      "step": 702
    },
    {
      "epoch": 0.18,
      "grad_norm": 26.7491512298584,
      "kl": 0.0,
      "learning_rate": 4.0800837477100236e-07,
      "logps/chosen": -259.2345275878906,
      "logps/rejected": -299.02435302734375,
      "loss": 0.4981,
      "rewards/chosen": -2.508211374282837,
      "rewards/margins": -1.5818876028060913,
      "rewards/rejected": -0.9263237714767456,
      "step": 703
    },
    {
      "epoch": 0.18,
      "grad_norm": 37.0341911315918,
      "kl": 0.0,
      "learning_rate": 4.078775189740905e-07,
      "logps/chosen": -234.02003479003906,
      "logps/rejected": -213.5331573486328,
      "loss": 0.2716,
      "rewards/chosen": 0.10059448331594467,
      "rewards/margins": 1.6921581029891968,
      "rewards/rejected": -1.5915635824203491,
      "step": 704
    },
    {
      "epoch": 0.18,
      "grad_norm": 35.68254470825195,
      "kl": 0.0,
      "learning_rate": 4.077466631771787e-07,
      "logps/chosen": -214.12158203125,
      "logps/rejected": -266.066162109375,
      "loss": 0.3309,
      "rewards/chosen": -0.1318395882844925,
      "rewards/margins": 2.9635441303253174,
      "rewards/rejected": -3.095383644104004,
      "step": 705
    },
    {
      "epoch": 0.18,
      "grad_norm": 31.081863403320312,
      "kl": 0.0,
      "learning_rate": 4.076158073802669e-07,
      "logps/chosen": -182.43540954589844,
      "logps/rejected": -272.5527038574219,
      "loss": 0.3125,
      "rewards/chosen": 0.043891992419958115,
      "rewards/margins": 2.875239372253418,
      "rewards/rejected": -2.8313474655151367,
      "step": 706
    },
    {
      "epoch": 0.19,
      "grad_norm": 38.6630973815918,
      "kl": 0.0,
      "learning_rate": 4.074849515833551e-07,
      "logps/chosen": -417.9666748046875,
      "logps/rejected": -233.2468719482422,
      "loss": 0.374,
      "rewards/chosen": -0.5666962265968323,
      "rewards/margins": 2.2654359340667725,
      "rewards/rejected": -2.83213210105896,
      "step": 707
    },
    {
      "epoch": 0.19,
      "grad_norm": 27.11510467529297,
      "kl": 0.0,
      "learning_rate": 4.073540957864433e-07,
      "logps/chosen": -112.95711517333984,
      "logps/rejected": -215.8408966064453,
      "loss": 0.2604,
      "rewards/chosen": 1.1336884498596191,
      "rewards/margins": 4.025924205780029,
      "rewards/rejected": -2.89223575592041,
      "step": 708
    },
    {
      "epoch": 0.19,
      "grad_norm": 28.093170166015625,
      "kl": 0.0,
      "learning_rate": 4.072232399895315e-07,
      "logps/chosen": -271.0452575683594,
      "logps/rejected": -252.80313110351562,
      "loss": 0.3726,
      "rewards/chosen": 0.4887799322605133,
      "rewards/margins": 4.524040222167969,
      "rewards/rejected": -4.035260200500488,
      "step": 709
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.50494384765625,
      "kl": 0.0,
      "learning_rate": 4.0709238419261973e-07,
      "logps/chosen": -262.0550537109375,
      "logps/rejected": -349.9918518066406,
      "loss": 0.2159,
      "rewards/chosen": -0.8234438896179199,
      "rewards/margins": 5.1398701667785645,
      "rewards/rejected": -5.963314056396484,
      "step": 710
    },
    {
      "epoch": 0.19,
      "grad_norm": 30.006282806396484,
      "kl": 0.0,
      "learning_rate": 4.0696152839570793e-07,
      "logps/chosen": -198.72296142578125,
      "logps/rejected": -301.7229919433594,
      "loss": 0.2859,
      "rewards/chosen": 1.8006759881973267,
      "rewards/margins": 5.980476379394531,
      "rewards/rejected": -4.179800510406494,
      "step": 711
    },
    {
      "epoch": 0.19,
      "grad_norm": 32.877410888671875,
      "kl": 0.0,
      "learning_rate": 4.068306725987961e-07,
      "logps/chosen": -285.0302429199219,
      "logps/rejected": -271.06414794921875,
      "loss": 0.2594,
      "rewards/chosen": -0.30313801765441895,
      "rewards/margins": 2.021474838256836,
      "rewards/rejected": -2.324612855911255,
      "step": 712
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.744258880615234,
      "kl": 0.0,
      "learning_rate": 4.066998168018843e-07,
      "logps/chosen": -179.3466796875,
      "logps/rejected": -232.30099487304688,
      "loss": 0.2743,
      "rewards/chosen": 1.125133752822876,
      "rewards/margins": 5.002973556518555,
      "rewards/rejected": -3.877840042114258,
      "step": 713
    },
    {
      "epoch": 0.19,
      "grad_norm": 37.29856872558594,
      "kl": 0.0,
      "learning_rate": 4.065689610049725e-07,
      "logps/chosen": -248.66502380371094,
      "logps/rejected": -288.7413330078125,
      "loss": 0.3168,
      "rewards/chosen": -0.6923273205757141,
      "rewards/margins": 1.4932734966278076,
      "rewards/rejected": -2.185600757598877,
      "step": 714
    },
    {
      "epoch": 0.19,
      "grad_norm": 33.92723083496094,
      "kl": 0.0,
      "learning_rate": 4.064381052080607e-07,
      "logps/chosen": -213.46885681152344,
      "logps/rejected": -257.55938720703125,
      "loss": 0.2876,
      "rewards/chosen": -0.8362697958946228,
      "rewards/margins": 3.334930896759033,
      "rewards/rejected": -4.171200752258301,
      "step": 715
    },
    {
      "epoch": 0.19,
      "grad_norm": 30.284574508666992,
      "kl": 0.0,
      "learning_rate": 4.063072494111489e-07,
      "logps/chosen": -181.6216278076172,
      "logps/rejected": -147.3481903076172,
      "loss": 0.2717,
      "rewards/chosen": -0.6255307793617249,
      "rewards/margins": 2.0915329456329346,
      "rewards/rejected": -2.7170636653900146,
      "step": 716
    },
    {
      "epoch": 0.19,
      "grad_norm": 38.27483367919922,
      "kl": 0.0,
      "learning_rate": 4.061763936142371e-07,
      "logps/chosen": -180.31236267089844,
      "logps/rejected": -254.57325744628906,
      "loss": 0.3388,
      "rewards/chosen": -0.985579788684845,
      "rewards/margins": 3.0883657932281494,
      "rewards/rejected": -4.07394552230835,
      "step": 717
    },
    {
      "epoch": 0.19,
      "grad_norm": 25.0219669342041,
      "kl": 0.0,
      "learning_rate": 4.060455378173253e-07,
      "logps/chosen": -241.9582977294922,
      "logps/rejected": -283.1595458984375,
      "loss": 0.2424,
      "rewards/chosen": 0.6642072200775146,
      "rewards/margins": 5.217754364013672,
      "rewards/rejected": -4.553547382354736,
      "step": 718
    },
    {
      "epoch": 0.19,
      "grad_norm": 46.34675216674805,
      "kl": 0.0,
      "learning_rate": 4.059146820204135e-07,
      "logps/chosen": -250.87704467773438,
      "logps/rejected": -292.99713134765625,
      "loss": 0.4718,
      "rewards/chosen": -1.2733533382415771,
      "rewards/margins": 1.5549085140228271,
      "rewards/rejected": -2.8282618522644043,
      "step": 719
    },
    {
      "epoch": 0.19,
      "grad_norm": 32.27806091308594,
      "kl": 0.0,
      "learning_rate": 4.0578382622350164e-07,
      "logps/chosen": -293.7688903808594,
      "logps/rejected": -239.4044952392578,
      "loss": 0.2566,
      "rewards/chosen": -0.23062366247177124,
      "rewards/margins": 2.491293430328369,
      "rewards/rejected": -2.721917152404785,
      "step": 720
    },
    {
      "epoch": 0.19,
      "grad_norm": 34.6161003112793,
      "kl": 0.0,
      "learning_rate": 4.0565297042658984e-07,
      "logps/chosen": -192.505615234375,
      "logps/rejected": -220.963134765625,
      "loss": 0.3112,
      "rewards/chosen": -0.9563778638839722,
      "rewards/margins": 1.9204825162887573,
      "rewards/rejected": -2.8768603801727295,
      "step": 721
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.44890594482422,
      "kl": 0.0,
      "learning_rate": 4.0552211462967803e-07,
      "logps/chosen": -160.75257873535156,
      "logps/rejected": -272.4718933105469,
      "loss": 0.203,
      "rewards/chosen": 0.37976592779159546,
      "rewards/margins": 5.139260292053223,
      "rewards/rejected": -4.759494304656982,
      "step": 722
    },
    {
      "epoch": 0.19,
      "grad_norm": 42.7325439453125,
      "kl": 0.0,
      "learning_rate": 4.053912588327663e-07,
      "logps/chosen": -181.53472900390625,
      "logps/rejected": -288.3123779296875,
      "loss": 0.3532,
      "rewards/chosen": 0.9533110857009888,
      "rewards/margins": 3.2428879737854004,
      "rewards/rejected": -2.289577007293701,
      "step": 723
    },
    {
      "epoch": 0.19,
      "grad_norm": 38.448360443115234,
      "kl": 0.0,
      "learning_rate": 4.052604030358545e-07,
      "logps/chosen": -236.40652465820312,
      "logps/rejected": -234.30282592773438,
      "loss": 0.2709,
      "rewards/chosen": 0.140406996011734,
      "rewards/margins": 5.7353973388671875,
      "rewards/rejected": -5.594990253448486,
      "step": 724
    },
    {
      "epoch": 0.19,
      "grad_norm": 35.80790710449219,
      "kl": 0.0,
      "learning_rate": 4.051295472389427e-07,
      "logps/chosen": -330.27020263671875,
      "logps/rejected": -213.01524353027344,
      "loss": 0.2631,
      "rewards/chosen": -0.5340307950973511,
      "rewards/margins": 1.4044686555862427,
      "rewards/rejected": -1.9384994506835938,
      "step": 725
    },
    {
      "epoch": 0.19,
      "grad_norm": 36.39134216308594,
      "kl": 0.0,
      "learning_rate": 4.0499869144203087e-07,
      "logps/chosen": -249.36798095703125,
      "logps/rejected": -221.98782348632812,
      "loss": 0.3286,
      "rewards/chosen": -0.7803311347961426,
      "rewards/margins": 2.9748716354370117,
      "rewards/rejected": -3.7552027702331543,
      "step": 726
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.210561752319336,
      "kl": 0.0,
      "learning_rate": 4.0486783564511907e-07,
      "logps/chosen": -224.4700164794922,
      "logps/rejected": -207.71670532226562,
      "loss": 0.2853,
      "rewards/chosen": 1.7469308376312256,
      "rewards/margins": 4.717103004455566,
      "rewards/rejected": -2.9701719284057617,
      "step": 727
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.407520294189453,
      "kl": 0.0,
      "learning_rate": 4.0473697984820726e-07,
      "logps/chosen": -224.3354034423828,
      "logps/rejected": -240.8251953125,
      "loss": 0.1928,
      "rewards/chosen": 1.139357089996338,
      "rewards/margins": 3.9633731842041016,
      "rewards/rejected": -2.8240160942077637,
      "step": 728
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.426000595092773,
      "kl": 0.0,
      "learning_rate": 4.0460612405129546e-07,
      "logps/chosen": -196.70066833496094,
      "logps/rejected": -204.2018280029297,
      "loss": 0.3489,
      "rewards/chosen": 0.37072229385375977,
      "rewards/margins": 3.5842480659484863,
      "rewards/rejected": -3.2135257720947266,
      "step": 729
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.12305450439453,
      "kl": 0.0,
      "learning_rate": 4.0447526825438365e-07,
      "logps/chosen": -161.67770385742188,
      "logps/rejected": -298.96490478515625,
      "loss": 0.2341,
      "rewards/chosen": -0.16333018243312836,
      "rewards/margins": 3.9531426429748535,
      "rewards/rejected": -4.1164727210998535,
      "step": 730
    },
    {
      "epoch": 0.19,
      "grad_norm": 30.204750061035156,
      "kl": 0.0,
      "learning_rate": 4.0434441245747185e-07,
      "logps/chosen": -208.2093505859375,
      "logps/rejected": -243.5538787841797,
      "loss": 0.3252,
      "rewards/chosen": -0.614426851272583,
      "rewards/margins": 4.058218002319336,
      "rewards/rejected": -4.67264461517334,
      "step": 731
    },
    {
      "epoch": 0.19,
      "grad_norm": 40.43870544433594,
      "kl": 0.0,
      "learning_rate": 4.0421355666056005e-07,
      "logps/chosen": -240.05763244628906,
      "logps/rejected": -222.07400512695312,
      "loss": 0.4016,
      "rewards/chosen": -1.9779905080795288,
      "rewards/margins": 0.4816056489944458,
      "rewards/rejected": -2.4595961570739746,
      "step": 732
    },
    {
      "epoch": 0.19,
      "grad_norm": 34.84939956665039,
      "kl": 0.0,
      "learning_rate": 4.0408270086364824e-07,
      "logps/chosen": -223.86077880859375,
      "logps/rejected": -283.632568359375,
      "loss": 0.2978,
      "rewards/chosen": -0.9752624034881592,
      "rewards/margins": 3.3159701824188232,
      "rewards/rejected": -4.291232585906982,
      "step": 733
    },
    {
      "epoch": 0.19,
      "grad_norm": 27.223562240600586,
      "kl": 0.0,
      "learning_rate": 4.0395184506673644e-07,
      "logps/chosen": -153.81460571289062,
      "logps/rejected": -231.26126098632812,
      "loss": 0.3124,
      "rewards/chosen": -1.2617738246917725,
      "rewards/margins": 2.4990763664245605,
      "rewards/rejected": -3.760850191116333,
      "step": 734
    },
    {
      "epoch": 0.19,
      "grad_norm": 37.23539352416992,
      "kl": 0.0,
      "learning_rate": 4.038209892698246e-07,
      "logps/chosen": -207.6356964111328,
      "logps/rejected": -197.76123046875,
      "loss": 0.2752,
      "rewards/chosen": 0.7991582751274109,
      "rewards/margins": 4.932627201080322,
      "rewards/rejected": -4.133469104766846,
      "step": 735
    },
    {
      "epoch": 0.19,
      "grad_norm": 32.925113677978516,
      "kl": 0.0,
      "learning_rate": 4.0369013347291283e-07,
      "logps/chosen": -155.8492889404297,
      "logps/rejected": -218.74008178710938,
      "loss": 0.2874,
      "rewards/chosen": 0.39519715309143066,
      "rewards/margins": 3.102106809616089,
      "rewards/rejected": -2.706909656524658,
      "step": 736
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.543956756591797,
      "kl": 0.0,
      "learning_rate": 4.0355927767600103e-07,
      "logps/chosen": -244.0968017578125,
      "logps/rejected": -110.96414184570312,
      "loss": 0.4021,
      "rewards/chosen": -0.7129060626029968,
      "rewards/margins": 1.0993714332580566,
      "rewards/rejected": -1.8122774362564087,
      "step": 737
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.560298919677734,
      "kl": 0.0,
      "learning_rate": 4.034284218790892e-07,
      "logps/chosen": -276.5502624511719,
      "logps/rejected": -237.48460388183594,
      "loss": 0.176,
      "rewards/chosen": -0.5135056376457214,
      "rewards/margins": 2.5795295238494873,
      "rewards/rejected": -3.0930352210998535,
      "step": 738
    },
    {
      "epoch": 0.19,
      "grad_norm": 41.65924835205078,
      "kl": 0.0,
      "learning_rate": 4.032975660821774e-07,
      "logps/chosen": -245.82003784179688,
      "logps/rejected": -316.614013671875,
      "loss": 0.3456,
      "rewards/chosen": -0.2647044062614441,
      "rewards/margins": 3.4045932292938232,
      "rewards/rejected": -3.669297695159912,
      "step": 739
    },
    {
      "epoch": 0.19,
      "grad_norm": 32.747581481933594,
      "kl": 0.0,
      "learning_rate": 4.031667102852656e-07,
      "logps/chosen": -249.56219482421875,
      "logps/rejected": -236.9765625,
      "loss": 0.197,
      "rewards/chosen": 0.49999967217445374,
      "rewards/margins": 4.147944450378418,
      "rewards/rejected": -3.647944688796997,
      "step": 740
    },
    {
      "epoch": 0.19,
      "grad_norm": 19.987966537475586,
      "kl": 0.0,
      "learning_rate": 4.030358544883538e-07,
      "logps/chosen": -179.17042541503906,
      "logps/rejected": -272.2667541503906,
      "loss": 0.3512,
      "rewards/chosen": -0.9662373065948486,
      "rewards/margins": 2.9937665462493896,
      "rewards/rejected": -3.9600038528442383,
      "step": 741
    },
    {
      "epoch": 0.19,
      "grad_norm": 29.590360641479492,
      "kl": 0.0,
      "learning_rate": 4.02904998691442e-07,
      "logps/chosen": -203.13223266601562,
      "logps/rejected": -251.3955078125,
      "loss": 0.2955,
      "rewards/chosen": 0.5274904370307922,
      "rewards/margins": 4.236715793609619,
      "rewards/rejected": -3.7092251777648926,
      "step": 742
    },
    {
      "epoch": 0.19,
      "grad_norm": 35.01331329345703,
      "kl": 0.0,
      "learning_rate": 4.027741428945302e-07,
      "logps/chosen": -230.51190185546875,
      "logps/rejected": -259.7726745605469,
      "loss": 0.3293,
      "rewards/chosen": 1.5364973545074463,
      "rewards/margins": 5.86799430847168,
      "rewards/rejected": -4.3314971923828125,
      "step": 743
    },
    {
      "epoch": 0.19,
      "grad_norm": 31.845294952392578,
      "kl": 0.0,
      "learning_rate": 4.026432870976184e-07,
      "logps/chosen": -213.4208984375,
      "logps/rejected": -289.29815673828125,
      "loss": 0.3252,
      "rewards/chosen": -0.8649560213088989,
      "rewards/margins": 2.6383767127990723,
      "rewards/rejected": -3.5033326148986816,
      "step": 744
    },
    {
      "epoch": 0.19,
      "grad_norm": 36.557167053222656,
      "kl": 0.0,
      "learning_rate": 4.025124313007066e-07,
      "logps/chosen": -256.5347900390625,
      "logps/rejected": -235.42843627929688,
      "loss": 0.3793,
      "rewards/chosen": -1.1741714477539062,
      "rewards/margins": 0.9963676929473877,
      "rewards/rejected": -2.170539140701294,
      "step": 745
    },
    {
      "epoch": 0.2,
      "grad_norm": 28.335918426513672,
      "kl": 0.0,
      "learning_rate": 4.023815755037948e-07,
      "logps/chosen": -231.30001831054688,
      "logps/rejected": -236.99227905273438,
      "loss": 0.2908,
      "rewards/chosen": -1.4739785194396973,
      "rewards/margins": 1.9531621932983398,
      "rewards/rejected": -3.427140712738037,
      "step": 746
    },
    {
      "epoch": 0.2,
      "grad_norm": 31.791410446166992,
      "kl": 0.0,
      "learning_rate": 4.02250719706883e-07,
      "logps/chosen": -297.44525146484375,
      "logps/rejected": -186.52516174316406,
      "loss": 0.3306,
      "rewards/chosen": -1.067051887512207,
      "rewards/margins": 1.990215539932251,
      "rewards/rejected": -3.057267427444458,
      "step": 747
    },
    {
      "epoch": 0.2,
      "grad_norm": 32.735816955566406,
      "kl": 0.0,
      "learning_rate": 4.0211986390997124e-07,
      "logps/chosen": -227.0963897705078,
      "logps/rejected": -272.2472839355469,
      "loss": 0.2703,
      "rewards/chosen": -0.6910209655761719,
      "rewards/margins": 2.193660259246826,
      "rewards/rejected": -2.884681224822998,
      "step": 748
    },
    {
      "epoch": 0.2,
      "grad_norm": 29.596969604492188,
      "kl": 0.0,
      "learning_rate": 4.0198900811305943e-07,
      "logps/chosen": -227.31190490722656,
      "logps/rejected": -416.7991638183594,
      "loss": 0.3962,
      "rewards/chosen": -1.4798189401626587,
      "rewards/margins": 1.5923632383346558,
      "rewards/rejected": -3.0721821784973145,
      "step": 749
    },
    {
      "epoch": 0.2,
      "grad_norm": 27.834747314453125,
      "kl": 0.0,
      "learning_rate": 4.0185815231614763e-07,
      "logps/chosen": -251.62158203125,
      "logps/rejected": -175.81283569335938,
      "loss": 0.3261,
      "rewards/chosen": 0.20242267847061157,
      "rewards/margins": 3.325491189956665,
      "rewards/rejected": -3.1230685710906982,
      "step": 750
    },
    {
      "epoch": 0.2,
      "grad_norm": 25.614028930664062,
      "kl": 0.0,
      "learning_rate": 4.0172729651923577e-07,
      "logps/chosen": -137.84701538085938,
      "logps/rejected": -316.3259582519531,
      "loss": 0.3197,
      "rewards/chosen": -1.730677604675293,
      "rewards/margins": 1.8563814163208008,
      "rewards/rejected": -3.5870590209960938,
      "step": 751
    },
    {
      "epoch": 0.2,
      "grad_norm": 21.44118881225586,
      "kl": 0.0,
      "learning_rate": 4.0159644072232397e-07,
      "logps/chosen": -244.3109893798828,
      "logps/rejected": -223.697021484375,
      "loss": 0.2882,
      "rewards/chosen": 3.215583324432373,
      "rewards/margins": 6.77751350402832,
      "rewards/rejected": -3.5619304180145264,
      "step": 752
    },
    {
      "epoch": 0.2,
      "grad_norm": 35.268070220947266,
      "kl": 0.0,
      "learning_rate": 4.0146558492541216e-07,
      "logps/chosen": -205.3878631591797,
      "logps/rejected": -285.7146301269531,
      "loss": 0.2874,
      "rewards/chosen": 0.2579948902130127,
      "rewards/margins": 6.138503074645996,
      "rewards/rejected": -5.880507946014404,
      "step": 753
    },
    {
      "epoch": 0.2,
      "grad_norm": 36.49318313598633,
      "kl": 0.0,
      "learning_rate": 4.0133472912850036e-07,
      "logps/chosen": -299.867919921875,
      "logps/rejected": -236.50704956054688,
      "loss": 0.3799,
      "rewards/chosen": 0.2085830122232437,
      "rewards/margins": 3.1150100231170654,
      "rewards/rejected": -2.9064269065856934,
      "step": 754
    },
    {
      "epoch": 0.2,
      "grad_norm": 40.88338851928711,
      "kl": 0.0,
      "learning_rate": 4.0120387333158856e-07,
      "logps/chosen": -255.27439880371094,
      "logps/rejected": -266.44586181640625,
      "loss": 0.4113,
      "rewards/chosen": -1.1876412630081177,
      "rewards/margins": 2.202840805053711,
      "rewards/rejected": -3.390482187271118,
      "step": 755
    },
    {
      "epoch": 0.2,
      "grad_norm": 29.45452880859375,
      "kl": 0.0,
      "learning_rate": 4.0107301753467675e-07,
      "logps/chosen": -275.1477966308594,
      "logps/rejected": -321.3170166015625,
      "loss": 0.3062,
      "rewards/chosen": -1.2740792036056519,
      "rewards/margins": 2.2294044494628906,
      "rewards/rejected": -3.503483772277832,
      "step": 756
    },
    {
      "epoch": 0.2,
      "grad_norm": 30.438161849975586,
      "kl": 0.0,
      "learning_rate": 4.0094216173776495e-07,
      "logps/chosen": -149.62611389160156,
      "logps/rejected": -329.66607666015625,
      "loss": 0.2647,
      "rewards/chosen": 0.9496825337409973,
      "rewards/margins": 3.765160083770752,
      "rewards/rejected": -2.8154776096343994,
      "step": 757
    },
    {
      "epoch": 0.2,
      "grad_norm": 36.988014221191406,
      "kl": 0.0,
      "learning_rate": 4.0081130594085314e-07,
      "logps/chosen": -206.1259765625,
      "logps/rejected": -309.4862976074219,
      "loss": 0.3293,
      "rewards/chosen": -0.4844217896461487,
      "rewards/margins": 1.4037492275238037,
      "rewards/rejected": -1.8881710767745972,
      "step": 758
    },
    {
      "epoch": 0.2,
      "grad_norm": 35.78725814819336,
      "kl": 0.0,
      "learning_rate": 4.0068045014394134e-07,
      "logps/chosen": -182.4107208251953,
      "logps/rejected": -163.76229858398438,
      "loss": 0.3308,
      "rewards/chosen": -0.25232523679733276,
      "rewards/margins": 2.208416700363159,
      "rewards/rejected": -2.4607419967651367,
      "step": 759
    },
    {
      "epoch": 0.2,
      "grad_norm": 24.877836227416992,
      "kl": 0.0,
      "learning_rate": 4.0054959434702954e-07,
      "logps/chosen": -205.92320251464844,
      "logps/rejected": -265.5527648925781,
      "loss": 0.3524,
      "rewards/chosen": 0.09859514236450195,
      "rewards/margins": 2.4194495677948,
      "rewards/rejected": -2.320854425430298,
      "step": 760
    },
    {
      "epoch": 0.2,
      "grad_norm": 34.51003646850586,
      "kl": 0.0,
      "learning_rate": 4.004187385501178e-07,
      "logps/chosen": -168.7631378173828,
      "logps/rejected": -230.7778778076172,
      "loss": 0.4157,
      "rewards/chosen": -1.2124521732330322,
      "rewards/margins": 0.986685037612915,
      "rewards/rejected": -2.1991372108459473,
      "step": 761
    },
    {
      "epoch": 0.2,
      "grad_norm": 41.73928451538086,
      "kl": 0.0,
      "learning_rate": 4.00287882753206e-07,
      "logps/chosen": -195.952880859375,
      "logps/rejected": -289.32244873046875,
      "loss": 0.3697,
      "rewards/chosen": -0.5307539105415344,
      "rewards/margins": 2.295314311981201,
      "rewards/rejected": -2.826068162918091,
      "step": 762
    },
    {
      "epoch": 0.2,
      "grad_norm": 41.18114471435547,
      "kl": 0.0,
      "learning_rate": 4.001570269562942e-07,
      "logps/chosen": -235.7907257080078,
      "logps/rejected": -300.2009582519531,
      "loss": 0.3011,
      "rewards/chosen": -0.7677313685417175,
      "rewards/margins": 2.5477404594421387,
      "rewards/rejected": -3.315471887588501,
      "step": 763
    },
    {
      "epoch": 0.2,
      "grad_norm": 31.07491683959961,
      "kl": 0.0,
      "learning_rate": 4.000261711593824e-07,
      "logps/chosen": -199.22885131835938,
      "logps/rejected": -272.57720947265625,
      "loss": 0.3352,
      "rewards/chosen": -1.1168694496154785,
      "rewards/margins": 3.9204258918762207,
      "rewards/rejected": -5.037295341491699,
      "step": 764
    },
    {
      "epoch": 0.2,
      "grad_norm": 31.78119659423828,
      "kl": 0.0,
      "learning_rate": 3.9989531536247057e-07,
      "logps/chosen": -228.1612091064453,
      "logps/rejected": -297.5334167480469,
      "loss": 0.2313,
      "rewards/chosen": 0.9916869401931763,
      "rewards/margins": 4.4241437911987305,
      "rewards/rejected": -3.4324569702148438,
      "step": 765
    },
    {
      "epoch": 0.2,
      "grad_norm": 29.278806686401367,
      "kl": 0.0,
      "learning_rate": 3.997644595655587e-07,
      "logps/chosen": -241.4243621826172,
      "logps/rejected": -190.7371826171875,
      "loss": 0.3067,
      "rewards/chosen": -1.7682384252548218,
      "rewards/margins": 1.0480161905288696,
      "rewards/rejected": -2.8162546157836914,
      "step": 766
    },
    {
      "epoch": 0.2,
      "grad_norm": 35.81673049926758,
      "kl": 0.0,
      "learning_rate": 3.996336037686469e-07,
      "logps/chosen": -196.4593963623047,
      "logps/rejected": -266.684814453125,
      "loss": 0.2646,
      "rewards/chosen": -1.6129564046859741,
      "rewards/margins": 1.3678327798843384,
      "rewards/rejected": -2.9807891845703125,
      "step": 767
    },
    {
      "epoch": 0.2,
      "grad_norm": 41.0791130065918,
      "kl": 0.0,
      "learning_rate": 3.995027479717351e-07,
      "logps/chosen": -232.69644165039062,
      "logps/rejected": -138.3663787841797,
      "loss": 0.3468,
      "rewards/chosen": -0.28881752490997314,
      "rewards/margins": 1.794512152671814,
      "rewards/rejected": -2.083329677581787,
      "step": 768
    },
    {
      "epoch": 0.2,
      "grad_norm": 42.20341110229492,
      "kl": 0.0,
      "learning_rate": 3.993718921748233e-07,
      "logps/chosen": -226.63327026367188,
      "logps/rejected": -290.8764343261719,
      "loss": 0.3823,
      "rewards/chosen": -0.42782607674598694,
      "rewards/margins": 5.769502639770508,
      "rewards/rejected": -6.197328567504883,
      "step": 769
    },
    {
      "epoch": 0.2,
      "grad_norm": 25.241504669189453,
      "kl": 0.0,
      "learning_rate": 3.992410363779115e-07,
      "logps/chosen": -147.48744201660156,
      "logps/rejected": -215.68814086914062,
      "loss": 0.2609,
      "rewards/chosen": -0.21872329711914062,
      "rewards/margins": 3.802682399749756,
      "rewards/rejected": -4.0214056968688965,
      "step": 770
    },
    {
      "epoch": 0.2,
      "grad_norm": 34.434165954589844,
      "kl": 0.0,
      "learning_rate": 3.991101805809997e-07,
      "logps/chosen": -243.2288818359375,
      "logps/rejected": -340.7574462890625,
      "loss": 0.3147,
      "rewards/chosen": 0.6935755014419556,
      "rewards/margins": 5.347521781921387,
      "rewards/rejected": -4.653946399688721,
      "step": 771
    },
    {
      "epoch": 0.2,
      "grad_norm": 30.723644256591797,
      "kl": 0.0,
      "learning_rate": 3.989793247840879e-07,
      "logps/chosen": -243.35061645507812,
      "logps/rejected": -166.70977783203125,
      "loss": 0.4454,
      "rewards/chosen": -0.8095527291297913,
      "rewards/margins": 2.356269359588623,
      "rewards/rejected": -3.1658220291137695,
      "step": 772
    },
    {
      "epoch": 0.2,
      "grad_norm": 31.855859756469727,
      "kl": 0.0,
      "learning_rate": 3.988484689871761e-07,
      "logps/chosen": -212.469970703125,
      "logps/rejected": -177.2041473388672,
      "loss": 0.2554,
      "rewards/chosen": 0.17340558767318726,
      "rewards/margins": 3.40055251121521,
      "rewards/rejected": -3.227146863937378,
      "step": 773
    },
    {
      "epoch": 0.2,
      "grad_norm": 31.05317497253418,
      "kl": 0.0,
      "learning_rate": 3.9871761319026434e-07,
      "logps/chosen": -134.61614990234375,
      "logps/rejected": -240.55783081054688,
      "loss": 0.2107,
      "rewards/chosen": 0.7359917163848877,
      "rewards/margins": 3.681432008743286,
      "rewards/rejected": -2.9454402923583984,
      "step": 774
    },
    {
      "epoch": 0.2,
      "grad_norm": 27.210365295410156,
      "kl": 0.0,
      "learning_rate": 3.9858675739335253e-07,
      "logps/chosen": -221.8593292236328,
      "logps/rejected": -191.375,
      "loss": 0.2359,
      "rewards/chosen": 0.355133056640625,
      "rewards/margins": 3.7575221061706543,
      "rewards/rejected": -3.4023890495300293,
      "step": 775
    },
    {
      "epoch": 0.2,
      "grad_norm": 25.456256866455078,
      "kl": 0.0,
      "learning_rate": 3.9845590159644073e-07,
      "logps/chosen": -272.27392578125,
      "logps/rejected": -279.5438232421875,
      "loss": 0.2631,
      "rewards/chosen": -1.3574875593185425,
      "rewards/margins": 2.907285690307617,
      "rewards/rejected": -4.264773368835449,
      "step": 776
    },
    {
      "epoch": 0.2,
      "grad_norm": 42.95475387573242,
      "kl": 0.0,
      "learning_rate": 3.983250457995289e-07,
      "logps/chosen": -223.12966918945312,
      "logps/rejected": -245.69009399414062,
      "loss": 0.4357,
      "rewards/chosen": -1.1708242893218994,
      "rewards/margins": 0.8256139755249023,
      "rewards/rejected": -1.9964382648468018,
      "step": 777
    },
    {
      "epoch": 0.2,
      "grad_norm": 41.72541427612305,
      "kl": 0.0,
      "learning_rate": 3.981941900026171e-07,
      "logps/chosen": -184.670654296875,
      "logps/rejected": -201.3939208984375,
      "loss": 0.2308,
      "rewards/chosen": -0.840877890586853,
      "rewards/margins": 1.4940797090530396,
      "rewards/rejected": -2.3349575996398926,
      "step": 778
    },
    {
      "epoch": 0.2,
      "grad_norm": 27.049043655395508,
      "kl": 0.0,
      "learning_rate": 3.980633342057053e-07,
      "logps/chosen": -215.769287109375,
      "logps/rejected": -183.2026824951172,
      "loss": 0.2251,
      "rewards/chosen": 1.3883291482925415,
      "rewards/margins": 4.350786209106445,
      "rewards/rejected": -2.9624569416046143,
      "step": 779
    },
    {
      "epoch": 0.2,
      "grad_norm": 42.32488250732422,
      "kl": 0.0,
      "learning_rate": 3.979324784087935e-07,
      "logps/chosen": -271.8233947753906,
      "logps/rejected": -286.89044189453125,
      "loss": 0.3337,
      "rewards/chosen": -2.232872724533081,
      "rewards/margins": 2.1625797748565674,
      "rewards/rejected": -4.395452499389648,
      "step": 780
    },
    {
      "epoch": 0.2,
      "grad_norm": 34.89527130126953,
      "kl": 0.0,
      "learning_rate": 3.978016226118817e-07,
      "logps/chosen": -201.700439453125,
      "logps/rejected": -280.5649719238281,
      "loss": 0.2726,
      "rewards/chosen": 1.1064777374267578,
      "rewards/margins": 5.503815174102783,
      "rewards/rejected": -4.397337436676025,
      "step": 781
    },
    {
      "epoch": 0.2,
      "grad_norm": 38.49657440185547,
      "kl": 0.0,
      "learning_rate": 3.9767076681496985e-07,
      "logps/chosen": -210.4178924560547,
      "logps/rejected": -293.0738525390625,
      "loss": 0.3281,
      "rewards/chosen": -1.2712124586105347,
      "rewards/margins": 3.368025302886963,
      "rewards/rejected": -4.639237880706787,
      "step": 782
    },
    {
      "epoch": 0.2,
      "grad_norm": 30.322834014892578,
      "kl": 0.0,
      "learning_rate": 3.9753991101805805e-07,
      "logps/chosen": -178.96539306640625,
      "logps/rejected": -257.24505615234375,
      "loss": 0.3139,
      "rewards/chosen": -0.5862289667129517,
      "rewards/margins": 6.326912879943848,
      "rewards/rejected": -6.91314172744751,
      "step": 783
    },
    {
      "epoch": 0.21,
      "grad_norm": 36.877342224121094,
      "kl": 0.0,
      "learning_rate": 3.9740905522114624e-07,
      "logps/chosen": -254.62535095214844,
      "logps/rejected": -236.10403442382812,
      "loss": 0.2649,
      "rewards/chosen": -0.45170319080352783,
      "rewards/margins": 2.7409448623657227,
      "rewards/rejected": -3.192647933959961,
      "step": 784
    },
    {
      "epoch": 0.21,
      "grad_norm": 35.72412109375,
      "kl": 0.0,
      "learning_rate": 3.9727819942423444e-07,
      "logps/chosen": -272.28033447265625,
      "logps/rejected": -329.8775939941406,
      "loss": 0.3099,
      "rewards/chosen": -2.322260856628418,
      "rewards/margins": 2.1673550605773926,
      "rewards/rejected": -4.4896159172058105,
      "step": 785
    },
    {
      "epoch": 0.21,
      "grad_norm": 35.31153869628906,
      "kl": 0.0,
      "learning_rate": 3.9714734362732264e-07,
      "logps/chosen": -208.0709228515625,
      "logps/rejected": -185.36216735839844,
      "loss": 0.3253,
      "rewards/chosen": -1.4396045207977295,
      "rewards/margins": 1.5532078742980957,
      "rewards/rejected": -2.992812395095825,
      "step": 786
    },
    {
      "epoch": 0.21,
      "grad_norm": 30.253881454467773,
      "kl": 0.0,
      "learning_rate": 3.970164878304109e-07,
      "logps/chosen": -256.6632080078125,
      "logps/rejected": -263.5473937988281,
      "loss": 0.243,
      "rewards/chosen": 3.115372657775879,
      "rewards/margins": 5.554112434387207,
      "rewards/rejected": -2.438739776611328,
      "step": 787
    },
    {
      "epoch": 0.21,
      "grad_norm": 32.833534240722656,
      "kl": 0.0,
      "learning_rate": 3.968856320334991e-07,
      "logps/chosen": -235.54193115234375,
      "logps/rejected": -272.4754333496094,
      "loss": 0.3234,
      "rewards/chosen": -1.1567193269729614,
      "rewards/margins": 1.9879382848739624,
      "rewards/rejected": -3.144657611846924,
      "step": 788
    },
    {
      "epoch": 0.21,
      "grad_norm": 43.316383361816406,
      "kl": 0.0,
      "learning_rate": 3.967547762365873e-07,
      "logps/chosen": -204.2913818359375,
      "logps/rejected": -260.9977111816406,
      "loss": 0.3539,
      "rewards/chosen": -0.3340170979499817,
      "rewards/margins": 2.9788312911987305,
      "rewards/rejected": -3.3128483295440674,
      "step": 789
    },
    {
      "epoch": 0.21,
      "grad_norm": 32.47597122192383,
      "kl": 0.0,
      "learning_rate": 3.9662392043967547e-07,
      "logps/chosen": -313.443603515625,
      "logps/rejected": -299.80126953125,
      "loss": 0.3158,
      "rewards/chosen": -2.1112234592437744,
      "rewards/margins": 4.413530349731445,
      "rewards/rejected": -6.524753570556641,
      "step": 790
    },
    {
      "epoch": 0.21,
      "grad_norm": 34.65275573730469,
      "kl": 0.0,
      "learning_rate": 3.9649306464276367e-07,
      "logps/chosen": -205.21121215820312,
      "logps/rejected": -269.4504089355469,
      "loss": 0.5045,
      "rewards/chosen": -1.286717414855957,
      "rewards/margins": 0.5504343509674072,
      "rewards/rejected": -1.8371517658233643,
      "step": 791
    },
    {
      "epoch": 0.21,
      "grad_norm": 29.15659523010254,
      "kl": 0.0,
      "learning_rate": 3.9636220884585187e-07,
      "logps/chosen": -244.59078979492188,
      "logps/rejected": -208.80099487304688,
      "loss": 0.4093,
      "rewards/chosen": -2.4560344219207764,
      "rewards/margins": 0.5167515277862549,
      "rewards/rejected": -2.9727859497070312,
      "step": 792
    },
    {
      "epoch": 0.21,
      "grad_norm": 35.65248107910156,
      "kl": 0.0,
      "learning_rate": 3.9623135304894006e-07,
      "logps/chosen": -271.3067932128906,
      "logps/rejected": -251.15574645996094,
      "loss": 0.3733,
      "rewards/chosen": -0.8604080677032471,
      "rewards/margins": 1.2890377044677734,
      "rewards/rejected": -2.1494457721710205,
      "step": 793
    },
    {
      "epoch": 0.21,
      "grad_norm": 37.173133850097656,
      "kl": 0.0,
      "learning_rate": 3.9610049725202826e-07,
      "logps/chosen": -294.5159606933594,
      "logps/rejected": -212.5648956298828,
      "loss": 0.3855,
      "rewards/chosen": -1.1113216876983643,
      "rewards/margins": 0.6509411334991455,
      "rewards/rejected": -1.7622628211975098,
      "step": 794
    },
    {
      "epoch": 0.21,
      "grad_norm": 47.86101531982422,
      "kl": 0.0,
      "learning_rate": 3.9596964145511645e-07,
      "logps/chosen": -238.9080047607422,
      "logps/rejected": -328.6092224121094,
      "loss": 0.3277,
      "rewards/chosen": -2.1608142852783203,
      "rewards/margins": 2.6770687103271484,
      "rewards/rejected": -4.837882995605469,
      "step": 795
    },
    {
      "epoch": 0.21,
      "grad_norm": 34.385520935058594,
      "kl": 0.0,
      "learning_rate": 3.9583878565820465e-07,
      "logps/chosen": -236.8154754638672,
      "logps/rejected": -218.8819122314453,
      "loss": 0.1998,
      "rewards/chosen": 0.4100772738456726,
      "rewards/margins": 3.6112098693847656,
      "rewards/rejected": -3.2011325359344482,
      "step": 796
    },
    {
      "epoch": 0.21,
      "grad_norm": 32.56373596191406,
      "kl": 0.0,
      "learning_rate": 3.957079298612928e-07,
      "logps/chosen": -166.31546020507812,
      "logps/rejected": -275.31103515625,
      "loss": 0.2913,
      "rewards/chosen": 0.30415087938308716,
      "rewards/margins": 4.374905586242676,
      "rewards/rejected": -4.070754528045654,
      "step": 797
    },
    {
      "epoch": 0.21,
      "grad_norm": 40.8636474609375,
      "kl": 0.0,
      "learning_rate": 3.95577074064381e-07,
      "logps/chosen": -213.78359985351562,
      "logps/rejected": -246.65036010742188,
      "loss": 0.3221,
      "rewards/chosen": -0.7096244096755981,
      "rewards/margins": 1.3897720575332642,
      "rewards/rejected": -2.0993964672088623,
      "step": 798
    },
    {
      "epoch": 0.21,
      "grad_norm": 43.956844329833984,
      "kl": 0.0,
      "learning_rate": 3.9544621826746924e-07,
      "logps/chosen": -202.52725219726562,
      "logps/rejected": -294.99359130859375,
      "loss": 0.3391,
      "rewards/chosen": -0.27378353476524353,
      "rewards/margins": 3.027306318283081,
      "rewards/rejected": -3.3010897636413574,
      "step": 799
    },
    {
      "epoch": 0.21,
      "grad_norm": 29.847885131835938,
      "kl": 0.0,
      "learning_rate": 3.9531536247055743e-07,
      "logps/chosen": -252.54580688476562,
      "logps/rejected": -257.87762451171875,
      "loss": 0.2948,
      "rewards/chosen": -0.18890556693077087,
      "rewards/margins": 3.8304316997528076,
      "rewards/rejected": -4.019337177276611,
      "step": 800
    },
    {
      "epoch": 0.21,
      "grad_norm": 31.92094612121582,
      "kl": 0.0,
      "learning_rate": 3.9518450667364563e-07,
      "logps/chosen": -227.11886596679688,
      "logps/rejected": -212.2671356201172,
      "loss": 0.3175,
      "rewards/chosen": -0.3426421582698822,
      "rewards/margins": 1.967758297920227,
      "rewards/rejected": -2.3104004859924316,
      "step": 801
    },
    {
      "epoch": 0.21,
      "grad_norm": 30.653305053710938,
      "kl": 0.0,
      "learning_rate": 3.950536508767338e-07,
      "logps/chosen": -308.0470886230469,
      "logps/rejected": -205.42645263671875,
      "loss": 0.4064,
      "rewards/chosen": -1.1722359657287598,
      "rewards/margins": 1.6847832202911377,
      "rewards/rejected": -2.8570191860198975,
      "step": 802
    },
    {
      "epoch": 0.21,
      "grad_norm": 27.230607986450195,
      "kl": 0.0,
      "learning_rate": 3.94922795079822e-07,
      "logps/chosen": -236.7538299560547,
      "logps/rejected": -220.7267303466797,
      "loss": 0.134,
      "rewards/chosen": -1.016723394393921,
      "rewards/margins": 2.2433347702026367,
      "rewards/rejected": -3.2600581645965576,
      "step": 803
    },
    {
      "epoch": 0.21,
      "grad_norm": 31.964181900024414,
      "kl": 0.0,
      "learning_rate": 3.947919392829102e-07,
      "logps/chosen": -150.38731384277344,
      "logps/rejected": -342.32757568359375,
      "loss": 0.2802,
      "rewards/chosen": 0.08924896270036697,
      "rewards/margins": 3.7942516803741455,
      "rewards/rejected": -3.705002784729004,
      "step": 804
    },
    {
      "epoch": 0.21,
      "grad_norm": 42.96985626220703,
      "kl": 0.0,
      "learning_rate": 3.946610834859984e-07,
      "logps/chosen": -248.42263793945312,
      "logps/rejected": -238.973876953125,
      "loss": 0.3615,
      "rewards/chosen": -0.34363865852355957,
      "rewards/margins": 2.0021135807037354,
      "rewards/rejected": -2.345752239227295,
      "step": 805
    },
    {
      "epoch": 0.21,
      "grad_norm": 45.3122673034668,
      "kl": 0.0,
      "learning_rate": 3.945302276890866e-07,
      "logps/chosen": -219.62779235839844,
      "logps/rejected": -255.35231018066406,
      "loss": 0.3963,
      "rewards/chosen": -0.4257380962371826,
      "rewards/margins": 2.4570958614349365,
      "rewards/rejected": -2.882833957672119,
      "step": 806
    },
    {
      "epoch": 0.21,
      "grad_norm": 49.268367767333984,
      "kl": 0.0,
      "learning_rate": 3.943993718921748e-07,
      "logps/chosen": -139.48182678222656,
      "logps/rejected": -227.3538055419922,
      "loss": 0.305,
      "rewards/chosen": -0.5484585165977478,
      "rewards/margins": 1.365720510482788,
      "rewards/rejected": -1.9141790866851807,
      "step": 807
    },
    {
      "epoch": 0.21,
      "grad_norm": 37.057613372802734,
      "kl": 0.0,
      "learning_rate": 3.94268516095263e-07,
      "logps/chosen": -265.90673828125,
      "logps/rejected": -205.68800354003906,
      "loss": 0.3106,
      "rewards/chosen": -1.8103305101394653,
      "rewards/margins": 1.3182934522628784,
      "rewards/rejected": -3.1286239624023438,
      "step": 808
    },
    {
      "epoch": 0.21,
      "grad_norm": 37.79048538208008,
      "kl": 0.0,
      "learning_rate": 3.941376602983512e-07,
      "logps/chosen": -219.385986328125,
      "logps/rejected": -269.8956298828125,
      "loss": 0.4439,
      "rewards/chosen": -1.3443603515625,
      "rewards/margins": 3.233832836151123,
      "rewards/rejected": -4.578193187713623,
      "step": 809
    },
    {
      "epoch": 0.21,
      "grad_norm": 31.44384002685547,
      "kl": 0.0,
      "learning_rate": 3.940068045014394e-07,
      "logps/chosen": -180.49928283691406,
      "logps/rejected": -192.64857482910156,
      "loss": 0.3292,
      "rewards/chosen": -0.6538400650024414,
      "rewards/margins": 2.0370359420776367,
      "rewards/rejected": -2.690876007080078,
      "step": 810
    },
    {
      "epoch": 0.21,
      "grad_norm": 37.697723388671875,
      "kl": 0.0,
      "learning_rate": 3.938759487045276e-07,
      "logps/chosen": -219.0982666015625,
      "logps/rejected": -220.82827758789062,
      "loss": 0.2949,
      "rewards/chosen": -0.7485886216163635,
      "rewards/margins": 2.1607255935668945,
      "rewards/rejected": -2.9093141555786133,
      "step": 811
    },
    {
      "epoch": 0.21,
      "grad_norm": 38.43693923950195,
      "kl": 0.0,
      "learning_rate": 3.9374509290761584e-07,
      "logps/chosen": -150.76431274414062,
      "logps/rejected": -232.7530059814453,
      "loss": 0.4337,
      "rewards/chosen": -0.958981990814209,
      "rewards/margins": 2.0869970321655273,
      "rewards/rejected": -3.0459790229797363,
      "step": 812
    },
    {
      "epoch": 0.21,
      "grad_norm": 32.30561447143555,
      "kl": 0.0,
      "learning_rate": 3.93614237110704e-07,
      "logps/chosen": -216.4032745361328,
      "logps/rejected": -408.9929504394531,
      "loss": 0.359,
      "rewards/chosen": 0.07472788542509079,
      "rewards/margins": 3.6807384490966797,
      "rewards/rejected": -3.606010675430298,
      "step": 813
    },
    {
      "epoch": 0.21,
      "grad_norm": 28.573406219482422,
      "kl": 0.0,
      "learning_rate": 3.934833813137922e-07,
      "logps/chosen": -224.9008026123047,
      "logps/rejected": -248.16273498535156,
      "loss": 0.3129,
      "rewards/chosen": -2.0450844764709473,
      "rewards/margins": 2.191556930541992,
      "rewards/rejected": -4.2366414070129395,
      "step": 814
    },
    {
      "epoch": 0.21,
      "grad_norm": 36.37868118286133,
      "kl": 0.0,
      "learning_rate": 3.933525255168804e-07,
      "logps/chosen": -284.7607116699219,
      "logps/rejected": -218.2286834716797,
      "loss": 0.2874,
      "rewards/chosen": -1.0571905374526978,
      "rewards/margins": 1.2404452562332153,
      "rewards/rejected": -2.297635793685913,
      "step": 815
    },
    {
      "epoch": 0.21,
      "grad_norm": 36.04764938354492,
      "kl": 0.0,
      "learning_rate": 3.9322166971996857e-07,
      "logps/chosen": -190.5691680908203,
      "logps/rejected": -253.7047576904297,
      "loss": 0.3125,
      "rewards/chosen": -0.7160643935203552,
      "rewards/margins": 1.988569974899292,
      "rewards/rejected": -2.704634428024292,
      "step": 816
    },
    {
      "epoch": 0.21,
      "grad_norm": 38.212379455566406,
      "kl": 0.0,
      "learning_rate": 3.9309081392305677e-07,
      "logps/chosen": -264.66156005859375,
      "logps/rejected": -331.8368225097656,
      "loss": 0.3209,
      "rewards/chosen": -1.0136640071868896,
      "rewards/margins": 3.238999605178833,
      "rewards/rejected": -4.252663612365723,
      "step": 817
    },
    {
      "epoch": 0.21,
      "grad_norm": 31.041854858398438,
      "kl": 0.0,
      "learning_rate": 3.9295995812614496e-07,
      "logps/chosen": -195.3871307373047,
      "logps/rejected": -211.3892059326172,
      "loss": 0.3107,
      "rewards/chosen": -0.8108863830566406,
      "rewards/margins": 2.6762940883636475,
      "rewards/rejected": -3.487180471420288,
      "step": 818
    },
    {
      "epoch": 0.21,
      "grad_norm": 37.01015853881836,
      "kl": 0.0,
      "learning_rate": 3.9282910232923316e-07,
      "logps/chosen": -186.79855346679688,
      "logps/rejected": -205.82369995117188,
      "loss": 0.3073,
      "rewards/chosen": -0.5886488556861877,
      "rewards/margins": 1.8132927417755127,
      "rewards/rejected": -2.4019415378570557,
      "step": 819
    },
    {
      "epoch": 0.21,
      "grad_norm": 35.67362976074219,
      "kl": 0.0,
      "learning_rate": 3.9269824653232136e-07,
      "logps/chosen": -237.49017333984375,
      "logps/rejected": -328.6663513183594,
      "loss": 0.3152,
      "rewards/chosen": 0.14792077243328094,
      "rewards/margins": 3.6670782566070557,
      "rewards/rejected": -3.5191574096679688,
      "step": 820
    },
    {
      "epoch": 0.21,
      "grad_norm": 39.48514175415039,
      "kl": 0.0,
      "learning_rate": 3.9256739073540955e-07,
      "logps/chosen": -218.6199493408203,
      "logps/rejected": -374.603271484375,
      "loss": 0.2194,
      "rewards/chosen": 0.45418599247932434,
      "rewards/margins": 3.1380836963653564,
      "rewards/rejected": -2.6838977336883545,
      "step": 821
    },
    {
      "epoch": 0.22,
      "grad_norm": 32.8681755065918,
      "kl": 0.0,
      "learning_rate": 3.9243653493849775e-07,
      "logps/chosen": -204.17843627929688,
      "logps/rejected": -323.87896728515625,
      "loss": 0.3093,
      "rewards/chosen": 0.1345633566379547,
      "rewards/margins": 3.1268160343170166,
      "rewards/rejected": -2.9922525882720947,
      "step": 822
    },
    {
      "epoch": 0.22,
      "grad_norm": 34.83428192138672,
      "kl": 0.0,
      "learning_rate": 3.9230567914158594e-07,
      "logps/chosen": -280.830078125,
      "logps/rejected": -185.23431396484375,
      "loss": 0.3816,
      "rewards/chosen": -0.6364680528640747,
      "rewards/margins": 2.2966771125793457,
      "rewards/rejected": -2.933145046234131,
      "step": 823
    },
    {
      "epoch": 0.22,
      "grad_norm": 32.58478546142578,
      "kl": 0.0,
      "learning_rate": 3.9217482334467414e-07,
      "logps/chosen": -229.3389434814453,
      "logps/rejected": -239.52867126464844,
      "loss": 0.41,
      "rewards/chosen": -0.861241340637207,
      "rewards/margins": 1.8840315341949463,
      "rewards/rejected": -2.7452728748321533,
      "step": 824
    },
    {
      "epoch": 0.22,
      "grad_norm": 36.9166374206543,
      "kl": 0.0,
      "learning_rate": 3.920439675477624e-07,
      "logps/chosen": -230.52386474609375,
      "logps/rejected": -196.36956787109375,
      "loss": 0.4067,
      "rewards/chosen": -0.9328143000602722,
      "rewards/margins": 1.2270958423614502,
      "rewards/rejected": -2.159910202026367,
      "step": 825
    },
    {
      "epoch": 0.22,
      "grad_norm": 30.93659210205078,
      "kl": 0.0,
      "learning_rate": 3.919131117508506e-07,
      "logps/chosen": -269.2838439941406,
      "logps/rejected": -263.96917724609375,
      "loss": 0.3151,
      "rewards/chosen": -0.5156112909317017,
      "rewards/margins": 3.1444249153137207,
      "rewards/rejected": -3.660036325454712,
      "step": 826
    },
    {
      "epoch": 0.22,
      "grad_norm": 38.69263458251953,
      "kl": 0.0,
      "learning_rate": 3.917822559539388e-07,
      "logps/chosen": -322.1570739746094,
      "logps/rejected": -225.15899658203125,
      "loss": 0.3515,
      "rewards/chosen": -0.8002814650535583,
      "rewards/margins": 2.331637382507324,
      "rewards/rejected": -3.1319189071655273,
      "step": 827
    },
    {
      "epoch": 0.22,
      "grad_norm": 30.892852783203125,
      "kl": 0.0,
      "learning_rate": 3.916514001570269e-07,
      "logps/chosen": -158.5875701904297,
      "logps/rejected": -316.0892028808594,
      "loss": 0.2362,
      "rewards/chosen": -0.21511435508728027,
      "rewards/margins": 5.075641632080078,
      "rewards/rejected": -5.2907562255859375,
      "step": 828
    },
    {
      "epoch": 0.22,
      "grad_norm": 24.3665771484375,
      "kl": 0.0,
      "learning_rate": 3.915205443601151e-07,
      "logps/chosen": -233.41485595703125,
      "logps/rejected": -254.3380584716797,
      "loss": 0.3087,
      "rewards/chosen": 0.7938384413719177,
      "rewards/margins": 4.078500747680664,
      "rewards/rejected": -3.2846624851226807,
      "step": 829
    },
    {
      "epoch": 0.22,
      "grad_norm": 26.198406219482422,
      "kl": 0.0,
      "learning_rate": 3.913896885632033e-07,
      "logps/chosen": -134.5040283203125,
      "logps/rejected": -127.63214111328125,
      "loss": 0.2277,
      "rewards/chosen": -0.1338224709033966,
      "rewards/margins": 1.969205379486084,
      "rewards/rejected": -2.103027820587158,
      "step": 830
    },
    {
      "epoch": 0.22,
      "grad_norm": 38.63385772705078,
      "kl": 0.0,
      "learning_rate": 3.912588327662915e-07,
      "logps/chosen": -223.22711181640625,
      "logps/rejected": -281.6058654785156,
      "loss": 0.3643,
      "rewards/chosen": -0.4442996084690094,
      "rewards/margins": 1.7670749425888062,
      "rewards/rejected": -2.211374521255493,
      "step": 831
    },
    {
      "epoch": 0.22,
      "grad_norm": 40.71464538574219,
      "kl": 0.0,
      "learning_rate": 3.911279769693797e-07,
      "logps/chosen": -272.858154296875,
      "logps/rejected": -323.8492126464844,
      "loss": 0.3892,
      "rewards/chosen": -1.7337590456008911,
      "rewards/margins": 2.563958168029785,
      "rewards/rejected": -4.297717094421387,
      "step": 832
    },
    {
      "epoch": 0.22,
      "grad_norm": 41.5645751953125,
      "kl": 0.0,
      "learning_rate": 3.909971211724679e-07,
      "logps/chosen": -202.53445434570312,
      "logps/rejected": -205.5897979736328,
      "loss": 0.3593,
      "rewards/chosen": -0.435829758644104,
      "rewards/margins": 3.275907039642334,
      "rewards/rejected": -3.7117366790771484,
      "step": 833
    },
    {
      "epoch": 0.22,
      "grad_norm": 36.07457733154297,
      "kl": 0.0,
      "learning_rate": 3.908662653755561e-07,
      "logps/chosen": -190.55044555664062,
      "logps/rejected": -263.7154846191406,
      "loss": 0.2823,
      "rewards/chosen": 1.6647402048110962,
      "rewards/margins": 6.3530097007751465,
      "rewards/rejected": -4.68826961517334,
      "step": 834
    },
    {
      "epoch": 0.22,
      "grad_norm": 37.38973617553711,
      "kl": 0.0,
      "learning_rate": 3.907354095786443e-07,
      "logps/chosen": -190.0474090576172,
      "logps/rejected": -329.23077392578125,
      "loss": 0.3735,
      "rewards/chosen": 0.38552647829055786,
      "rewards/margins": 5.102424621582031,
      "rewards/rejected": -4.716897964477539,
      "step": 835
    },
    {
      "epoch": 0.22,
      "grad_norm": 34.40928268432617,
      "kl": 0.0,
      "learning_rate": 3.906045537817325e-07,
      "logps/chosen": -252.2081298828125,
      "logps/rejected": -211.90760803222656,
      "loss": 0.2441,
      "rewards/chosen": 0.08200878649950027,
      "rewards/margins": 3.04998517036438,
      "rewards/rejected": -2.9679763317108154,
      "step": 836
    },
    {
      "epoch": 0.22,
      "grad_norm": 29.55794906616211,
      "kl": 0.0,
      "learning_rate": 3.9047369798482074e-07,
      "logps/chosen": -313.3184814453125,
      "logps/rejected": -272.58660888671875,
      "loss": 0.275,
      "rewards/chosen": 0.34192368388175964,
      "rewards/margins": 4.394167900085449,
      "rewards/rejected": -4.052244186401367,
      "step": 837
    },
    {
      "epoch": 0.22,
      "grad_norm": 41.28226089477539,
      "kl": 0.0,
      "learning_rate": 3.9034284218790894e-07,
      "logps/chosen": -200.53805541992188,
      "logps/rejected": -207.01441955566406,
      "loss": 0.4246,
      "rewards/chosen": -0.12353704869747162,
      "rewards/margins": 2.117542028427124,
      "rewards/rejected": -2.241079092025757,
      "step": 838
    },
    {
      "epoch": 0.22,
      "grad_norm": 35.18424606323242,
      "kl": 0.0,
      "learning_rate": 3.9021198639099713e-07,
      "logps/chosen": -211.63534545898438,
      "logps/rejected": -174.66592407226562,
      "loss": 0.453,
      "rewards/chosen": -1.374969244003296,
      "rewards/margins": 1.2978932857513428,
      "rewards/rejected": -2.6728625297546387,
      "step": 839
    },
    {
      "epoch": 0.22,
      "grad_norm": 25.139108657836914,
      "kl": 0.0,
      "learning_rate": 3.9008113059408533e-07,
      "logps/chosen": -101.32666778564453,
      "logps/rejected": -270.806640625,
      "loss": 0.3132,
      "rewards/chosen": -0.28592658042907715,
      "rewards/margins": 2.875089168548584,
      "rewards/rejected": -3.161015748977661,
      "step": 840
    },
    {
      "epoch": 0.22,
      "grad_norm": 27.314390182495117,
      "kl": 0.0,
      "learning_rate": 3.8995027479717353e-07,
      "logps/chosen": -173.18409729003906,
      "logps/rejected": -310.06292724609375,
      "loss": 0.3556,
      "rewards/chosen": 0.756372332572937,
      "rewards/margins": 4.136096477508545,
      "rewards/rejected": -3.3797240257263184,
      "step": 841
    },
    {
      "epoch": 0.22,
      "grad_norm": 28.0109806060791,
      "kl": 0.0,
      "learning_rate": 3.898194190002617e-07,
      "logps/chosen": -255.11325073242188,
      "logps/rejected": -244.13511657714844,
      "loss": 0.3061,
      "rewards/chosen": -1.0839815139770508,
      "rewards/margins": 2.8800461292266846,
      "rewards/rejected": -3.9640276432037354,
      "step": 842
    },
    {
      "epoch": 0.22,
      "grad_norm": 30.117618560791016,
      "kl": 0.0,
      "learning_rate": 3.896885632033499e-07,
      "logps/chosen": -222.35382080078125,
      "logps/rejected": -156.29025268554688,
      "loss": 0.3124,
      "rewards/chosen": 0.9357742667198181,
      "rewards/margins": 3.4082491397857666,
      "rewards/rejected": -2.4724748134613037,
      "step": 843
    },
    {
      "epoch": 0.22,
      "grad_norm": 32.259674072265625,
      "kl": 0.0,
      "learning_rate": 3.8955770740643806e-07,
      "logps/chosen": -228.333984375,
      "logps/rejected": -230.0655059814453,
      "loss": 0.3317,
      "rewards/chosen": -1.7928178310394287,
      "rewards/margins": 0.8958656787872314,
      "rewards/rejected": -2.68868350982666,
      "step": 844
    },
    {
      "epoch": 0.22,
      "grad_norm": 33.866641998291016,
      "kl": 0.0,
      "learning_rate": 3.8942685160952626e-07,
      "logps/chosen": -162.68394470214844,
      "logps/rejected": -260.8937072753906,
      "loss": 0.2789,
      "rewards/chosen": -0.48359790444374084,
      "rewards/margins": 1.7450591325759888,
      "rewards/rejected": -2.2286570072174072,
      "step": 845
    },
    {
      "epoch": 0.22,
      "grad_norm": 41.97733688354492,
      "kl": 0.0,
      "learning_rate": 3.8929599581261445e-07,
      "logps/chosen": -211.76185607910156,
      "logps/rejected": -203.70687866210938,
      "loss": 0.353,
      "rewards/chosen": -0.6491997838020325,
      "rewards/margins": 1.6363976001739502,
      "rewards/rejected": -2.285597324371338,
      "step": 846
    },
    {
      "epoch": 0.22,
      "grad_norm": 40.185462951660156,
      "kl": 0.0,
      "learning_rate": 3.8916514001570265e-07,
      "logps/chosen": -247.6580047607422,
      "logps/rejected": -251.53753662109375,
      "loss": 0.3429,
      "rewards/chosen": -0.5201565623283386,
      "rewards/margins": 2.708573579788208,
      "rewards/rejected": -3.2287302017211914,
      "step": 847
    },
    {
      "epoch": 0.22,
      "grad_norm": 35.77328872680664,
      "kl": 0.0,
      "learning_rate": 3.8903428421879085e-07,
      "logps/chosen": -221.64239501953125,
      "logps/rejected": -269.6127014160156,
      "loss": 0.4148,
      "rewards/chosen": -1.399546504020691,
      "rewards/margins": 1.6255131959915161,
      "rewards/rejected": -3.025059700012207,
      "step": 848
    },
    {
      "epoch": 0.22,
      "grad_norm": 34.25099182128906,
      "kl": 0.0,
      "learning_rate": 3.8890342842187904e-07,
      "logps/chosen": -205.93341064453125,
      "logps/rejected": -214.56007385253906,
      "loss": 0.3326,
      "rewards/chosen": 0.8295797109603882,
      "rewards/margins": 3.103825569152832,
      "rewards/rejected": -2.2742457389831543,
      "step": 849
    },
    {
      "epoch": 0.22,
      "grad_norm": 34.43983840942383,
      "kl": 0.0,
      "learning_rate": 3.887725726249673e-07,
      "logps/chosen": -381.5731201171875,
      "logps/rejected": -172.75927734375,
      "loss": 0.299,
      "rewards/chosen": -0.3489433228969574,
      "rewards/margins": 4.253884315490723,
      "rewards/rejected": -4.602827548980713,
      "step": 850
    },
    {
      "epoch": 0.22,
      "grad_norm": 34.45042419433594,
      "kl": 0.0,
      "learning_rate": 3.886417168280555e-07,
      "logps/chosen": -308.0503845214844,
      "logps/rejected": -223.3968963623047,
      "loss": 0.2272,
      "rewards/chosen": -1.4904717206954956,
      "rewards/margins": 0.9229556322097778,
      "rewards/rejected": -2.4134273529052734,
      "step": 851
    },
    {
      "epoch": 0.22,
      "grad_norm": 24.188337326049805,
      "kl": 0.0,
      "learning_rate": 3.885108610311437e-07,
      "logps/chosen": -248.4654541015625,
      "logps/rejected": -141.71022033691406,
      "loss": 0.4756,
      "rewards/chosen": -1.6014578342437744,
      "rewards/margins": 1.601494550704956,
      "rewards/rejected": -3.2029523849487305,
      "step": 852
    },
    {
      "epoch": 0.22,
      "grad_norm": 35.598716735839844,
      "kl": 0.0,
      "learning_rate": 3.883800052342319e-07,
      "logps/chosen": -248.89134216308594,
      "logps/rejected": -201.86526489257812,
      "loss": 0.2738,
      "rewards/chosen": -1.4978164434432983,
      "rewards/margins": 1.9217320680618286,
      "rewards/rejected": -3.419548511505127,
      "step": 853
    },
    {
      "epoch": 0.22,
      "grad_norm": 33.07661056518555,
      "kl": 0.0,
      "learning_rate": 3.882491494373201e-07,
      "logps/chosen": -201.11595153808594,
      "logps/rejected": -292.6474914550781,
      "loss": 0.3333,
      "rewards/chosen": -0.014609907753765583,
      "rewards/margins": 3.7823586463928223,
      "rewards/rejected": -3.796968460083008,
      "step": 854
    },
    {
      "epoch": 0.22,
      "grad_norm": 33.12773132324219,
      "kl": 0.0,
      "learning_rate": 3.8811829364040827e-07,
      "logps/chosen": -280.09844970703125,
      "logps/rejected": -193.63916015625,
      "loss": 0.3676,
      "rewards/chosen": -1.4199799299240112,
      "rewards/margins": 0.9965299367904663,
      "rewards/rejected": -2.4165098667144775,
      "step": 855
    },
    {
      "epoch": 0.22,
      "grad_norm": 39.756919860839844,
      "kl": 0.0,
      "learning_rate": 3.8798743784349647e-07,
      "logps/chosen": -260.8897705078125,
      "logps/rejected": -261.7392578125,
      "loss": 0.37,
      "rewards/chosen": 0.2770477533340454,
      "rewards/margins": 3.7935256958007812,
      "rewards/rejected": -3.5164780616760254,
      "step": 856
    },
    {
      "epoch": 0.22,
      "grad_norm": 37.37472152709961,
      "kl": 0.0,
      "learning_rate": 3.8785658204658466e-07,
      "logps/chosen": -322.6257629394531,
      "logps/rejected": -273.3817443847656,
      "loss": 0.2932,
      "rewards/chosen": 0.63135826587677,
      "rewards/margins": 3.3113021850585938,
      "rewards/rejected": -2.6799440383911133,
      "step": 857
    },
    {
      "epoch": 0.22,
      "grad_norm": 28.249853134155273,
      "kl": 0.0,
      "learning_rate": 3.8772572624967286e-07,
      "logps/chosen": -176.0353546142578,
      "logps/rejected": -240.82408142089844,
      "loss": 0.2774,
      "rewards/chosen": -0.18540804088115692,
      "rewards/margins": 3.2752466201782227,
      "rewards/rejected": -3.4606547355651855,
      "step": 858
    },
    {
      "epoch": 0.22,
      "grad_norm": 30.973310470581055,
      "kl": 0.0,
      "learning_rate": 3.87594870452761e-07,
      "logps/chosen": -225.46209716796875,
      "logps/rejected": -178.56396484375,
      "loss": 0.368,
      "rewards/chosen": -1.4540555477142334,
      "rewards/margins": 2.279228925704956,
      "rewards/rejected": -3.7332844734191895,
      "step": 859
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.65345001220703,
      "kl": 0.0,
      "learning_rate": 3.874640146558492e-07,
      "logps/chosen": -363.8309020996094,
      "logps/rejected": -218.1306915283203,
      "loss": 0.261,
      "rewards/chosen": 0.6604343056678772,
      "rewards/margins": 3.1387128829956055,
      "rewards/rejected": -2.478278636932373,
      "step": 860
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.49645233154297,
      "kl": 0.0,
      "learning_rate": 3.873331588589374e-07,
      "logps/chosen": -191.21910095214844,
      "logps/rejected": -216.03738403320312,
      "loss": 0.3289,
      "rewards/chosen": 0.7137885689735413,
      "rewards/margins": 3.4422836303710938,
      "rewards/rejected": -2.7284951210021973,
      "step": 861
    },
    {
      "epoch": 0.23,
      "grad_norm": 37.977264404296875,
      "kl": 0.0,
      "learning_rate": 3.872023030620256e-07,
      "logps/chosen": -245.6107940673828,
      "logps/rejected": -322.30938720703125,
      "loss": 0.4959,
      "rewards/chosen": -0.9423640966415405,
      "rewards/margins": 2.3004255294799805,
      "rewards/rejected": -3.2427897453308105,
      "step": 862
    },
    {
      "epoch": 0.23,
      "grad_norm": 29.341732025146484,
      "kl": 0.0,
      "learning_rate": 3.8707144726511384e-07,
      "logps/chosen": -230.37039184570312,
      "logps/rejected": -185.85226440429688,
      "loss": 0.3602,
      "rewards/chosen": 0.03912970423698425,
      "rewards/margins": 3.828538417816162,
      "rewards/rejected": -3.7894086837768555,
      "step": 863
    },
    {
      "epoch": 0.23,
      "grad_norm": 35.85225296020508,
      "kl": 0.0,
      "learning_rate": 3.8694059146820204e-07,
      "logps/chosen": -275.0382995605469,
      "logps/rejected": -252.705078125,
      "loss": 0.2673,
      "rewards/chosen": -0.8635746836662292,
      "rewards/margins": 2.1875228881835938,
      "rewards/rejected": -3.0510976314544678,
      "step": 864
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.60776901245117,
      "kl": 0.0,
      "learning_rate": 3.8680973567129023e-07,
      "logps/chosen": -262.7815246582031,
      "logps/rejected": -232.62852478027344,
      "loss": 0.2906,
      "rewards/chosen": 1.0779635906219482,
      "rewards/margins": 4.774511337280273,
      "rewards/rejected": -3.696547508239746,
      "step": 865
    },
    {
      "epoch": 0.23,
      "grad_norm": 30.603410720825195,
      "kl": 0.0,
      "learning_rate": 3.8667887987437843e-07,
      "logps/chosen": -282.4066162109375,
      "logps/rejected": -290.36651611328125,
      "loss": 0.3335,
      "rewards/chosen": -0.6011098623275757,
      "rewards/margins": 3.6880950927734375,
      "rewards/rejected": -4.289205074310303,
      "step": 866
    },
    {
      "epoch": 0.23,
      "grad_norm": 26.779672622680664,
      "kl": 0.0,
      "learning_rate": 3.865480240774666e-07,
      "logps/chosen": -200.43505859375,
      "logps/rejected": -276.28826904296875,
      "loss": 0.2913,
      "rewards/chosen": 0.06558636575937271,
      "rewards/margins": 3.628774881362915,
      "rewards/rejected": -3.5631885528564453,
      "step": 867
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.30617141723633,
      "kl": 0.0,
      "learning_rate": 3.864171682805548e-07,
      "logps/chosen": -196.51950073242188,
      "logps/rejected": -198.25823974609375,
      "loss": 0.4009,
      "rewards/chosen": -0.655818521976471,
      "rewards/margins": 1.8044543266296387,
      "rewards/rejected": -2.460272789001465,
      "step": 868
    },
    {
      "epoch": 0.23,
      "grad_norm": 34.12016677856445,
      "kl": 0.0,
      "learning_rate": 3.86286312483643e-07,
      "logps/chosen": -168.55343627929688,
      "logps/rejected": -235.79209899902344,
      "loss": 0.2177,
      "rewards/chosen": 0.27080053091049194,
      "rewards/margins": 2.634535789489746,
      "rewards/rejected": -2.3637351989746094,
      "step": 869
    },
    {
      "epoch": 0.23,
      "grad_norm": 36.46745681762695,
      "kl": 0.0,
      "learning_rate": 3.861554566867312e-07,
      "logps/chosen": -256.4330139160156,
      "logps/rejected": -251.66062927246094,
      "loss": 0.3366,
      "rewards/chosen": 0.3164384663105011,
      "rewards/margins": 3.171847105026245,
      "rewards/rejected": -2.8554086685180664,
      "step": 870
    },
    {
      "epoch": 0.23,
      "grad_norm": 35.32799530029297,
      "kl": 0.0,
      "learning_rate": 3.860246008898194e-07,
      "logps/chosen": -299.51617431640625,
      "logps/rejected": -287.8265075683594,
      "loss": 0.3167,
      "rewards/chosen": 0.08460255712270737,
      "rewards/margins": 3.289461135864258,
      "rewards/rejected": -3.2048585414886475,
      "step": 871
    },
    {
      "epoch": 0.23,
      "grad_norm": 29.19240379333496,
      "kl": 0.0,
      "learning_rate": 3.858937450929076e-07,
      "logps/chosen": -231.63706970214844,
      "logps/rejected": -258.1219482421875,
      "loss": 0.3226,
      "rewards/chosen": -1.3912551403045654,
      "rewards/margins": 1.5225615501403809,
      "rewards/rejected": -2.9138166904449463,
      "step": 872
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.26476287841797,
      "kl": 0.0,
      "learning_rate": 3.857628892959958e-07,
      "logps/chosen": -148.3664093017578,
      "logps/rejected": -233.32034301757812,
      "loss": 0.1882,
      "rewards/chosen": -0.08406653255224228,
      "rewards/margins": 3.803556203842163,
      "rewards/rejected": -3.887622833251953,
      "step": 873
    },
    {
      "epoch": 0.23,
      "grad_norm": 29.396602630615234,
      "kl": 0.0,
      "learning_rate": 3.85632033499084e-07,
      "logps/chosen": -265.3435974121094,
      "logps/rejected": -235.33168029785156,
      "loss": 0.2295,
      "rewards/chosen": -0.5107182860374451,
      "rewards/margins": 3.317000389099121,
      "rewards/rejected": -3.827718734741211,
      "step": 874
    },
    {
      "epoch": 0.23,
      "grad_norm": 35.100589752197266,
      "kl": 0.0,
      "learning_rate": 3.8550117770217214e-07,
      "logps/chosen": -235.42428588867188,
      "logps/rejected": -248.19354248046875,
      "loss": 0.31,
      "rewards/chosen": -2.912658452987671,
      "rewards/margins": 0.17116618156433105,
      "rewards/rejected": -3.083824634552002,
      "step": 875
    },
    {
      "epoch": 0.23,
      "grad_norm": 34.2608528137207,
      "kl": 0.0,
      "learning_rate": 3.853703219052604e-07,
      "logps/chosen": -310.35650634765625,
      "logps/rejected": -265.16485595703125,
      "loss": 0.4798,
      "rewards/chosen": -2.0445914268493652,
      "rewards/margins": 1.4393863677978516,
      "rewards/rejected": -3.483977794647217,
      "step": 876
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.69778060913086,
      "kl": 0.0,
      "learning_rate": 3.852394661083486e-07,
      "logps/chosen": -184.93099975585938,
      "logps/rejected": -218.10235595703125,
      "loss": 0.42,
      "rewards/chosen": -1.306616187095642,
      "rewards/margins": 1.0505958795547485,
      "rewards/rejected": -2.3572120666503906,
      "step": 877
    },
    {
      "epoch": 0.23,
      "grad_norm": 28.923768997192383,
      "kl": 0.0,
      "learning_rate": 3.851086103114368e-07,
      "logps/chosen": -179.19200134277344,
      "logps/rejected": -285.06695556640625,
      "loss": 0.3786,
      "rewards/chosen": -1.5913536548614502,
      "rewards/margins": 3.530456304550171,
      "rewards/rejected": -5.121809959411621,
      "step": 878
    },
    {
      "epoch": 0.23,
      "grad_norm": 30.29254913330078,
      "kl": 0.0,
      "learning_rate": 3.84977754514525e-07,
      "logps/chosen": -225.9005126953125,
      "logps/rejected": -261.67474365234375,
      "loss": 0.2837,
      "rewards/chosen": -0.349773108959198,
      "rewards/margins": 2.802173376083374,
      "rewards/rejected": -3.151946544647217,
      "step": 879
    },
    {
      "epoch": 0.23,
      "grad_norm": 28.67768096923828,
      "kl": 0.0,
      "learning_rate": 3.848468987176132e-07,
      "logps/chosen": -226.13919067382812,
      "logps/rejected": -247.85031127929688,
      "loss": 0.3464,
      "rewards/chosen": -0.07159680128097534,
      "rewards/margins": 3.374845504760742,
      "rewards/rejected": -3.4464423656463623,
      "step": 880
    },
    {
      "epoch": 0.23,
      "grad_norm": 34.244720458984375,
      "kl": 0.0,
      "learning_rate": 3.8471604292070137e-07,
      "logps/chosen": -279.54034423828125,
      "logps/rejected": -245.26930236816406,
      "loss": 0.3628,
      "rewards/chosen": -1.0659271478652954,
      "rewards/margins": 1.1372548341751099,
      "rewards/rejected": -2.2031819820404053,
      "step": 881
    },
    {
      "epoch": 0.23,
      "grad_norm": 26.053386688232422,
      "kl": 0.0,
      "learning_rate": 3.8458518712378957e-07,
      "logps/chosen": -226.8448944091797,
      "logps/rejected": -236.7382049560547,
      "loss": 0.2382,
      "rewards/chosen": -0.6372510194778442,
      "rewards/margins": 4.326333045959473,
      "rewards/rejected": -4.963583946228027,
      "step": 882
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.15485763549805,
      "kl": 0.0,
      "learning_rate": 3.8445433132687776e-07,
      "logps/chosen": -248.4652099609375,
      "logps/rejected": -286.5483703613281,
      "loss": 0.4762,
      "rewards/chosen": -0.453130841255188,
      "rewards/margins": 1.2334181070327759,
      "rewards/rejected": -1.6865489482879639,
      "step": 883
    },
    {
      "epoch": 0.23,
      "grad_norm": 25.893075942993164,
      "kl": 0.0,
      "learning_rate": 3.8432347552996596e-07,
      "logps/chosen": -133.47567749023438,
      "logps/rejected": -277.131103515625,
      "loss": 0.3321,
      "rewards/chosen": -0.7266333699226379,
      "rewards/margins": 3.487426519393921,
      "rewards/rejected": -4.214059829711914,
      "step": 884
    },
    {
      "epoch": 0.23,
      "grad_norm": 36.91793441772461,
      "kl": 0.0,
      "learning_rate": 3.8419261973305415e-07,
      "logps/chosen": -308.4381408691406,
      "logps/rejected": -243.12611389160156,
      "loss": 0.3189,
      "rewards/chosen": -1.1646596193313599,
      "rewards/margins": 2.519777774810791,
      "rewards/rejected": -3.6844375133514404,
      "step": 885
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.67780685424805,
      "kl": 0.0,
      "learning_rate": 3.8406176393614235e-07,
      "logps/chosen": -372.8730163574219,
      "logps/rejected": -238.40402221679688,
      "loss": 0.3381,
      "rewards/chosen": -1.422531247138977,
      "rewards/margins": 0.20666325092315674,
      "rewards/rejected": -1.6291944980621338,
      "step": 886
    },
    {
      "epoch": 0.23,
      "grad_norm": 30.643545150756836,
      "kl": 0.0,
      "learning_rate": 3.8393090813923055e-07,
      "logps/chosen": -182.9138946533203,
      "logps/rejected": -255.97509765625,
      "loss": 0.3896,
      "rewards/chosen": 0.642753005027771,
      "rewards/margins": 5.055312156677246,
      "rewards/rejected": -4.4125590324401855,
      "step": 887
    },
    {
      "epoch": 0.23,
      "grad_norm": 31.73491668701172,
      "kl": 0.0,
      "learning_rate": 3.838000523423188e-07,
      "logps/chosen": -246.7891845703125,
      "logps/rejected": -216.09461975097656,
      "loss": 0.3923,
      "rewards/chosen": 0.15031063556671143,
      "rewards/margins": 3.669569969177246,
      "rewards/rejected": -3.519259452819824,
      "step": 888
    },
    {
      "epoch": 0.23,
      "grad_norm": 33.767276763916016,
      "kl": 0.0,
      "learning_rate": 3.83669196545407e-07,
      "logps/chosen": -181.1678466796875,
      "logps/rejected": -218.5722198486328,
      "loss": 0.3537,
      "rewards/chosen": -0.03673672676086426,
      "rewards/margins": 3.3610239028930664,
      "rewards/rejected": -3.3977606296539307,
      "step": 889
    },
    {
      "epoch": 0.23,
      "grad_norm": 34.10417175292969,
      "kl": 0.0,
      "learning_rate": 3.8353834074849514e-07,
      "logps/chosen": -225.9097900390625,
      "logps/rejected": -223.67532348632812,
      "loss": 0.2948,
      "rewards/chosen": 1.104257583618164,
      "rewards/margins": 2.821953535079956,
      "rewards/rejected": -1.717695951461792,
      "step": 890
    },
    {
      "epoch": 0.23,
      "grad_norm": 31.37761688232422,
      "kl": 0.0,
      "learning_rate": 3.8340748495158333e-07,
      "logps/chosen": -196.47811889648438,
      "logps/rejected": -288.2806396484375,
      "loss": 0.3518,
      "rewards/chosen": -1.7377431392669678,
      "rewards/margins": 4.380807876586914,
      "rewards/rejected": -6.118551254272461,
      "step": 891
    },
    {
      "epoch": 0.23,
      "grad_norm": 29.576194763183594,
      "kl": 0.0,
      "learning_rate": 3.8327662915467153e-07,
      "logps/chosen": -229.60830688476562,
      "logps/rejected": -204.51947021484375,
      "loss": 0.3093,
      "rewards/chosen": -0.2672429084777832,
      "rewards/margins": 3.7184066772460938,
      "rewards/rejected": -3.985649585723877,
      "step": 892
    },
    {
      "epoch": 0.23,
      "grad_norm": 41.917518615722656,
      "kl": 0.0,
      "learning_rate": 3.831457733577597e-07,
      "logps/chosen": -236.6829071044922,
      "logps/rejected": -225.75125122070312,
      "loss": 0.2346,
      "rewards/chosen": -0.9902133941650391,
      "rewards/margins": 2.6195900440216064,
      "rewards/rejected": -3.6098034381866455,
      "step": 893
    },
    {
      "epoch": 0.23,
      "grad_norm": 36.87592697143555,
      "kl": 0.0,
      "learning_rate": 3.830149175608479e-07,
      "logps/chosen": -267.770263671875,
      "logps/rejected": -300.6760559082031,
      "loss": 0.2915,
      "rewards/chosen": 0.8566461801528931,
      "rewards/margins": 3.4000487327575684,
      "rewards/rejected": -2.5434024333953857,
      "step": 894
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.469627380371094,
      "kl": 0.0,
      "learning_rate": 3.828840617639361e-07,
      "logps/chosen": -283.3698425292969,
      "logps/rejected": -317.1371765136719,
      "loss": 0.2433,
      "rewards/chosen": 0.33935195207595825,
      "rewards/margins": 4.569455623626709,
      "rewards/rejected": -4.230103492736816,
      "step": 895
    },
    {
      "epoch": 0.23,
      "grad_norm": 29.65595817565918,
      "kl": 0.0,
      "learning_rate": 3.827532059670243e-07,
      "logps/chosen": -210.16015625,
      "logps/rejected": -274.7388000488281,
      "loss": 0.3122,
      "rewards/chosen": -0.0729527473449707,
      "rewards/margins": 3.0755321979522705,
      "rewards/rejected": -3.148484945297241,
      "step": 896
    },
    {
      "epoch": 0.23,
      "grad_norm": 32.243221282958984,
      "kl": 0.0,
      "learning_rate": 3.826223501701125e-07,
      "logps/chosen": -187.37014770507812,
      "logps/rejected": -288.8951110839844,
      "loss": 0.2861,
      "rewards/chosen": -0.23226681351661682,
      "rewards/margins": 4.278383731842041,
      "rewards/rejected": -4.510650634765625,
      "step": 897
    },
    {
      "epoch": 0.24,
      "grad_norm": 25.51363182067871,
      "kl": 0.0,
      "learning_rate": 3.824914943732007e-07,
      "logps/chosen": -116.35990142822266,
      "logps/rejected": -182.46832275390625,
      "loss": 0.2709,
      "rewards/chosen": 0.5371560454368591,
      "rewards/margins": 2.7382872104644775,
      "rewards/rejected": -2.2011311054229736,
      "step": 898
    },
    {
      "epoch": 0.24,
      "grad_norm": 28.72956085205078,
      "kl": 0.0,
      "learning_rate": 3.823606385762889e-07,
      "logps/chosen": -159.8656768798828,
      "logps/rejected": -194.38632202148438,
      "loss": 0.3667,
      "rewards/chosen": -0.5046699047088623,
      "rewards/margins": 3.2915899753570557,
      "rewards/rejected": -3.796259880065918,
      "step": 899
    },
    {
      "epoch": 0.24,
      "grad_norm": 36.71482467651367,
      "kl": 0.0,
      "learning_rate": 3.822297827793771e-07,
      "logps/chosen": -159.0552978515625,
      "logps/rejected": -281.41558837890625,
      "loss": 0.3563,
      "rewards/chosen": -0.36698049306869507,
      "rewards/margins": 2.405285358428955,
      "rewards/rejected": -2.772265911102295,
      "step": 900
    },
    {
      "epoch": 0.24,
      "grad_norm": 37.9292106628418,
      "kl": 0.0,
      "learning_rate": 3.8209892698246535e-07,
      "logps/chosen": -272.8384094238281,
      "logps/rejected": -299.947021484375,
      "loss": 0.3332,
      "rewards/chosen": -0.057388510555028915,
      "rewards/margins": 3.6654114723205566,
      "rewards/rejected": -3.7228000164031982,
      "step": 901
    },
    {
      "epoch": 0.24,
      "grad_norm": 35.52886199951172,
      "kl": 0.0,
      "learning_rate": 3.8196807118555354e-07,
      "logps/chosen": -202.97857666015625,
      "logps/rejected": -237.06822204589844,
      "loss": 0.2748,
      "rewards/chosen": 0.6133555769920349,
      "rewards/margins": 4.030307292938232,
      "rewards/rejected": -3.4169516563415527,
      "step": 902
    },
    {
      "epoch": 0.24,
      "grad_norm": 24.192476272583008,
      "kl": 0.0,
      "learning_rate": 3.8183721538864174e-07,
      "logps/chosen": -108.77693939208984,
      "logps/rejected": -283.0856628417969,
      "loss": 0.1872,
      "rewards/chosen": -0.7498208284378052,
      "rewards/margins": 0.35967063903808594,
      "rewards/rejected": -1.1094914674758911,
      "step": 903
    },
    {
      "epoch": 0.24,
      "grad_norm": 42.334014892578125,
      "kl": 0.0,
      "learning_rate": 3.8170635959172993e-07,
      "logps/chosen": -282.7518310546875,
      "logps/rejected": -201.28744506835938,
      "loss": 0.3745,
      "rewards/chosen": 0.6112672090530396,
      "rewards/margins": 3.277454376220703,
      "rewards/rejected": -2.666187286376953,
      "step": 904
    },
    {
      "epoch": 0.24,
      "grad_norm": 34.252220153808594,
      "kl": 0.0,
      "learning_rate": 3.815755037948181e-07,
      "logps/chosen": -229.54544067382812,
      "logps/rejected": -201.85923767089844,
      "loss": 0.3428,
      "rewards/chosen": -1.0642545223236084,
      "rewards/margins": 2.2222585678100586,
      "rewards/rejected": -3.286513090133667,
      "step": 905
    },
    {
      "epoch": 0.24,
      "grad_norm": 42.46119689941406,
      "kl": 0.0,
      "learning_rate": 3.8144464799790627e-07,
      "logps/chosen": -321.1701354980469,
      "logps/rejected": -230.4801025390625,
      "loss": 0.3239,
      "rewards/chosen": 0.8055683374404907,
      "rewards/margins": 3.0229482650756836,
      "rewards/rejected": -2.2173800468444824,
      "step": 906
    },
    {
      "epoch": 0.24,
      "grad_norm": 25.848482131958008,
      "kl": 0.0,
      "learning_rate": 3.8131379220099447e-07,
      "logps/chosen": -149.85707092285156,
      "logps/rejected": -245.2413787841797,
      "loss": 0.3329,
      "rewards/chosen": 0.381633460521698,
      "rewards/margins": 5.1404709815979,
      "rewards/rejected": -4.758837699890137,
      "step": 907
    },
    {
      "epoch": 0.24,
      "grad_norm": 42.254573822021484,
      "kl": 0.0,
      "learning_rate": 3.8118293640408266e-07,
      "logps/chosen": -219.65554809570312,
      "logps/rejected": -204.49111938476562,
      "loss": 0.405,
      "rewards/chosen": 0.17839092016220093,
      "rewards/margins": 3.0522470474243164,
      "rewards/rejected": -2.8738560676574707,
      "step": 908
    },
    {
      "epoch": 0.24,
      "grad_norm": 23.2181339263916,
      "kl": 0.0,
      "learning_rate": 3.8105208060717086e-07,
      "logps/chosen": -205.587646484375,
      "logps/rejected": -261.8460998535156,
      "loss": 0.355,
      "rewards/chosen": -0.022454485297203064,
      "rewards/margins": 4.495090007781982,
      "rewards/rejected": -4.517544269561768,
      "step": 909
    },
    {
      "epoch": 0.24,
      "grad_norm": 30.790456771850586,
      "kl": 0.0,
      "learning_rate": 3.8092122481025906e-07,
      "logps/chosen": -284.98150634765625,
      "logps/rejected": -311.32025146484375,
      "loss": 0.3364,
      "rewards/chosen": -0.573701024055481,
      "rewards/margins": 3.3590402603149414,
      "rewards/rejected": -3.932741403579712,
      "step": 910
    },
    {
      "epoch": 0.24,
      "grad_norm": 32.44673538208008,
      "kl": 0.0,
      "learning_rate": 3.8079036901334725e-07,
      "logps/chosen": -270.0657653808594,
      "logps/rejected": -197.69857788085938,
      "loss": 0.3679,
      "rewards/chosen": -0.34433862566947937,
      "rewards/margins": 2.41924786567688,
      "rewards/rejected": -2.7635865211486816,
      "step": 911
    },
    {
      "epoch": 0.24,
      "grad_norm": 33.279083251953125,
      "kl": 0.0,
      "learning_rate": 3.8065951321643545e-07,
      "logps/chosen": -202.7400360107422,
      "logps/rejected": -321.2475891113281,
      "loss": 0.2201,
      "rewards/chosen": 2.0015366077423096,
      "rewards/margins": 5.378347873687744,
      "rewards/rejected": -3.3768112659454346,
      "step": 912
    },
    {
      "epoch": 0.24,
      "grad_norm": 41.26420593261719,
      "kl": 0.0,
      "learning_rate": 3.8052865741952365e-07,
      "logps/chosen": -196.24246215820312,
      "logps/rejected": -219.8006591796875,
      "loss": 0.3636,
      "rewards/chosen": 0.6023075580596924,
      "rewards/margins": 2.8048958778381348,
      "rewards/rejected": -2.2025883197784424,
      "step": 913
    },
    {
      "epoch": 0.24,
      "grad_norm": 24.39764404296875,
      "kl": 0.0,
      "learning_rate": 3.803978016226119e-07,
      "logps/chosen": -209.90945434570312,
      "logps/rejected": -319.9163513183594,
      "loss": 0.3177,
      "rewards/chosen": 0.4894677400588989,
      "rewards/margins": 4.936036109924316,
      "rewards/rejected": -4.446568489074707,
      "step": 914
    },
    {
      "epoch": 0.24,
      "grad_norm": 34.143978118896484,
      "kl": 0.0,
      "learning_rate": 3.802669458257001e-07,
      "logps/chosen": -214.00375366210938,
      "logps/rejected": -250.49264526367188,
      "loss": 0.393,
      "rewards/chosen": 0.20385503768920898,
      "rewards/margins": 3.571147918701172,
      "rewards/rejected": -3.367292881011963,
      "step": 915
    },
    {
      "epoch": 0.24,
      "grad_norm": 36.70888137817383,
      "kl": 0.0,
      "learning_rate": 3.801360900287883e-07,
      "logps/chosen": -231.2817840576172,
      "logps/rejected": -239.27769470214844,
      "loss": 0.3282,
      "rewards/chosen": -0.3268897831439972,
      "rewards/margins": 1.634190320968628,
      "rewards/rejected": -1.9610800743103027,
      "step": 916
    },
    {
      "epoch": 0.24,
      "grad_norm": 27.67314338684082,
      "kl": 0.0,
      "learning_rate": 3.800052342318765e-07,
      "logps/chosen": -184.77256774902344,
      "logps/rejected": -347.36932373046875,
      "loss": 0.257,
      "rewards/chosen": 0.6008602380752563,
      "rewards/margins": 4.103627681732178,
      "rewards/rejected": -3.502767324447632,
      "step": 917
    },
    {
      "epoch": 0.24,
      "grad_norm": 36.47007369995117,
      "kl": 0.0,
      "learning_rate": 3.798743784349647e-07,
      "logps/chosen": -250.0063934326172,
      "logps/rejected": -304.2515869140625,
      "loss": 0.2986,
      "rewards/chosen": -0.7417954802513123,
      "rewards/margins": 4.9903411865234375,
      "rewards/rejected": -5.7321367263793945,
      "step": 918
    },
    {
      "epoch": 0.24,
      "grad_norm": 39.0871467590332,
      "kl": 0.0,
      "learning_rate": 3.797435226380529e-07,
      "logps/chosen": -236.37786865234375,
      "logps/rejected": -192.05194091796875,
      "loss": 0.2918,
      "rewards/chosen": -0.055040180683135986,
      "rewards/margins": 2.505793571472168,
      "rewards/rejected": -2.560833692550659,
      "step": 919
    },
    {
      "epoch": 0.24,
      "grad_norm": 29.87541389465332,
      "kl": 0.0,
      "learning_rate": 3.7961266684114107e-07,
      "logps/chosen": -159.6341094970703,
      "logps/rejected": -225.9227294921875,
      "loss": 0.3817,
      "rewards/chosen": 0.7967673540115356,
      "rewards/margins": 3.12998628616333,
      "rewards/rejected": -2.333218812942505,
      "step": 920
    },
    {
      "epoch": 0.24,
      "grad_norm": 39.75651550292969,
      "kl": 0.0,
      "learning_rate": 3.794818110442292e-07,
      "logps/chosen": -192.595703125,
      "logps/rejected": -275.8404235839844,
      "loss": 0.3581,
      "rewards/chosen": -2.0372021198272705,
      "rewards/margins": -0.12220478057861328,
      "rewards/rejected": -1.9149973392486572,
      "step": 921
    },
    {
      "epoch": 0.24,
      "grad_norm": 26.79347038269043,
      "kl": 0.0,
      "learning_rate": 3.793509552473174e-07,
      "logps/chosen": -189.87782287597656,
      "logps/rejected": -210.1076202392578,
      "loss": 0.1836,
      "rewards/chosen": 1.3419767618179321,
      "rewards/margins": 5.476966381072998,
      "rewards/rejected": -4.1349897384643555,
      "step": 922
    },
    {
      "epoch": 0.24,
      "grad_norm": 43.24937438964844,
      "kl": 0.0,
      "learning_rate": 3.792200994504056e-07,
      "logps/chosen": -194.89622497558594,
      "logps/rejected": -409.5931701660156,
      "loss": 0.4562,
      "rewards/chosen": -1.0916842222213745,
      "rewards/margins": 0.15808093547821045,
      "rewards/rejected": -1.249765157699585,
      "step": 923
    },
    {
      "epoch": 0.24,
      "grad_norm": 36.0235595703125,
      "kl": 0.0,
      "learning_rate": 3.790892436534938e-07,
      "logps/chosen": -212.34266662597656,
      "logps/rejected": -264.5822448730469,
      "loss": 0.3522,
      "rewards/chosen": -0.9759610295295715,
      "rewards/margins": 2.166531562805176,
      "rewards/rejected": -3.1424925327301025,
      "step": 924
    },
    {
      "epoch": 0.24,
      "grad_norm": 39.438472747802734,
      "kl": 0.0,
      "learning_rate": 3.78958387856582e-07,
      "logps/chosen": -199.68655395507812,
      "logps/rejected": -297.06494140625,
      "loss": 0.3211,
      "rewards/chosen": 0.956540584564209,
      "rewards/margins": 2.8180646896362305,
      "rewards/rejected": -1.861523985862732,
      "step": 925
    },
    {
      "epoch": 0.24,
      "grad_norm": 33.72963333129883,
      "kl": 0.0,
      "learning_rate": 3.788275320596702e-07,
      "logps/chosen": -216.25003051757812,
      "logps/rejected": -326.2884826660156,
      "loss": 0.2645,
      "rewards/chosen": -0.2656521499156952,
      "rewards/margins": 4.0716471672058105,
      "rewards/rejected": -4.337299346923828,
      "step": 926
    },
    {
      "epoch": 0.24,
      "grad_norm": 28.301097869873047,
      "kl": 0.0,
      "learning_rate": 3.7869667626275844e-07,
      "logps/chosen": -217.94644165039062,
      "logps/rejected": -200.2283935546875,
      "loss": 0.3988,
      "rewards/chosen": -0.7481070160865784,
      "rewards/margins": 1.4763429164886475,
      "rewards/rejected": -2.224449872970581,
      "step": 927
    },
    {
      "epoch": 0.24,
      "grad_norm": 37.24142837524414,
      "kl": 0.0,
      "learning_rate": 3.7856582046584664e-07,
      "logps/chosen": -178.01388549804688,
      "logps/rejected": -253.77645874023438,
      "loss": 0.42,
      "rewards/chosen": -0.5237429141998291,
      "rewards/margins": 1.6036875247955322,
      "rewards/rejected": -2.1274304389953613,
      "step": 928
    },
    {
      "epoch": 0.24,
      "grad_norm": 38.92591094970703,
      "kl": 0.0,
      "learning_rate": 3.7843496466893484e-07,
      "logps/chosen": -250.7688446044922,
      "logps/rejected": -279.5671691894531,
      "loss": 0.2998,
      "rewards/chosen": 0.08170495927333832,
      "rewards/margins": 2.645064353942871,
      "rewards/rejected": -2.563359498977661,
      "step": 929
    },
    {
      "epoch": 0.24,
      "grad_norm": 41.20185852050781,
      "kl": 0.0,
      "learning_rate": 3.7830410887202303e-07,
      "logps/chosen": -176.0547637939453,
      "logps/rejected": -232.66940307617188,
      "loss": 0.2198,
      "rewards/chosen": 0.13103172183036804,
      "rewards/margins": 2.8757386207580566,
      "rewards/rejected": -2.744706869125366,
      "step": 930
    },
    {
      "epoch": 0.24,
      "grad_norm": 31.94757652282715,
      "kl": 0.0,
      "learning_rate": 3.7817325307511123e-07,
      "logps/chosen": -205.85032653808594,
      "logps/rejected": -168.09828186035156,
      "loss": 0.2974,
      "rewards/chosen": 0.5673466920852661,
      "rewards/margins": 3.6650705337524414,
      "rewards/rejected": -3.097723960876465,
      "step": 931
    },
    {
      "epoch": 0.24,
      "grad_norm": 28.481760025024414,
      "kl": 0.0,
      "learning_rate": 3.780423972781994e-07,
      "logps/chosen": -194.0419464111328,
      "logps/rejected": -204.07566833496094,
      "loss": 0.3024,
      "rewards/chosen": -1.1814078092575073,
      "rewards/margins": 2.4324450492858887,
      "rewards/rejected": -3.6138527393341064,
      "step": 932
    },
    {
      "epoch": 0.24,
      "grad_norm": 36.05358123779297,
      "kl": 0.0,
      "learning_rate": 3.779115414812876e-07,
      "logps/chosen": -160.29495239257812,
      "logps/rejected": -250.9978790283203,
      "loss": 0.3588,
      "rewards/chosen": -0.2665542662143707,
      "rewards/margins": 2.5105628967285156,
      "rewards/rejected": -2.7771172523498535,
      "step": 933
    },
    {
      "epoch": 0.24,
      "grad_norm": 37.9229736328125,
      "kl": 0.0,
      "learning_rate": 3.777806856843758e-07,
      "logps/chosen": -296.37762451171875,
      "logps/rejected": -274.84759521484375,
      "loss": 0.3803,
      "rewards/chosen": 0.46562808752059937,
      "rewards/margins": 2.7332608699798584,
      "rewards/rejected": -2.2676327228546143,
      "step": 934
    },
    {
      "epoch": 0.24,
      "grad_norm": 32.8103141784668,
      "kl": 0.0,
      "learning_rate": 3.77649829887464e-07,
      "logps/chosen": -194.97434997558594,
      "logps/rejected": -187.0049285888672,
      "loss": 0.3435,
      "rewards/chosen": -0.5105401873588562,
      "rewards/margins": 2.990797281265259,
      "rewards/rejected": -3.5013375282287598,
      "step": 935
    },
    {
      "epoch": 0.24,
      "grad_norm": 38.560508728027344,
      "kl": 0.0,
      "learning_rate": 3.7751897409055216e-07,
      "logps/chosen": -317.18994140625,
      "logps/rejected": -281.34332275390625,
      "loss": 0.3472,
      "rewards/chosen": -1.0646255016326904,
      "rewards/margins": 0.3802691698074341,
      "rewards/rejected": -1.4448946714401245,
      "step": 936
    },
    {
      "epoch": 0.25,
      "grad_norm": 38.34934997558594,
      "kl": 0.0,
      "learning_rate": 3.7738811829364035e-07,
      "logps/chosen": -191.30926513671875,
      "logps/rejected": -268.2271728515625,
      "loss": 0.3716,
      "rewards/chosen": -0.6462159752845764,
      "rewards/margins": 1.863279104232788,
      "rewards/rejected": -2.5094950199127197,
      "step": 937
    },
    {
      "epoch": 0.25,
      "grad_norm": 46.346153259277344,
      "kl": 0.0,
      "learning_rate": 3.7725726249672855e-07,
      "logps/chosen": -325.8685302734375,
      "logps/rejected": -253.2593231201172,
      "loss": 0.4326,
      "rewards/chosen": 0.49079614877700806,
      "rewards/margins": 1.7913362979888916,
      "rewards/rejected": -1.3005400896072388,
      "step": 938
    },
    {
      "epoch": 0.25,
      "grad_norm": 36.08719253540039,
      "kl": 0.0,
      "learning_rate": 3.7712640669981674e-07,
      "logps/chosen": -213.1573486328125,
      "logps/rejected": -184.4655303955078,
      "loss": 0.3137,
      "rewards/chosen": -0.3603510856628418,
      "rewards/margins": 3.5041239261627197,
      "rewards/rejected": -3.8644750118255615,
      "step": 939
    },
    {
      "epoch": 0.25,
      "grad_norm": 37.11168670654297,
      "kl": 0.0,
      "learning_rate": 3.76995550902905e-07,
      "logps/chosen": -281.20904541015625,
      "logps/rejected": -166.6403045654297,
      "loss": 0.3994,
      "rewards/chosen": -1.179037094116211,
      "rewards/margins": 0.9344363212585449,
      "rewards/rejected": -2.113473415374756,
      "step": 940
    },
    {
      "epoch": 0.25,
      "grad_norm": 40.791709899902344,
      "kl": 0.0,
      "learning_rate": 3.768646951059932e-07,
      "logps/chosen": -173.112548828125,
      "logps/rejected": -214.59646606445312,
      "loss": 0.3807,
      "rewards/chosen": 0.023116042837500572,
      "rewards/margins": 2.422356128692627,
      "rewards/rejected": -2.399240016937256,
      "step": 941
    },
    {
      "epoch": 0.25,
      "grad_norm": 33.149959564208984,
      "kl": 0.0,
      "learning_rate": 3.767338393090814e-07,
      "logps/chosen": -169.68991088867188,
      "logps/rejected": -309.6609191894531,
      "loss": 0.2299,
      "rewards/chosen": 0.19015148282051086,
      "rewards/margins": 3.0547852516174316,
      "rewards/rejected": -2.864633798599243,
      "step": 942
    },
    {
      "epoch": 0.25,
      "grad_norm": 39.015960693359375,
      "kl": 0.0,
      "learning_rate": 3.766029835121696e-07,
      "logps/chosen": -251.8404998779297,
      "logps/rejected": -208.2914581298828,
      "loss": 0.332,
      "rewards/chosen": 0.2635638117790222,
      "rewards/margins": 2.563786745071411,
      "rewards/rejected": -2.300222873687744,
      "step": 943
    },
    {
      "epoch": 0.25,
      "grad_norm": 38.60781478881836,
      "kl": 0.0,
      "learning_rate": 3.764721277152578e-07,
      "logps/chosen": -215.9370574951172,
      "logps/rejected": -249.64276123046875,
      "loss": 0.3206,
      "rewards/chosen": -0.013658404350280762,
      "rewards/margins": 3.1306090354919434,
      "rewards/rejected": -3.1442675590515137,
      "step": 944
    },
    {
      "epoch": 0.25,
      "grad_norm": 28.41880989074707,
      "kl": 0.0,
      "learning_rate": 3.7634127191834597e-07,
      "logps/chosen": -146.95925903320312,
      "logps/rejected": -206.71322631835938,
      "loss": 0.3208,
      "rewards/chosen": -0.11132961511611938,
      "rewards/margins": 3.1574666500091553,
      "rewards/rejected": -3.26879620552063,
      "step": 945
    },
    {
      "epoch": 0.25,
      "grad_norm": 38.186038970947266,
      "kl": 0.0,
      "learning_rate": 3.7621041612143417e-07,
      "logps/chosen": -236.7716827392578,
      "logps/rejected": -213.30288696289062,
      "loss": 0.3769,
      "rewards/chosen": -0.7647838592529297,
      "rewards/margins": 4.22573184967041,
      "rewards/rejected": -4.99051570892334,
      "step": 946
    },
    {
      "epoch": 0.25,
      "grad_norm": 38.70585250854492,
      "kl": 0.0,
      "learning_rate": 3.7607956032452237e-07,
      "logps/chosen": -191.19248962402344,
      "logps/rejected": -237.6758575439453,
      "loss": 0.3382,
      "rewards/chosen": -0.5101767778396606,
      "rewards/margins": 1.7929340600967407,
      "rewards/rejected": -2.3031108379364014,
      "step": 947
    },
    {
      "epoch": 0.25,
      "grad_norm": 36.55864334106445,
      "kl": 0.0,
      "learning_rate": 3.7594870452761056e-07,
      "logps/chosen": -240.2667236328125,
      "logps/rejected": -323.38397216796875,
      "loss": 0.3667,
      "rewards/chosen": -0.042002975940704346,
      "rewards/margins": 2.5760934352874756,
      "rewards/rejected": -2.618096351623535,
      "step": 948
    },
    {
      "epoch": 0.25,
      "grad_norm": 32.7099494934082,
      "kl": 0.0,
      "learning_rate": 3.7581784873069876e-07,
      "logps/chosen": -211.2158203125,
      "logps/rejected": -200.59913635253906,
      "loss": 0.2249,
      "rewards/chosen": 0.7871925234794617,
      "rewards/margins": 3.0763463973999023,
      "rewards/rejected": -2.289153814315796,
      "step": 949
    },
    {
      "epoch": 0.25,
      "grad_norm": 28.505664825439453,
      "kl": 0.0,
      "learning_rate": 3.7568699293378695e-07,
      "logps/chosen": -158.95338439941406,
      "logps/rejected": -240.470703125,
      "loss": 0.269,
      "rewards/chosen": -0.5699848532676697,
      "rewards/margins": 2.3539459705352783,
      "rewards/rejected": -2.9239308834075928,
      "step": 950
    },
    {
      "epoch": 0.25,
      "grad_norm": 28.49224281311035,
      "kl": 0.0,
      "learning_rate": 3.7555613713687515e-07,
      "logps/chosen": -180.0927734375,
      "logps/rejected": -222.00131225585938,
      "loss": 0.2562,
      "rewards/chosen": 1.0241466760635376,
      "rewards/margins": 4.006009578704834,
      "rewards/rejected": -2.981863021850586,
      "step": 951
    },
    {
      "epoch": 0.25,
      "grad_norm": 43.41466522216797,
      "kl": 0.0,
      "learning_rate": 3.754252813399633e-07,
      "logps/chosen": -233.43089294433594,
      "logps/rejected": -260.57025146484375,
      "loss": 0.2668,
      "rewards/chosen": -0.16725710034370422,
      "rewards/margins": 3.9502956867218018,
      "rewards/rejected": -4.117552757263184,
      "step": 952
    },
    {
      "epoch": 0.25,
      "grad_norm": 27.8471736907959,
      "kl": 0.0,
      "learning_rate": 3.7529442554305154e-07,
      "logps/chosen": -264.3834228515625,
      "logps/rejected": -317.200927734375,
      "loss": 0.2867,
      "rewards/chosen": -0.396359920501709,
      "rewards/margins": 2.800766944885254,
      "rewards/rejected": -3.197126865386963,
      "step": 953
    },
    {
      "epoch": 0.25,
      "grad_norm": 34.129188537597656,
      "kl": 0.0,
      "learning_rate": 3.7516356974613974e-07,
      "logps/chosen": -183.26345825195312,
      "logps/rejected": -237.68011474609375,
      "loss": 0.3368,
      "rewards/chosen": 0.26516619324684143,
      "rewards/margins": 3.0184478759765625,
      "rewards/rejected": -2.753281593322754,
      "step": 954
    },
    {
      "epoch": 0.25,
      "grad_norm": 34.6745491027832,
      "kl": 0.0,
      "learning_rate": 3.7503271394922793e-07,
      "logps/chosen": -284.04638671875,
      "logps/rejected": -179.2754669189453,
      "loss": 0.348,
      "rewards/chosen": 0.24357259273529053,
      "rewards/margins": 2.473604679107666,
      "rewards/rejected": -2.230031967163086,
      "step": 955
    },
    {
      "epoch": 0.25,
      "grad_norm": 31.912418365478516,
      "kl": 0.0,
      "learning_rate": 3.7490185815231613e-07,
      "logps/chosen": -135.66848754882812,
      "logps/rejected": -251.6403045654297,
      "loss": 0.357,
      "rewards/chosen": -0.8683047890663147,
      "rewards/margins": 2.402848482131958,
      "rewards/rejected": -3.271153211593628,
      "step": 956
    },
    {
      "epoch": 0.25,
      "grad_norm": 38.589805603027344,
      "kl": 0.0,
      "learning_rate": 3.747710023554043e-07,
      "logps/chosen": -295.62432861328125,
      "logps/rejected": -193.68881225585938,
      "loss": 0.3627,
      "rewards/chosen": -0.835448145866394,
      "rewards/margins": 1.8972326517105103,
      "rewards/rejected": -2.7326807975769043,
      "step": 957
    },
    {
      "epoch": 0.25,
      "grad_norm": 33.903377532958984,
      "kl": 0.0,
      "learning_rate": 3.746401465584925e-07,
      "logps/chosen": -220.63259887695312,
      "logps/rejected": -307.62664794921875,
      "loss": 0.2633,
      "rewards/chosen": 1.0228039026260376,
      "rewards/margins": 5.325528144836426,
      "rewards/rejected": -4.302724361419678,
      "step": 958
    },
    {
      "epoch": 0.25,
      "grad_norm": 42.30472946166992,
      "kl": 0.0,
      "learning_rate": 3.745092907615807e-07,
      "logps/chosen": -271.06011962890625,
      "logps/rejected": -179.56674194335938,
      "loss": 0.3717,
      "rewards/chosen": -0.6451493501663208,
      "rewards/margins": 2.17818021774292,
      "rewards/rejected": -2.8233296871185303,
      "step": 959
    },
    {
      "epoch": 0.25,
      "grad_norm": 44.385440826416016,
      "kl": 0.0,
      "learning_rate": 3.743784349646689e-07,
      "logps/chosen": -278.98663330078125,
      "logps/rejected": -321.0003356933594,
      "loss": 0.3034,
      "rewards/chosen": -0.3178820312023163,
      "rewards/margins": 2.567906379699707,
      "rewards/rejected": -2.8857884407043457,
      "step": 960
    },
    {
      "epoch": 0.25,
      "grad_norm": 30.137678146362305,
      "kl": 0.0,
      "learning_rate": 3.742475791677571e-07,
      "logps/chosen": -188.7030029296875,
      "logps/rejected": -228.55557250976562,
      "loss": 0.4215,
      "rewards/chosen": -0.25972381234169006,
      "rewards/margins": 2.330620765686035,
      "rewards/rejected": -2.5903446674346924,
      "step": 961
    },
    {
      "epoch": 0.25,
      "grad_norm": 31.367036819458008,
      "kl": 0.0,
      "learning_rate": 3.741167233708453e-07,
      "logps/chosen": -173.17393493652344,
      "logps/rejected": -288.6369323730469,
      "loss": 0.3492,
      "rewards/chosen": -0.5615468621253967,
      "rewards/margins": 3.085667371749878,
      "rewards/rejected": -3.64721417427063,
      "step": 962
    },
    {
      "epoch": 0.25,
      "grad_norm": 36.26879119873047,
      "kl": 0.0,
      "learning_rate": 3.739858675739335e-07,
      "logps/chosen": -234.41134643554688,
      "logps/rejected": -241.38648986816406,
      "loss": 0.388,
      "rewards/chosen": -0.6199156641960144,
      "rewards/margins": 2.953280210494995,
      "rewards/rejected": -3.5731959342956543,
      "step": 963
    },
    {
      "epoch": 0.25,
      "grad_norm": 37.49041748046875,
      "kl": 0.0,
      "learning_rate": 3.738550117770217e-07,
      "logps/chosen": -270.44732666015625,
      "logps/rejected": -258.4364929199219,
      "loss": 0.3476,
      "rewards/chosen": -0.3076787292957306,
      "rewards/margins": 2.230152130126953,
      "rewards/rejected": -2.5378308296203613,
      "step": 964
    },
    {
      "epoch": 0.25,
      "grad_norm": 31.639141082763672,
      "kl": 0.0,
      "learning_rate": 3.7372415598010995e-07,
      "logps/chosen": -190.83172607421875,
      "logps/rejected": -251.1099853515625,
      "loss": 0.3503,
      "rewards/chosen": 0.7067262530326843,
      "rewards/margins": 3.0306055545806885,
      "rewards/rejected": -2.3238792419433594,
      "step": 965
    },
    {
      "epoch": 0.25,
      "grad_norm": 33.14474105834961,
      "kl": 0.0,
      "learning_rate": 3.7359330018319814e-07,
      "logps/chosen": -229.0324249267578,
      "logps/rejected": -255.16261291503906,
      "loss": 0.2465,
      "rewards/chosen": -0.09261378645896912,
      "rewards/margins": 2.950747013092041,
      "rewards/rejected": -3.043360710144043,
      "step": 966
    },
    {
      "epoch": 0.25,
      "grad_norm": 22.569007873535156,
      "kl": 0.0,
      "learning_rate": 3.734624443862863e-07,
      "logps/chosen": -254.2834930419922,
      "logps/rejected": -233.958251953125,
      "loss": 0.2995,
      "rewards/chosen": -0.8716516494750977,
      "rewards/margins": 3.8399643898010254,
      "rewards/rejected": -4.711616039276123,
      "step": 967
    },
    {
      "epoch": 0.25,
      "grad_norm": 46.63541793823242,
      "kl": 0.0,
      "learning_rate": 3.733315885893745e-07,
      "logps/chosen": -187.2761993408203,
      "logps/rejected": -234.6182098388672,
      "loss": 0.2998,
      "rewards/chosen": -0.7562770247459412,
      "rewards/margins": 1.2845125198364258,
      "rewards/rejected": -2.0407896041870117,
      "step": 968
    },
    {
      "epoch": 0.25,
      "grad_norm": 37.27079772949219,
      "kl": 0.0,
      "learning_rate": 3.732007327924627e-07,
      "logps/chosen": -147.63406372070312,
      "logps/rejected": -245.00823974609375,
      "loss": 0.3315,
      "rewards/chosen": -0.24929417669773102,
      "rewards/margins": 3.6097397804260254,
      "rewards/rejected": -3.8590340614318848,
      "step": 969
    },
    {
      "epoch": 0.25,
      "grad_norm": 28.529054641723633,
      "kl": 0.0,
      "learning_rate": 3.730698769955509e-07,
      "logps/chosen": -217.55023193359375,
      "logps/rejected": -245.34091186523438,
      "loss": 0.2898,
      "rewards/chosen": 0.5950266122817993,
      "rewards/margins": 3.8983559608459473,
      "rewards/rejected": -3.3033292293548584,
      "step": 970
    },
    {
      "epoch": 0.25,
      "grad_norm": 33.69866943359375,
      "kl": 0.0,
      "learning_rate": 3.7293902119863907e-07,
      "logps/chosen": -273.6669921875,
      "logps/rejected": -220.08108520507812,
      "loss": 0.336,
      "rewards/chosen": -0.6207393407821655,
      "rewards/margins": 3.184846878051758,
      "rewards/rejected": -3.805586099624634,
      "step": 971
    },
    {
      "epoch": 0.25,
      "grad_norm": 33.63166427612305,
      "kl": 0.0,
      "learning_rate": 3.7280816540172727e-07,
      "logps/chosen": -204.73757934570312,
      "logps/rejected": -367.3917236328125,
      "loss": 0.1605,
      "rewards/chosen": 0.2242099791765213,
      "rewards/margins": 4.352242469787598,
      "rewards/rejected": -4.128032684326172,
      "step": 972
    },
    {
      "epoch": 0.25,
      "grad_norm": 37.7935791015625,
      "kl": 0.0,
      "learning_rate": 3.7267730960481546e-07,
      "logps/chosen": -163.07469177246094,
      "logps/rejected": -307.3078918457031,
      "loss": 0.3856,
      "rewards/chosen": -0.6944271922111511,
      "rewards/margins": 4.600676536560059,
      "rewards/rejected": -5.295103549957275,
      "step": 973
    },
    {
      "epoch": 0.25,
      "grad_norm": 27.959571838378906,
      "kl": 0.0,
      "learning_rate": 3.7254645380790366e-07,
      "logps/chosen": -164.4769287109375,
      "logps/rejected": -240.49453735351562,
      "loss": 0.3669,
      "rewards/chosen": -0.3397435247898102,
      "rewards/margins": 7.074169158935547,
      "rewards/rejected": -7.413912773132324,
      "step": 974
    },
    {
      "epoch": 0.26,
      "grad_norm": 28.11006736755371,
      "kl": 0.0,
      "learning_rate": 3.7241559801099186e-07,
      "logps/chosen": -197.59146118164062,
      "logps/rejected": -236.4452362060547,
      "loss": 0.3146,
      "rewards/chosen": 0.7232022881507874,
      "rewards/margins": 3.5962724685668945,
      "rewards/rejected": -2.873070240020752,
      "step": 975
    },
    {
      "epoch": 0.26,
      "grad_norm": 29.414844512939453,
      "kl": 0.0,
      "learning_rate": 3.7228474221408005e-07,
      "logps/chosen": -287.6759338378906,
      "logps/rejected": -163.3478546142578,
      "loss": 0.2951,
      "rewards/chosen": -0.5998216271400452,
      "rewards/margins": 2.333730936050415,
      "rewards/rejected": -2.9335525035858154,
      "step": 976
    },
    {
      "epoch": 0.26,
      "grad_norm": 42.738162994384766,
      "kl": 0.0,
      "learning_rate": 3.7215388641716825e-07,
      "logps/chosen": -211.0493927001953,
      "logps/rejected": -292.9945068359375,
      "loss": 0.3142,
      "rewards/chosen": -0.3580980896949768,
      "rewards/margins": 2.8586883544921875,
      "rewards/rejected": -3.2167863845825195,
      "step": 977
    },
    {
      "epoch": 0.26,
      "grad_norm": 26.3310546875,
      "kl": 0.0,
      "learning_rate": 3.720230306202565e-07,
      "logps/chosen": -208.38685607910156,
      "logps/rejected": -178.67808532714844,
      "loss": 0.2584,
      "rewards/chosen": -0.2977018356323242,
      "rewards/margins": 2.307711601257324,
      "rewards/rejected": -2.6054134368896484,
      "step": 978
    },
    {
      "epoch": 0.26,
      "grad_norm": 28.863971710205078,
      "kl": 0.0,
      "learning_rate": 3.718921748233447e-07,
      "logps/chosen": -194.7991180419922,
      "logps/rejected": -223.70895385742188,
      "loss": 0.2059,
      "rewards/chosen": 0.4047398567199707,
      "rewards/margins": 4.218716621398926,
      "rewards/rejected": -3.813977003097534,
      "step": 979
    },
    {
      "epoch": 0.26,
      "grad_norm": 45.329856872558594,
      "kl": 0.0,
      "learning_rate": 3.717613190264329e-07,
      "logps/chosen": -277.4205627441406,
      "logps/rejected": -271.2000732421875,
      "loss": 0.4038,
      "rewards/chosen": -0.817046046257019,
      "rewards/margins": 4.592546463012695,
      "rewards/rejected": -5.409592628479004,
      "step": 980
    },
    {
      "epoch": 0.26,
      "grad_norm": 37.37732696533203,
      "kl": 0.0,
      "learning_rate": 3.716304632295211e-07,
      "logps/chosen": -165.05406188964844,
      "logps/rejected": -333.9656677246094,
      "loss": 0.3201,
      "rewards/chosen": -0.2642420828342438,
      "rewards/margins": 3.700796604156494,
      "rewards/rejected": -3.965038776397705,
      "step": 981
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.896753311157227,
      "kl": 0.0,
      "learning_rate": 3.714996074326093e-07,
      "logps/chosen": -220.74844360351562,
      "logps/rejected": -204.1945037841797,
      "loss": 0.4103,
      "rewards/chosen": -1.318084716796875,
      "rewards/margins": 1.4971325397491455,
      "rewards/rejected": -2.8152172565460205,
      "step": 982
    },
    {
      "epoch": 0.26,
      "grad_norm": 39.432945251464844,
      "kl": 0.0,
      "learning_rate": 3.713687516356974e-07,
      "logps/chosen": -197.0574951171875,
      "logps/rejected": -227.69265747070312,
      "loss": 0.3001,
      "rewards/chosen": -0.5320191979408264,
      "rewards/margins": 2.9302799701690674,
      "rewards/rejected": -3.462299108505249,
      "step": 983
    },
    {
      "epoch": 0.26,
      "grad_norm": 35.62767028808594,
      "kl": 0.0,
      "learning_rate": 3.712378958387856e-07,
      "logps/chosen": -236.9004364013672,
      "logps/rejected": -172.47108459472656,
      "loss": 0.2269,
      "rewards/chosen": 1.4592981338500977,
      "rewards/margins": 4.804227828979492,
      "rewards/rejected": -3.3449299335479736,
      "step": 984
    },
    {
      "epoch": 0.26,
      "grad_norm": 34.60026550292969,
      "kl": 0.0,
      "learning_rate": 3.711070400418738e-07,
      "logps/chosen": -221.7421112060547,
      "logps/rejected": -215.35638427734375,
      "loss": 0.3953,
      "rewards/chosen": -1.0239485502243042,
      "rewards/margins": 1.4995425939559937,
      "rewards/rejected": -2.523491144180298,
      "step": 985
    },
    {
      "epoch": 0.26,
      "grad_norm": 36.66464614868164,
      "kl": 0.0,
      "learning_rate": 3.70976184244962e-07,
      "logps/chosen": -286.3569030761719,
      "logps/rejected": -235.4409942626953,
      "loss": 0.3558,
      "rewards/chosen": -0.6151680946350098,
      "rewards/margins": 2.8036887645721436,
      "rewards/rejected": -3.4188568592071533,
      "step": 986
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.001144409179688,
      "kl": 0.0,
      "learning_rate": 3.708453284480502e-07,
      "logps/chosen": -172.70333862304688,
      "logps/rejected": -230.7540740966797,
      "loss": 0.2102,
      "rewards/chosen": -1.314502477645874,
      "rewards/margins": 3.040217161178589,
      "rewards/rejected": -4.354719638824463,
      "step": 987
    },
    {
      "epoch": 0.26,
      "grad_norm": 36.64275360107422,
      "kl": 0.0,
      "learning_rate": 3.707144726511384e-07,
      "logps/chosen": -308.76202392578125,
      "logps/rejected": -272.7708435058594,
      "loss": 0.3881,
      "rewards/chosen": 0.16566914319992065,
      "rewards/margins": 2.5625765323638916,
      "rewards/rejected": -2.396907329559326,
      "step": 988
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.150535583496094,
      "kl": 0.0,
      "learning_rate": 3.705836168542266e-07,
      "logps/chosen": -214.9908447265625,
      "logps/rejected": -304.88226318359375,
      "loss": 0.224,
      "rewards/chosen": 2.17590594291687,
      "rewards/margins": 6.284664154052734,
      "rewards/rejected": -4.108758449554443,
      "step": 989
    },
    {
      "epoch": 0.26,
      "grad_norm": 37.415401458740234,
      "kl": 0.0,
      "learning_rate": 3.704527610573148e-07,
      "logps/chosen": -260.45904541015625,
      "logps/rejected": -256.9763488769531,
      "loss": 0.3893,
      "rewards/chosen": 1.2356760501861572,
      "rewards/margins": 3.375903367996216,
      "rewards/rejected": -2.1402273178100586,
      "step": 990
    },
    {
      "epoch": 0.26,
      "grad_norm": 38.0907096862793,
      "kl": 0.0,
      "learning_rate": 3.7032190526040305e-07,
      "logps/chosen": -186.80435180664062,
      "logps/rejected": -257.00506591796875,
      "loss": 0.4833,
      "rewards/chosen": -0.41919440031051636,
      "rewards/margins": 0.7835555672645569,
      "rewards/rejected": -1.2027499675750732,
      "step": 991
    },
    {
      "epoch": 0.26,
      "grad_norm": 30.297321319580078,
      "kl": 0.0,
      "learning_rate": 3.7019104946349124e-07,
      "logps/chosen": -231.01339721679688,
      "logps/rejected": -323.2631530761719,
      "loss": 0.171,
      "rewards/chosen": 0.8052429556846619,
      "rewards/margins": 5.508130073547363,
      "rewards/rejected": -4.702887058258057,
      "step": 992
    },
    {
      "epoch": 0.26,
      "grad_norm": 28.98295783996582,
      "kl": 0.0,
      "learning_rate": 3.7006019366657944e-07,
      "logps/chosen": -195.5869903564453,
      "logps/rejected": -368.0828857421875,
      "loss": 0.2751,
      "rewards/chosen": 0.75277179479599,
      "rewards/margins": 3.5699214935302734,
      "rewards/rejected": -2.8171496391296387,
      "step": 993
    },
    {
      "epoch": 0.26,
      "grad_norm": 41.256099700927734,
      "kl": 0.0,
      "learning_rate": 3.6992933786966763e-07,
      "logps/chosen": -282.90325927734375,
      "logps/rejected": -265.955078125,
      "loss": 0.2973,
      "rewards/chosen": -0.04004257172346115,
      "rewards/margins": 2.438786268234253,
      "rewards/rejected": -2.4788289070129395,
      "step": 994
    },
    {
      "epoch": 0.26,
      "grad_norm": 40.493995666503906,
      "kl": 0.0,
      "learning_rate": 3.6979848207275583e-07,
      "logps/chosen": -198.56317138671875,
      "logps/rejected": -287.1308288574219,
      "loss": 0.2858,
      "rewards/chosen": 0.22279243171215057,
      "rewards/margins": 3.379775285720825,
      "rewards/rejected": -3.156982898712158,
      "step": 995
    },
    {
      "epoch": 0.26,
      "grad_norm": 38.28390121459961,
      "kl": 0.0,
      "learning_rate": 3.6966762627584403e-07,
      "logps/chosen": -252.07237243652344,
      "logps/rejected": -300.477294921875,
      "loss": 0.4081,
      "rewards/chosen": 0.12824928760528564,
      "rewards/margins": 3.357633590698242,
      "rewards/rejected": -3.229384422302246,
      "step": 996
    },
    {
      "epoch": 0.26,
      "grad_norm": 30.633607864379883,
      "kl": 0.0,
      "learning_rate": 3.695367704789322e-07,
      "logps/chosen": -268.8228454589844,
      "logps/rejected": -261.2051696777344,
      "loss": 0.2608,
      "rewards/chosen": 0.4649485945701599,
      "rewards/margins": 3.7446982860565186,
      "rewards/rejected": -3.279749631881714,
      "step": 997
    },
    {
      "epoch": 0.26,
      "grad_norm": 36.42918395996094,
      "kl": 0.0,
      "learning_rate": 3.6940591468202037e-07,
      "logps/chosen": -346.8229675292969,
      "logps/rejected": -241.703857421875,
      "loss": 0.2317,
      "rewards/chosen": 0.23028622567653656,
      "rewards/margins": 3.4296822547912598,
      "rewards/rejected": -3.1993961334228516,
      "step": 998
    },
    {
      "epoch": 0.26,
      "grad_norm": 39.02206802368164,
      "kl": 0.0,
      "learning_rate": 3.6927505888510856e-07,
      "logps/chosen": -216.50253295898438,
      "logps/rejected": -127.69355773925781,
      "loss": 0.3738,
      "rewards/chosen": -0.8492110967636108,
      "rewards/margins": 1.4893790483474731,
      "rewards/rejected": -2.338590145111084,
      "step": 999
    },
    {
      "epoch": 0.26,
      "grad_norm": 30.959394454956055,
      "kl": 0.0,
      "learning_rate": 3.6914420308819676e-07,
      "logps/chosen": -235.33822631835938,
      "logps/rejected": -237.93800354003906,
      "loss": 0.293,
      "rewards/chosen": -0.13448606431484222,
      "rewards/margins": 3.330787181854248,
      "rewards/rejected": -3.465273141860962,
      "step": 1000
    },
    {
      "epoch": 0.26,
      "grad_norm": 40.610050201416016,
      "kl": 0.0,
      "learning_rate": 3.6901334729128495e-07,
      "logps/chosen": -264.3585510253906,
      "logps/rejected": -255.59652709960938,
      "loss": 0.3573,
      "rewards/chosen": 0.12295226007699966,
      "rewards/margins": 1.9584699869155884,
      "rewards/rejected": -1.8355177640914917,
      "step": 1001
    },
    {
      "epoch": 0.26,
      "grad_norm": 22.982267379760742,
      "kl": 0.0,
      "learning_rate": 3.6888249149437315e-07,
      "logps/chosen": -179.1318817138672,
      "logps/rejected": -223.6998291015625,
      "loss": 0.2512,
      "rewards/chosen": 0.2519915997982025,
      "rewards/margins": 5.339731693267822,
      "rewards/rejected": -5.087739944458008,
      "step": 1002
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.051963806152344,
      "kl": 0.0,
      "learning_rate": 3.6875163569746135e-07,
      "logps/chosen": -215.51844787597656,
      "logps/rejected": -269.37322998046875,
      "loss": 0.3854,
      "rewards/chosen": -0.3046747148036957,
      "rewards/margins": 2.4635965824127197,
      "rewards/rejected": -2.7682712078094482,
      "step": 1003
    },
    {
      "epoch": 0.26,
      "grad_norm": 38.849945068359375,
      "kl": 0.0,
      "learning_rate": 3.686207799005496e-07,
      "logps/chosen": -179.42074584960938,
      "logps/rejected": -280.94488525390625,
      "loss": 0.3472,
      "rewards/chosen": 0.19322259724140167,
      "rewards/margins": 3.179107666015625,
      "rewards/rejected": -2.9858851432800293,
      "step": 1004
    },
    {
      "epoch": 0.26,
      "grad_norm": 34.98317337036133,
      "kl": 0.0,
      "learning_rate": 3.684899241036378e-07,
      "logps/chosen": -191.39337158203125,
      "logps/rejected": -247.249755859375,
      "loss": 0.3104,
      "rewards/chosen": -0.00705593079328537,
      "rewards/margins": 3.7272536754608154,
      "rewards/rejected": -3.734309673309326,
      "step": 1005
    },
    {
      "epoch": 0.26,
      "grad_norm": 29.55091667175293,
      "kl": 0.0,
      "learning_rate": 3.68359068306726e-07,
      "logps/chosen": -195.71263122558594,
      "logps/rejected": -191.72177124023438,
      "loss": 0.1906,
      "rewards/chosen": 2.661048412322998,
      "rewards/margins": 6.463957786560059,
      "rewards/rejected": -3.8029093742370605,
      "step": 1006
    },
    {
      "epoch": 0.26,
      "grad_norm": 28.976953506469727,
      "kl": 0.0,
      "learning_rate": 3.682282125098142e-07,
      "logps/chosen": -243.90280151367188,
      "logps/rejected": -176.75965881347656,
      "loss": 0.3992,
      "rewards/chosen": -0.5529724955558777,
      "rewards/margins": 1.5031301975250244,
      "rewards/rejected": -2.056102752685547,
      "step": 1007
    },
    {
      "epoch": 0.26,
      "grad_norm": 35.87976837158203,
      "kl": 0.0,
      "learning_rate": 3.680973567129024e-07,
      "logps/chosen": -242.5333709716797,
      "logps/rejected": -211.3227996826172,
      "loss": 0.3383,
      "rewards/chosen": 0.3571178615093231,
      "rewards/margins": 3.2810115814208984,
      "rewards/rejected": -2.923893690109253,
      "step": 1008
    },
    {
      "epoch": 0.26,
      "grad_norm": 39.02558898925781,
      "kl": 0.0,
      "learning_rate": 3.679665009159906e-07,
      "logps/chosen": -225.7318878173828,
      "logps/rejected": -179.76136779785156,
      "loss": 0.3238,
      "rewards/chosen": 0.4625554084777832,
      "rewards/margins": 2.3718295097351074,
      "rewards/rejected": -1.9092742204666138,
      "step": 1009
    },
    {
      "epoch": 0.26,
      "grad_norm": 31.04306983947754,
      "kl": 0.0,
      "learning_rate": 3.6783564511907877e-07,
      "logps/chosen": -158.1611785888672,
      "logps/rejected": -218.263916015625,
      "loss": 0.3687,
      "rewards/chosen": -0.2737899720668793,
      "rewards/margins": 1.0832759141921997,
      "rewards/rejected": -1.3570659160614014,
      "step": 1010
    },
    {
      "epoch": 0.26,
      "grad_norm": 38.5332145690918,
      "kl": 0.0,
      "learning_rate": 3.6770478932216697e-07,
      "logps/chosen": -216.95333862304688,
      "logps/rejected": -232.80751037597656,
      "loss": 0.3252,
      "rewards/chosen": -1.0354280471801758,
      "rewards/margins": 1.416306972503662,
      "rewards/rejected": -2.451735019683838,
      "step": 1011
    },
    {
      "epoch": 0.26,
      "grad_norm": 37.71039962768555,
      "kl": 0.0,
      "learning_rate": 3.6757393352525516e-07,
      "logps/chosen": -181.9486083984375,
      "logps/rejected": -266.159423828125,
      "loss": 0.3085,
      "rewards/chosen": -0.29809871315956116,
      "rewards/margins": 3.1903493404388428,
      "rewards/rejected": -3.488448143005371,
      "step": 1012
    },
    {
      "epoch": 0.27,
      "grad_norm": 33.82704162597656,
      "kl": 0.0,
      "learning_rate": 3.6744307772834336e-07,
      "logps/chosen": -163.29898071289062,
      "logps/rejected": -301.127685546875,
      "loss": 0.2887,
      "rewards/chosen": -0.5370348691940308,
      "rewards/margins": 2.407470226287842,
      "rewards/rejected": -2.944504976272583,
      "step": 1013
    },
    {
      "epoch": 0.27,
      "grad_norm": 30.01554298400879,
      "kl": 0.0,
      "learning_rate": 3.673122219314315e-07,
      "logps/chosen": -235.93983459472656,
      "logps/rejected": -293.60986328125,
      "loss": 0.2571,
      "rewards/chosen": 0.16475680470466614,
      "rewards/margins": 3.1383562088012695,
      "rewards/rejected": -2.973599433898926,
      "step": 1014
    },
    {
      "epoch": 0.27,
      "grad_norm": 43.945167541503906,
      "kl": 0.0,
      "learning_rate": 3.671813661345197e-07,
      "logps/chosen": -229.14370727539062,
      "logps/rejected": -220.0166473388672,
      "loss": 0.3219,
      "rewards/chosen": -0.44912272691726685,
      "rewards/margins": 3.00689435005188,
      "rewards/rejected": -3.456017017364502,
      "step": 1015
    },
    {
      "epoch": 0.27,
      "grad_norm": 37.656532287597656,
      "kl": 0.0,
      "learning_rate": 3.670505103376079e-07,
      "logps/chosen": -241.76356506347656,
      "logps/rejected": -229.84144592285156,
      "loss": 0.2412,
      "rewards/chosen": 0.739287793636322,
      "rewards/margins": 5.566799163818359,
      "rewards/rejected": -4.827511310577393,
      "step": 1016
    },
    {
      "epoch": 0.27,
      "grad_norm": 35.11753845214844,
      "kl": 0.0,
      "learning_rate": 3.6691965454069614e-07,
      "logps/chosen": -211.39744567871094,
      "logps/rejected": -246.71319580078125,
      "loss": 0.2869,
      "rewards/chosen": -0.1698482483625412,
      "rewards/margins": 2.793739080429077,
      "rewards/rejected": -2.9635872840881348,
      "step": 1017
    },
    {
      "epoch": 0.27,
      "grad_norm": 33.98426055908203,
      "kl": 0.0,
      "learning_rate": 3.6678879874378434e-07,
      "logps/chosen": -219.35289001464844,
      "logps/rejected": -232.19667053222656,
      "loss": 0.2576,
      "rewards/chosen": 0.028564453125,
      "rewards/margins": 2.506056070327759,
      "rewards/rejected": -2.477491617202759,
      "step": 1018
    },
    {
      "epoch": 0.27,
      "grad_norm": 30.304336547851562,
      "kl": 0.0,
      "learning_rate": 3.6665794294687254e-07,
      "logps/chosen": -205.853515625,
      "logps/rejected": -215.15550231933594,
      "loss": 0.277,
      "rewards/chosen": 0.4227977395057678,
      "rewards/margins": 3.2086880207061768,
      "rewards/rejected": -2.7858903408050537,
      "step": 1019
    },
    {
      "epoch": 0.27,
      "grad_norm": 32.362892150878906,
      "kl": 0.0,
      "learning_rate": 3.6652708714996073e-07,
      "logps/chosen": -169.9617919921875,
      "logps/rejected": -322.0443115234375,
      "loss": 0.4108,
      "rewards/chosen": -1.0704345703125,
      "rewards/margins": 1.948420524597168,
      "rewards/rejected": -3.018855094909668,
      "step": 1020
    },
    {
      "epoch": 0.27,
      "grad_norm": 31.939716339111328,
      "kl": 0.0,
      "learning_rate": 3.6639623135304893e-07,
      "logps/chosen": -210.80897521972656,
      "logps/rejected": -196.7965545654297,
      "loss": 0.3068,
      "rewards/chosen": 1.1073076725006104,
      "rewards/margins": 3.1537821292877197,
      "rewards/rejected": -2.0464744567871094,
      "step": 1021
    },
    {
      "epoch": 0.27,
      "grad_norm": 47.13640213012695,
      "kl": 0.0,
      "learning_rate": 3.662653755561371e-07,
      "logps/chosen": -177.97195434570312,
      "logps/rejected": -257.19281005859375,
      "loss": 0.3351,
      "rewards/chosen": 0.2421102672815323,
      "rewards/margins": 2.5237832069396973,
      "rewards/rejected": -2.281672954559326,
      "step": 1022
    },
    {
      "epoch": 0.27,
      "grad_norm": 35.96914291381836,
      "kl": 0.0,
      "learning_rate": 3.661345197592253e-07,
      "logps/chosen": -173.86227416992188,
      "logps/rejected": -157.81129455566406,
      "loss": 0.4288,
      "rewards/chosen": 0.11688518524169922,
      "rewards/margins": 1.7955470085144043,
      "rewards/rejected": -1.678661823272705,
      "step": 1023
    },
    {
      "epoch": 0.27,
      "grad_norm": 31.106721878051758,
      "kl": 0.0,
      "learning_rate": 3.660036639623135e-07,
      "logps/chosen": -214.5399627685547,
      "logps/rejected": -246.36363220214844,
      "loss": 0.255,
      "rewards/chosen": -0.01915731653571129,
      "rewards/margins": 3.3207294940948486,
      "rewards/rejected": -3.3398869037628174,
      "step": 1024
    },
    {
      "epoch": 0.27,
      "grad_norm": 43.176334381103516,
      "kl": 0.0,
      "learning_rate": 3.658728081654017e-07,
      "logps/chosen": -217.10745239257812,
      "logps/rejected": -252.17526245117188,
      "loss": 0.3132,
      "rewards/chosen": -0.032477542757987976,
      "rewards/margins": 2.099614143371582,
      "rewards/rejected": -2.132091760635376,
      "step": 1025
    },
    {
      "epoch": 0.27,
      "grad_norm": 38.87571716308594,
      "kl": 0.0,
      "learning_rate": 3.657419523684899e-07,
      "logps/chosen": -156.24591064453125,
      "logps/rejected": -326.661865234375,
      "loss": 0.2864,
      "rewards/chosen": 0.2051771879196167,
      "rewards/margins": 3.872175693511963,
      "rewards/rejected": -3.6669983863830566,
      "step": 1026
    },
    {
      "epoch": 0.27,
      "grad_norm": 27.340553283691406,
      "kl": 0.0,
      "learning_rate": 3.656110965715781e-07,
      "logps/chosen": -151.21754455566406,
      "logps/rejected": -253.4945526123047,
      "loss": 0.3319,
      "rewards/chosen": -1.0714999437332153,
      "rewards/margins": 3.459376811981201,
      "rewards/rejected": -4.530876636505127,
      "step": 1027
    },
    {
      "epoch": 0.27,
      "grad_norm": 25.070993423461914,
      "kl": 0.0,
      "learning_rate": 3.654802407746663e-07,
      "logps/chosen": -177.9115753173828,
      "logps/rejected": -257.03033447265625,
      "loss": 0.3164,
      "rewards/chosen": -1.1591517925262451,
      "rewards/margins": 3.417459726333618,
      "rewards/rejected": -4.576611518859863,
      "step": 1028
    },
    {
      "epoch": 0.27,
      "grad_norm": 29.812543869018555,
      "kl": 0.0,
      "learning_rate": 3.6534938497775444e-07,
      "logps/chosen": -199.55758666992188,
      "logps/rejected": -237.88499450683594,
      "loss": 0.3949,
      "rewards/chosen": -0.17537379264831543,
      "rewards/margins": 2.9494917392730713,
      "rewards/rejected": -3.1248655319213867,
      "step": 1029
    },
    {
      "epoch": 0.27,
      "grad_norm": 36.289398193359375,
      "kl": 0.0,
      "learning_rate": 3.652185291808427e-07,
      "logps/chosen": -241.36741638183594,
      "logps/rejected": -276.4466247558594,
      "loss": 0.2921,
      "rewards/chosen": 0.03348015993833542,
      "rewards/margins": 2.9756243228912354,
      "rewards/rejected": -2.9421441555023193,
      "step": 1030
    },
    {
      "epoch": 0.27,
      "grad_norm": 36.39860916137695,
      "kl": 0.0,
      "learning_rate": 3.650876733839309e-07,
      "logps/chosen": -308.1175231933594,
      "logps/rejected": -331.46246337890625,
      "loss": 0.2929,
      "rewards/chosen": -1.4403387308120728,
      "rewards/margins": 0.3089176416397095,
      "rewards/rejected": -1.7492563724517822,
      "step": 1031
    },
    {
      "epoch": 0.27,
      "grad_norm": 35.538063049316406,
      "kl": 0.0,
      "learning_rate": 3.649568175870191e-07,
      "logps/chosen": -186.85958862304688,
      "logps/rejected": -172.8899383544922,
      "loss": 0.2932,
      "rewards/chosen": -0.7927916646003723,
      "rewards/margins": 2.2099575996398926,
      "rewards/rejected": -3.00274920463562,
      "step": 1032
    },
    {
      "epoch": 0.27,
      "grad_norm": 40.36717987060547,
      "kl": 0.0,
      "learning_rate": 3.648259617901073e-07,
      "logps/chosen": -242.24998474121094,
      "logps/rejected": -351.9871520996094,
      "loss": 0.3347,
      "rewards/chosen": -0.6761089563369751,
      "rewards/margins": 2.68052339553833,
      "rewards/rejected": -3.3566324710845947,
      "step": 1033
    },
    {
      "epoch": 0.27,
      "grad_norm": 29.186281204223633,
      "kl": 0.0,
      "learning_rate": 3.646951059931955e-07,
      "logps/chosen": -240.44607543945312,
      "logps/rejected": -237.96688842773438,
      "loss": 0.3484,
      "rewards/chosen": -0.43001431226730347,
      "rewards/margins": 2.2880451679229736,
      "rewards/rejected": -2.718059539794922,
      "step": 1034
    },
    {
      "epoch": 0.27,
      "grad_norm": 38.60892868041992,
      "kl": 0.0,
      "learning_rate": 3.645642501962837e-07,
      "logps/chosen": -198.13050842285156,
      "logps/rejected": -238.33778381347656,
      "loss": 0.2745,
      "rewards/chosen": -0.5678102374076843,
      "rewards/margins": 1.5358705520629883,
      "rewards/rejected": -2.1036808490753174,
      "step": 1035
    },
    {
      "epoch": 0.27,
      "grad_norm": 40.15579605102539,
      "kl": 0.0,
      "learning_rate": 3.6443339439937187e-07,
      "logps/chosen": -207.40586853027344,
      "logps/rejected": -269.68646240234375,
      "loss": 0.2976,
      "rewards/chosen": 1.5068060159683228,
      "rewards/margins": 5.378608226776123,
      "rewards/rejected": -3.87180233001709,
      "step": 1036
    },
    {
      "epoch": 0.27,
      "grad_norm": 40.54957962036133,
      "kl": 0.0,
      "learning_rate": 3.6430253860246007e-07,
      "logps/chosen": -225.75035095214844,
      "logps/rejected": -186.6640167236328,
      "loss": 0.249,
      "rewards/chosen": 0.4179876148700714,
      "rewards/margins": 2.1934192180633545,
      "rewards/rejected": -1.7754316329956055,
      "step": 1037
    },
    {
      "epoch": 0.27,
      "grad_norm": 28.730098724365234,
      "kl": 0.0,
      "learning_rate": 3.6417168280554826e-07,
      "logps/chosen": -187.89804077148438,
      "logps/rejected": -233.9765167236328,
      "loss": 0.3673,
      "rewards/chosen": -2.141961097717285,
      "rewards/margins": 0.7847371101379395,
      "rewards/rejected": -2.9266982078552246,
      "step": 1038
    },
    {
      "epoch": 0.27,
      "grad_norm": 47.1996955871582,
      "kl": 0.0,
      "learning_rate": 3.6404082700863646e-07,
      "logps/chosen": -220.11485290527344,
      "logps/rejected": -254.345703125,
      "loss": 0.2947,
      "rewards/chosen": -0.6743428707122803,
      "rewards/margins": 1.5272424221038818,
      "rewards/rejected": -2.201585292816162,
      "step": 1039
    },
    {
      "epoch": 0.27,
      "grad_norm": 28.40711212158203,
      "kl": 0.0,
      "learning_rate": 3.6390997121172465e-07,
      "logps/chosen": -237.76663208007812,
      "logps/rejected": -225.626220703125,
      "loss": 0.3574,
      "rewards/chosen": 3.728612184524536,
      "rewards/margins": 6.437889099121094,
      "rewards/rejected": -2.7092769145965576,
      "step": 1040
    },
    {
      "epoch": 0.27,
      "grad_norm": 36.701297760009766,
      "kl": 0.0,
      "learning_rate": 3.6377911541481285e-07,
      "logps/chosen": -221.3239288330078,
      "logps/rejected": -281.19451904296875,
      "loss": 0.3226,
      "rewards/chosen": 0.43275317549705505,
      "rewards/margins": 2.6378254890441895,
      "rewards/rejected": -2.2050724029541016,
      "step": 1041
    },
    {
      "epoch": 0.27,
      "grad_norm": 36.57932662963867,
      "kl": 0.0,
      "learning_rate": 3.636482596179011e-07,
      "logps/chosen": -248.81182861328125,
      "logps/rejected": -195.62704467773438,
      "loss": 0.3701,
      "rewards/chosen": -0.7044147253036499,
      "rewards/margins": 1.9177671670913696,
      "rewards/rejected": -2.6221818923950195,
      "step": 1042
    },
    {
      "epoch": 0.27,
      "grad_norm": 32.23270797729492,
      "kl": 0.0,
      "learning_rate": 3.635174038209893e-07,
      "logps/chosen": -211.1300048828125,
      "logps/rejected": -221.2258758544922,
      "loss": 0.1998,
      "rewards/chosen": -0.5551096200942993,
      "rewards/margins": 1.9950312376022339,
      "rewards/rejected": -2.550140857696533,
      "step": 1043
    },
    {
      "epoch": 0.27,
      "grad_norm": 39.78229522705078,
      "kl": 0.0,
      "learning_rate": 3.633865480240775e-07,
      "logps/chosen": -249.97067260742188,
      "logps/rejected": -204.08071899414062,
      "loss": 0.3696,
      "rewards/chosen": 0.06202399730682373,
      "rewards/margins": 1.7861658334732056,
      "rewards/rejected": -1.7241418361663818,
      "step": 1044
    },
    {
      "epoch": 0.27,
      "grad_norm": 38.07326889038086,
      "kl": 0.0,
      "learning_rate": 3.6325569222716564e-07,
      "logps/chosen": -237.25929260253906,
      "logps/rejected": -240.17529296875,
      "loss": 0.3822,
      "rewards/chosen": -0.42507678270339966,
      "rewards/margins": 1.954552412033081,
      "rewards/rejected": -2.379629135131836,
      "step": 1045
    },
    {
      "epoch": 0.27,
      "grad_norm": 41.57281494140625,
      "kl": 0.0,
      "learning_rate": 3.6312483643025383e-07,
      "logps/chosen": -183.5970916748047,
      "logps/rejected": -223.18557739257812,
      "loss": 0.4239,
      "rewards/chosen": 0.8696861267089844,
      "rewards/margins": 2.7113208770751953,
      "rewards/rejected": -1.8416346311569214,
      "step": 1046
    },
    {
      "epoch": 0.27,
      "grad_norm": 34.37956619262695,
      "kl": 0.0,
      "learning_rate": 3.6299398063334203e-07,
      "logps/chosen": -139.97079467773438,
      "logps/rejected": -280.18212890625,
      "loss": 0.2595,
      "rewards/chosen": 0.27579593658447266,
      "rewards/margins": 3.2917022705078125,
      "rewards/rejected": -3.01590633392334,
      "step": 1047
    },
    {
      "epoch": 0.27,
      "grad_norm": 38.35115432739258,
      "kl": 0.0,
      "learning_rate": 3.628631248364302e-07,
      "logps/chosen": -272.6831970214844,
      "logps/rejected": -240.39169311523438,
      "loss": 0.335,
      "rewards/chosen": -6.130115032196045,
      "rewards/margins": -2.784881114959717,
      "rewards/rejected": -3.345233917236328,
      "step": 1048
    },
    {
      "epoch": 0.27,
      "grad_norm": 32.5230598449707,
      "kl": 0.0,
      "learning_rate": 3.627322690395184e-07,
      "logps/chosen": -166.0072021484375,
      "logps/rejected": -243.00039672851562,
      "loss": 0.2476,
      "rewards/chosen": 0.14196598529815674,
      "rewards/margins": 3.079589366912842,
      "rewards/rejected": -2.9376235008239746,
      "step": 1049
    },
    {
      "epoch": 0.27,
      "grad_norm": 29.200593948364258,
      "kl": 0.0,
      "learning_rate": 3.626014132426066e-07,
      "logps/chosen": -175.75213623046875,
      "logps/rejected": -278.5937194824219,
      "loss": 0.3469,
      "rewards/chosen": 0.4250580668449402,
      "rewards/margins": 3.2779133319854736,
      "rewards/rejected": -2.8528552055358887,
      "step": 1050
    },
    {
      "epoch": 0.28,
      "grad_norm": 26.384889602661133,
      "kl": 0.0,
      "learning_rate": 3.624705574456948e-07,
      "logps/chosen": -120.09896087646484,
      "logps/rejected": -238.22848510742188,
      "loss": 0.3106,
      "rewards/chosen": -0.45138978958129883,
      "rewards/margins": 2.8458361625671387,
      "rewards/rejected": -3.2972259521484375,
      "step": 1051
    },
    {
      "epoch": 0.28,
      "grad_norm": 39.24557113647461,
      "kl": 0.0,
      "learning_rate": 3.62339701648783e-07,
      "logps/chosen": -196.98329162597656,
      "logps/rejected": -143.01910400390625,
      "loss": 0.4611,
      "rewards/chosen": -1.3231563568115234,
      "rewards/margins": 0.6955959796905518,
      "rewards/rejected": -2.018752336502075,
      "step": 1052
    },
    {
      "epoch": 0.28,
      "grad_norm": 35.793094635009766,
      "kl": 0.0,
      "learning_rate": 3.622088458518712e-07,
      "logps/chosen": -250.4149627685547,
      "logps/rejected": -282.08978271484375,
      "loss": 0.2657,
      "rewards/chosen": 1.1257227659225464,
      "rewards/margins": 4.489777088165283,
      "rewards/rejected": -3.3640542030334473,
      "step": 1053
    },
    {
      "epoch": 0.28,
      "grad_norm": 50.41539001464844,
      "kl": 0.0,
      "learning_rate": 3.620779900549594e-07,
      "logps/chosen": -232.12420654296875,
      "logps/rejected": -266.841552734375,
      "loss": 0.3643,
      "rewards/chosen": -0.20323766767978668,
      "rewards/margins": 1.5877408981323242,
      "rewards/rejected": -1.7909785509109497,
      "step": 1054
    },
    {
      "epoch": 0.28,
      "grad_norm": 45.92751693725586,
      "kl": 0.0,
      "learning_rate": 3.6194713425804765e-07,
      "logps/chosen": -179.73974609375,
      "logps/rejected": -249.5182647705078,
      "loss": 0.2827,
      "rewards/chosen": 0.3262701630592346,
      "rewards/margins": 3.874574899673462,
      "rewards/rejected": -3.548304796218872,
      "step": 1055
    },
    {
      "epoch": 0.28,
      "grad_norm": 36.23213577270508,
      "kl": 0.0,
      "learning_rate": 3.6181627846113585e-07,
      "logps/chosen": -171.02401733398438,
      "logps/rejected": -240.00747680664062,
      "loss": 0.2385,
      "rewards/chosen": 1.3682966232299805,
      "rewards/margins": 2.987136125564575,
      "rewards/rejected": -1.6188395023345947,
      "step": 1056
    },
    {
      "epoch": 0.28,
      "grad_norm": 29.29627799987793,
      "kl": 0.0,
      "learning_rate": 3.6168542266422404e-07,
      "logps/chosen": -210.8943328857422,
      "logps/rejected": -187.2423095703125,
      "loss": 0.2572,
      "rewards/chosen": 0.5274381637573242,
      "rewards/margins": 3.216721296310425,
      "rewards/rejected": -2.6892831325531006,
      "step": 1057
    },
    {
      "epoch": 0.28,
      "grad_norm": 31.17224884033203,
      "kl": 0.0,
      "learning_rate": 3.6155456686731224e-07,
      "logps/chosen": -180.1999969482422,
      "logps/rejected": -219.22610473632812,
      "loss": 0.4126,
      "rewards/chosen": -0.6666109561920166,
      "rewards/margins": 1.8168065547943115,
      "rewards/rejected": -2.483417510986328,
      "step": 1058
    },
    {
      "epoch": 0.28,
      "grad_norm": 33.67176055908203,
      "kl": 0.0,
      "learning_rate": 3.6142371107040043e-07,
      "logps/chosen": -220.28773498535156,
      "logps/rejected": -259.7289123535156,
      "loss": 0.2974,
      "rewards/chosen": -0.13516581058502197,
      "rewards/margins": 3.0074315071105957,
      "rewards/rejected": -3.142597198486328,
      "step": 1059
    },
    {
      "epoch": 0.28,
      "grad_norm": 33.259300231933594,
      "kl": 0.0,
      "learning_rate": 3.612928552734886e-07,
      "logps/chosen": -189.72705078125,
      "logps/rejected": -164.92654418945312,
      "loss": 0.4098,
      "rewards/chosen": -0.050531283020973206,
      "rewards/margins": 2.0134236812591553,
      "rewards/rejected": -2.063955068588257,
      "step": 1060
    },
    {
      "epoch": 0.28,
      "grad_norm": 38.474369049072266,
      "kl": 0.0,
      "learning_rate": 3.6116199947657677e-07,
      "logps/chosen": -211.18026733398438,
      "logps/rejected": -158.99363708496094,
      "loss": 0.3069,
      "rewards/chosen": -0.6650242209434509,
      "rewards/margins": 1.7212104797363281,
      "rewards/rejected": -2.386234760284424,
      "step": 1061
    },
    {
      "epoch": 0.28,
      "grad_norm": 29.082534790039062,
      "kl": 0.0,
      "learning_rate": 3.6103114367966497e-07,
      "logps/chosen": -282.92205810546875,
      "logps/rejected": -266.47674560546875,
      "loss": 0.2743,
      "rewards/chosen": 1.693918228149414,
      "rewards/margins": 5.3685173988342285,
      "rewards/rejected": -3.6745991706848145,
      "step": 1062
    },
    {
      "epoch": 0.28,
      "grad_norm": 35.966251373291016,
      "kl": 0.0,
      "learning_rate": 3.6090028788275316e-07,
      "logps/chosen": -163.5644989013672,
      "logps/rejected": -240.92135620117188,
      "loss": 0.2478,
      "rewards/chosen": 0.8558608889579773,
      "rewards/margins": 3.8144004344940186,
      "rewards/rejected": -2.9585394859313965,
      "step": 1063
    },
    {
      "epoch": 0.28,
      "grad_norm": 29.439090728759766,
      "kl": 0.0,
      "learning_rate": 3.6076943208584136e-07,
      "logps/chosen": -197.8784637451172,
      "logps/rejected": -206.02719116210938,
      "loss": 0.2793,
      "rewards/chosen": -0.06350186467170715,
      "rewards/margins": 3.4905803203582764,
      "rewards/rejected": -3.554082155227661,
      "step": 1064
    },
    {
      "epoch": 0.28,
      "grad_norm": 34.36996841430664,
      "kl": 0.0,
      "learning_rate": 3.6063857628892956e-07,
      "logps/chosen": -149.76780700683594,
      "logps/rejected": -293.9136047363281,
      "loss": 0.2786,
      "rewards/chosen": 0.11478301137685776,
      "rewards/margins": 2.3451292514801025,
      "rewards/rejected": -2.230346202850342,
      "step": 1065
    },
    {
      "epoch": 0.28,
      "grad_norm": 36.8817253112793,
      "kl": 0.0,
      "learning_rate": 3.6050772049201775e-07,
      "logps/chosen": -324.67822265625,
      "logps/rejected": -313.7707214355469,
      "loss": 0.2427,
      "rewards/chosen": 0.6795901656150818,
      "rewards/margins": 5.313671112060547,
      "rewards/rejected": -4.63408088684082,
      "step": 1066
    },
    {
      "epoch": 0.28,
      "grad_norm": 38.049503326416016,
      "kl": 0.0,
      "learning_rate": 3.6037686469510595e-07,
      "logps/chosen": -273.94134521484375,
      "logps/rejected": -324.9948425292969,
      "loss": 0.2256,
      "rewards/chosen": -0.8676828742027283,
      "rewards/margins": 2.8628861904144287,
      "rewards/rejected": -3.7305691242218018,
      "step": 1067
    },
    {
      "epoch": 0.28,
      "grad_norm": 33.18891906738281,
      "kl": 0.0,
      "learning_rate": 3.602460088981942e-07,
      "logps/chosen": -252.3577117919922,
      "logps/rejected": -173.41571044921875,
      "loss": 0.3427,
      "rewards/chosen": 0.029461294412612915,
      "rewards/margins": 1.8500933647155762,
      "rewards/rejected": -1.8206321001052856,
      "step": 1068
    },
    {
      "epoch": 0.28,
      "grad_norm": 35.695560455322266,
      "kl": 0.0,
      "learning_rate": 3.601151531012824e-07,
      "logps/chosen": -273.1550598144531,
      "logps/rejected": -285.8321533203125,
      "loss": 0.1906,
      "rewards/chosen": 1.8435993194580078,
      "rewards/margins": 5.752671241760254,
      "rewards/rejected": -3.909071922302246,
      "step": 1069
    },
    {
      "epoch": 0.28,
      "grad_norm": 34.36787033081055,
      "kl": 0.0,
      "learning_rate": 3.599842973043706e-07,
      "logps/chosen": -214.49176025390625,
      "logps/rejected": -263.76544189453125,
      "loss": 0.261,
      "rewards/chosen": 1.1163358688354492,
      "rewards/margins": 5.000162124633789,
      "rewards/rejected": -3.8838260173797607,
      "step": 1070
    },
    {
      "epoch": 0.28,
      "grad_norm": 31.274845123291016,
      "kl": 0.0,
      "learning_rate": 3.598534415074588e-07,
      "logps/chosen": -269.2871398925781,
      "logps/rejected": -231.68600463867188,
      "loss": 0.3206,
      "rewards/chosen": -1.9677873849868774,
      "rewards/margins": 0.8992663621902466,
      "rewards/rejected": -2.867053747177124,
      "step": 1071
    },
    {
      "epoch": 0.28,
      "grad_norm": 32.18807601928711,
      "kl": 0.0,
      "learning_rate": 3.59722585710547e-07,
      "logps/chosen": -276.0224304199219,
      "logps/rejected": -250.87071228027344,
      "loss": 0.4247,
      "rewards/chosen": 0.2261885702610016,
      "rewards/margins": 2.891800880432129,
      "rewards/rejected": -2.66561222076416,
      "step": 1072
    },
    {
      "epoch": 0.28,
      "grad_norm": 36.16868591308594,
      "kl": 0.0,
      "learning_rate": 3.595917299136352e-07,
      "logps/chosen": -208.01426696777344,
      "logps/rejected": -285.5256652832031,
      "loss": 0.2483,
      "rewards/chosen": 0.5357463955879211,
      "rewards/margins": 1.908036708831787,
      "rewards/rejected": -1.3722903728485107,
      "step": 1073
    },
    {
      "epoch": 0.28,
      "grad_norm": 39.653663635253906,
      "kl": 0.0,
      "learning_rate": 3.594608741167234e-07,
      "logps/chosen": -278.96038818359375,
      "logps/rejected": -210.6820526123047,
      "loss": 0.325,
      "rewards/chosen": 0.039040904492139816,
      "rewards/margins": 2.643235445022583,
      "rewards/rejected": -2.6041946411132812,
      "step": 1074
    },
    {
      "epoch": 0.28,
      "grad_norm": 37.162513732910156,
      "kl": 0.0,
      "learning_rate": 3.5933001831981157e-07,
      "logps/chosen": -233.07406616210938,
      "logps/rejected": -233.7490997314453,
      "loss": 0.3276,
      "rewards/chosen": 0.33511853218078613,
      "rewards/margins": 2.7062385082244873,
      "rewards/rejected": -2.371119976043701,
      "step": 1075
    },
    {
      "epoch": 0.28,
      "grad_norm": 34.30678176879883,
      "kl": 0.0,
      "learning_rate": 3.591991625228997e-07,
      "logps/chosen": -296.16326904296875,
      "logps/rejected": -230.3595733642578,
      "loss": 0.2341,
      "rewards/chosen": 2.3373847007751465,
      "rewards/margins": 4.555594444274902,
      "rewards/rejected": -2.2182095050811768,
      "step": 1076
    },
    {
      "epoch": 0.28,
      "grad_norm": 20.76529884338379,
      "kl": 0.0,
      "learning_rate": 3.590683067259879e-07,
      "logps/chosen": -186.85400390625,
      "logps/rejected": -223.57244873046875,
      "loss": 0.1979,
      "rewards/chosen": 1.0140910148620605,
      "rewards/margins": 5.6391987800598145,
      "rewards/rejected": -4.625107765197754,
      "step": 1077
    },
    {
      "epoch": 0.28,
      "grad_norm": 44.112098693847656,
      "kl": 0.0,
      "learning_rate": 3.589374509290761e-07,
      "logps/chosen": -180.42352294921875,
      "logps/rejected": -276.9669189453125,
      "loss": 0.3337,
      "rewards/chosen": -0.0607970654964447,
      "rewards/margins": 3.0823521614074707,
      "rewards/rejected": -3.1431491374969482,
      "step": 1078
    },
    {
      "epoch": 0.28,
      "grad_norm": 37.582191467285156,
      "kl": 0.0,
      "learning_rate": 3.588065951321643e-07,
      "logps/chosen": -243.1421356201172,
      "logps/rejected": -354.2049865722656,
      "loss": 0.2466,
      "rewards/chosen": -0.5239735245704651,
      "rewards/margins": 4.440597057342529,
      "rewards/rejected": -4.96457052230835,
      "step": 1079
    },
    {
      "epoch": 0.28,
      "grad_norm": 33.97587966918945,
      "kl": 0.0,
      "learning_rate": 3.5867573933525255e-07,
      "logps/chosen": -177.9431915283203,
      "logps/rejected": -192.5262451171875,
      "loss": 0.3506,
      "rewards/chosen": 0.012957848608493805,
      "rewards/margins": 2.7563772201538086,
      "rewards/rejected": -2.7434194087982178,
      "step": 1080
    },
    {
      "epoch": 0.28,
      "grad_norm": 36.664241790771484,
      "kl": 0.0,
      "learning_rate": 3.5854488353834075e-07,
      "logps/chosen": -247.65528869628906,
      "logps/rejected": -163.8009490966797,
      "loss": 0.3547,
      "rewards/chosen": -0.40981823205947876,
      "rewards/margins": 2.227193593978882,
      "rewards/rejected": -2.637011766433716,
      "step": 1081
    },
    {
      "epoch": 0.28,
      "grad_norm": 30.23735237121582,
      "kl": 0.0,
      "learning_rate": 3.5841402774142894e-07,
      "logps/chosen": -216.5425262451172,
      "logps/rejected": -273.2435607910156,
      "loss": 0.2448,
      "rewards/chosen": -0.4988417327404022,
      "rewards/margins": 3.563955783843994,
      "rewards/rejected": -4.062797546386719,
      "step": 1082
    },
    {
      "epoch": 0.28,
      "grad_norm": 34.601112365722656,
      "kl": 0.0,
      "learning_rate": 3.5828317194451714e-07,
      "logps/chosen": -210.7240447998047,
      "logps/rejected": -196.9040985107422,
      "loss": 0.2698,
      "rewards/chosen": 0.395294725894928,
      "rewards/margins": 3.474808931350708,
      "rewards/rejected": -3.079514265060425,
      "step": 1083
    },
    {
      "epoch": 0.28,
      "grad_norm": 28.270124435424805,
      "kl": 0.0,
      "learning_rate": 3.5815231614760534e-07,
      "logps/chosen": -165.27096557617188,
      "logps/rejected": -188.78155517578125,
      "loss": 0.2591,
      "rewards/chosen": 0.3385252356529236,
      "rewards/margins": 3.490814685821533,
      "rewards/rejected": -3.152289390563965,
      "step": 1084
    },
    {
      "epoch": 0.28,
      "grad_norm": 34.028770446777344,
      "kl": 0.0,
      "learning_rate": 3.5802146035069353e-07,
      "logps/chosen": -246.2218780517578,
      "logps/rejected": -238.57894897460938,
      "loss": 0.2223,
      "rewards/chosen": 1.6595675945281982,
      "rewards/margins": 6.902527809143066,
      "rewards/rejected": -5.242960453033447,
      "step": 1085
    },
    {
      "epoch": 0.28,
      "grad_norm": 36.6795539855957,
      "kl": 0.0,
      "learning_rate": 3.5789060455378173e-07,
      "logps/chosen": -303.646484375,
      "logps/rejected": -332.2771911621094,
      "loss": 0.2087,
      "rewards/chosen": 1.6191469430923462,
      "rewards/margins": 5.439394474029541,
      "rewards/rejected": -3.8202476501464844,
      "step": 1086
    },
    {
      "epoch": 0.28,
      "grad_norm": 29.40221405029297,
      "kl": 0.0,
      "learning_rate": 3.577597487568699e-07,
      "logps/chosen": -186.3616485595703,
      "logps/rejected": -279.948974609375,
      "loss": 0.2862,
      "rewards/chosen": -0.8338308334350586,
      "rewards/margins": 3.1905031204223633,
      "rewards/rejected": -4.024333953857422,
      "step": 1087
    },
    {
      "epoch": 0.28,
      "grad_norm": 42.282047271728516,
      "kl": 0.0,
      "learning_rate": 3.576288929599581e-07,
      "logps/chosen": -204.36720275878906,
      "logps/rejected": -209.05628967285156,
      "loss": 0.3698,
      "rewards/chosen": -0.9839210510253906,
      "rewards/margins": 1.6166069507598877,
      "rewards/rejected": -2.6005280017852783,
      "step": 1088
    },
    {
      "epoch": 0.29,
      "grad_norm": 37.4203987121582,
      "kl": 0.0,
      "learning_rate": 3.574980371630463e-07,
      "logps/chosen": -258.50775146484375,
      "logps/rejected": -273.71368408203125,
      "loss": 0.2078,
      "rewards/chosen": 0.680251955986023,
      "rewards/margins": 4.447729110717773,
      "rewards/rejected": -3.767477035522461,
      "step": 1089
    },
    {
      "epoch": 0.29,
      "grad_norm": 28.57037353515625,
      "kl": 0.0,
      "learning_rate": 3.573671813661345e-07,
      "logps/chosen": -210.90475463867188,
      "logps/rejected": -245.07102966308594,
      "loss": 0.1402,
      "rewards/chosen": 0.5093256235122681,
      "rewards/margins": 4.227346420288086,
      "rewards/rejected": -3.7180206775665283,
      "step": 1090
    },
    {
      "epoch": 0.29,
      "grad_norm": 35.79833984375,
      "kl": 0.0,
      "learning_rate": 3.5723632556922266e-07,
      "logps/chosen": -241.64450073242188,
      "logps/rejected": -228.38893127441406,
      "loss": 0.2786,
      "rewards/chosen": 1.0499564409255981,
      "rewards/margins": 3.9940905570983887,
      "rewards/rejected": -2.944133996963501,
      "step": 1091
    },
    {
      "epoch": 0.29,
      "grad_norm": 33.26674270629883,
      "kl": 0.0,
      "learning_rate": 3.5710546977231085e-07,
      "logps/chosen": -398.8047180175781,
      "logps/rejected": -177.8960723876953,
      "loss": 0.3536,
      "rewards/chosen": -1.450081706047058,
      "rewards/margins": 2.225198745727539,
      "rewards/rejected": -3.6752805709838867,
      "step": 1092
    },
    {
      "epoch": 0.29,
      "grad_norm": 34.63792037963867,
      "kl": 0.0,
      "learning_rate": 3.569746139753991e-07,
      "logps/chosen": -271.9081726074219,
      "logps/rejected": -234.6984100341797,
      "loss": 0.3375,
      "rewards/chosen": 1.359961748123169,
      "rewards/margins": 4.406167030334473,
      "rewards/rejected": -3.0462050437927246,
      "step": 1093
    },
    {
      "epoch": 0.29,
      "grad_norm": 29.882545471191406,
      "kl": 0.0,
      "learning_rate": 3.568437581784873e-07,
      "logps/chosen": -212.69424438476562,
      "logps/rejected": -268.8453063964844,
      "loss": 0.2343,
      "rewards/chosen": 1.1743170022964478,
      "rewards/margins": 4.926886081695557,
      "rewards/rejected": -3.7525689601898193,
      "step": 1094
    },
    {
      "epoch": 0.29,
      "grad_norm": 29.734010696411133,
      "kl": 0.0,
      "learning_rate": 3.567129023815755e-07,
      "logps/chosen": -181.9860382080078,
      "logps/rejected": -233.7906951904297,
      "loss": 0.3177,
      "rewards/chosen": -0.8437288403511047,
      "rewards/margins": 2.7986247539520264,
      "rewards/rejected": -3.6423535346984863,
      "step": 1095
    },
    {
      "epoch": 0.29,
      "grad_norm": 36.10123062133789,
      "kl": 0.0,
      "learning_rate": 3.565820465846637e-07,
      "logps/chosen": -188.33181762695312,
      "logps/rejected": -130.89389038085938,
      "loss": 0.4587,
      "rewards/chosen": -0.7544440031051636,
      "rewards/margins": 3.384410858154297,
      "rewards/rejected": -4.13885498046875,
      "step": 1096
    },
    {
      "epoch": 0.29,
      "grad_norm": 32.93227767944336,
      "kl": 0.0,
      "learning_rate": 3.564511907877519e-07,
      "logps/chosen": -238.99197387695312,
      "logps/rejected": -234.0099334716797,
      "loss": 0.3542,
      "rewards/chosen": -0.7831324934959412,
      "rewards/margins": 3.8155860900878906,
      "rewards/rejected": -4.598718643188477,
      "step": 1097
    },
    {
      "epoch": 0.29,
      "grad_norm": 31.56029510498047,
      "kl": 0.0,
      "learning_rate": 3.563203349908401e-07,
      "logps/chosen": -155.75697326660156,
      "logps/rejected": -248.09512329101562,
      "loss": 0.3233,
      "rewards/chosen": 0.1227286234498024,
      "rewards/margins": 2.778966188430786,
      "rewards/rejected": -2.6562376022338867,
      "step": 1098
    },
    {
      "epoch": 0.29,
      "grad_norm": 34.97945785522461,
      "kl": 0.0,
      "learning_rate": 3.561894791939283e-07,
      "logps/chosen": -165.0885772705078,
      "logps/rejected": -286.6366271972656,
      "loss": 0.3808,
      "rewards/chosen": 0.3326500356197357,
      "rewards/margins": 3.2164618968963623,
      "rewards/rejected": -2.8838119506835938,
      "step": 1099
    },
    {
      "epoch": 0.29,
      "grad_norm": 36.84022521972656,
      "kl": 0.0,
      "learning_rate": 3.5605862339701647e-07,
      "logps/chosen": -274.4532775878906,
      "logps/rejected": -254.0680389404297,
      "loss": 0.417,
      "rewards/chosen": 0.12156467139720917,
      "rewards/margins": 2.132566213607788,
      "rewards/rejected": -2.0110015869140625,
      "step": 1100
    },
    {
      "epoch": 0.29,
      "grad_norm": 40.564884185791016,
      "kl": 0.0,
      "learning_rate": 3.5592776760010467e-07,
      "logps/chosen": -211.90956115722656,
      "logps/rejected": -237.0247039794922,
      "loss": 0.3465,
      "rewards/chosen": -0.3848746716976166,
      "rewards/margins": 1.7771941423416138,
      "rewards/rejected": -2.1620688438415527,
      "step": 1101
    },
    {
      "epoch": 0.29,
      "grad_norm": 22.69942855834961,
      "kl": 0.0,
      "learning_rate": 3.5579691180319287e-07,
      "logps/chosen": -239.27420043945312,
      "logps/rejected": -246.932861328125,
      "loss": 0.2281,
      "rewards/chosen": 0.6759759187698364,
      "rewards/margins": 5.967441558837891,
      "rewards/rejected": -5.291465759277344,
      "step": 1102
    },
    {
      "epoch": 0.29,
      "grad_norm": 35.47650909423828,
      "kl": 0.0,
      "learning_rate": 3.5566605600628106e-07,
      "logps/chosen": -215.21624755859375,
      "logps/rejected": -268.92169189453125,
      "loss": 0.2809,
      "rewards/chosen": 0.8522527813911438,
      "rewards/margins": 4.585559844970703,
      "rewards/rejected": -3.733306884765625,
      "step": 1103
    },
    {
      "epoch": 0.29,
      "grad_norm": 39.69252395629883,
      "kl": 0.0,
      "learning_rate": 3.5553520020936926e-07,
      "logps/chosen": -225.89573669433594,
      "logps/rejected": -233.9873046875,
      "loss": 0.4371,
      "rewards/chosen": -0.27535781264305115,
      "rewards/margins": 2.050819158554077,
      "rewards/rejected": -2.326176881790161,
      "step": 1104
    },
    {
      "epoch": 0.29,
      "grad_norm": 34.08661651611328,
      "kl": 0.0,
      "learning_rate": 3.5540434441245745e-07,
      "logps/chosen": -189.98110961914062,
      "logps/rejected": -253.66513061523438,
      "loss": 0.3135,
      "rewards/chosen": 0.5056968927383423,
      "rewards/margins": 3.583799362182617,
      "rewards/rejected": -3.0781025886535645,
      "step": 1105
    },
    {
      "epoch": 0.29,
      "grad_norm": 26.737289428710938,
      "kl": 0.0,
      "learning_rate": 3.552734886155457e-07,
      "logps/chosen": -213.71267700195312,
      "logps/rejected": -161.48214721679688,
      "loss": 0.371,
      "rewards/chosen": -0.6218294501304626,
      "rewards/margins": 1.985694408416748,
      "rewards/rejected": -2.6075239181518555,
      "step": 1106
    },
    {
      "epoch": 0.29,
      "grad_norm": 38.503726959228516,
      "kl": 0.0,
      "learning_rate": 3.5514263281863385e-07,
      "logps/chosen": -183.30770874023438,
      "logps/rejected": -271.33172607421875,
      "loss": 0.3733,
      "rewards/chosen": -0.9077540636062622,
      "rewards/margins": 3.8997726440429688,
      "rewards/rejected": -4.807526588439941,
      "step": 1107
    },
    {
      "epoch": 0.29,
      "grad_norm": 32.68248748779297,
      "kl": 0.0,
      "learning_rate": 3.5501177702172204e-07,
      "logps/chosen": -263.5723876953125,
      "logps/rejected": -320.6473388671875,
      "loss": 0.472,
      "rewards/chosen": -1.8644930124282837,
      "rewards/margins": 2.3240761756896973,
      "rewards/rejected": -4.188569068908691,
      "step": 1108
    },
    {
      "epoch": 0.29,
      "grad_norm": 36.400020599365234,
      "kl": 0.0,
      "learning_rate": 3.5488092122481024e-07,
      "logps/chosen": -245.26560974121094,
      "logps/rejected": -276.50933837890625,
      "loss": 0.3022,
      "rewards/chosen": 0.9690700769424438,
      "rewards/margins": 3.6182336807250977,
      "rewards/rejected": -2.6491634845733643,
      "step": 1109
    },
    {
      "epoch": 0.29,
      "grad_norm": 39.89811706542969,
      "kl": 0.0,
      "learning_rate": 3.5475006542789843e-07,
      "logps/chosen": -296.92352294921875,
      "logps/rejected": -367.4890441894531,
      "loss": 0.3288,
      "rewards/chosen": -1.7777142524719238,
      "rewards/margins": 3.5788440704345703,
      "rewards/rejected": -5.356558322906494,
      "step": 1110
    },
    {
      "epoch": 0.29,
      "grad_norm": 34.54069900512695,
      "kl": 0.0,
      "learning_rate": 3.5461920963098663e-07,
      "logps/chosen": -204.82083129882812,
      "logps/rejected": -325.15386962890625,
      "loss": 0.2518,
      "rewards/chosen": 0.557120680809021,
      "rewards/margins": 3.710430145263672,
      "rewards/rejected": -3.1533093452453613,
      "step": 1111
    },
    {
      "epoch": 0.29,
      "grad_norm": 30.902477264404297,
      "kl": 0.0,
      "learning_rate": 3.544883538340748e-07,
      "logps/chosen": -275.5830993652344,
      "logps/rejected": -183.99923706054688,
      "loss": 0.1808,
      "rewards/chosen": 1.1739755868911743,
      "rewards/margins": 4.945966720581055,
      "rewards/rejected": -3.77199125289917,
      "step": 1112
    },
    {
      "epoch": 0.29,
      "grad_norm": 30.833444595336914,
      "kl": 0.0,
      "learning_rate": 3.54357498037163e-07,
      "logps/chosen": -201.6013946533203,
      "logps/rejected": -168.2086944580078,
      "loss": 0.1972,
      "rewards/chosen": 0.5766014456748962,
      "rewards/margins": 3.2745020389556885,
      "rewards/rejected": -2.6979005336761475,
      "step": 1113
    },
    {
      "epoch": 0.29,
      "grad_norm": 26.51701545715332,
      "kl": 0.0,
      "learning_rate": 3.542266422402512e-07,
      "logps/chosen": -208.7091522216797,
      "logps/rejected": -217.86351013183594,
      "loss": 0.1889,
      "rewards/chosen": 1.0350052118301392,
      "rewards/margins": 4.19869327545166,
      "rewards/rejected": -3.1636881828308105,
      "step": 1114
    },
    {
      "epoch": 0.29,
      "grad_norm": 30.602649688720703,
      "kl": 0.0,
      "learning_rate": 3.540957864433394e-07,
      "logps/chosen": -174.76805114746094,
      "logps/rejected": -262.15087890625,
      "loss": 0.1866,
      "rewards/chosen": 0.5281372666358948,
      "rewards/margins": 5.422541618347168,
      "rewards/rejected": -4.894404411315918,
      "step": 1115
    },
    {
      "epoch": 0.29,
      "grad_norm": 23.70747184753418,
      "kl": 0.0,
      "learning_rate": 3.539649306464276e-07,
      "logps/chosen": -151.63156127929688,
      "logps/rejected": -231.32972717285156,
      "loss": 0.2839,
      "rewards/chosen": -0.6982941627502441,
      "rewards/margins": 2.6295316219329834,
      "rewards/rejected": -3.3278257846832275,
      "step": 1116
    },
    {
      "epoch": 0.29,
      "grad_norm": 37.54524230957031,
      "kl": 0.0,
      "learning_rate": 3.538340748495158e-07,
      "logps/chosen": -252.19773864746094,
      "logps/rejected": -178.48255920410156,
      "loss": 0.3343,
      "rewards/chosen": -1.1702947616577148,
      "rewards/margins": 1.391571044921875,
      "rewards/rejected": -2.56186580657959,
      "step": 1117
    },
    {
      "epoch": 0.29,
      "grad_norm": 31.71670913696289,
      "kl": 0.0,
      "learning_rate": 3.53703219052604e-07,
      "logps/chosen": -174.445068359375,
      "logps/rejected": -246.0499725341797,
      "loss": 0.4527,
      "rewards/chosen": -0.7562252283096313,
      "rewards/margins": 2.1291980743408203,
      "rewards/rejected": -2.885423183441162,
      "step": 1118
    },
    {
      "epoch": 0.29,
      "grad_norm": 33.092411041259766,
      "kl": 0.0,
      "learning_rate": 3.5357236325569225e-07,
      "logps/chosen": -212.7728729248047,
      "logps/rejected": -231.80722045898438,
      "loss": 0.3066,
      "rewards/chosen": -0.33431848883628845,
      "rewards/margins": 2.203496217727661,
      "rewards/rejected": -2.5378146171569824,
      "step": 1119
    },
    {
      "epoch": 0.29,
      "grad_norm": 44.41012954711914,
      "kl": 0.0,
      "learning_rate": 3.5344150745878045e-07,
      "logps/chosen": -224.70591735839844,
      "logps/rejected": -218.91012573242188,
      "loss": 0.3321,
      "rewards/chosen": 0.26089727878570557,
      "rewards/margins": 2.4730587005615234,
      "rewards/rejected": -2.2121613025665283,
      "step": 1120
    },
    {
      "epoch": 0.29,
      "grad_norm": 36.9603385925293,
      "kl": 0.0,
      "learning_rate": 3.5331065166186864e-07,
      "logps/chosen": -256.7554016113281,
      "logps/rejected": -261.4122314453125,
      "loss": 0.3427,
      "rewards/chosen": -0.10907495021820068,
      "rewards/margins": 3.4610395431518555,
      "rewards/rejected": -3.5701146125793457,
      "step": 1121
    },
    {
      "epoch": 0.29,
      "grad_norm": 31.963926315307617,
      "kl": 0.0,
      "learning_rate": 3.531797958649568e-07,
      "logps/chosen": -200.814453125,
      "logps/rejected": -228.10572814941406,
      "loss": 0.3222,
      "rewards/chosen": -0.9516986608505249,
      "rewards/margins": 2.583322525024414,
      "rewards/rejected": -3.5350210666656494,
      "step": 1122
    },
    {
      "epoch": 0.29,
      "grad_norm": 41.13035583496094,
      "kl": 0.0,
      "learning_rate": 3.53048940068045e-07,
      "logps/chosen": -182.65289306640625,
      "logps/rejected": -259.0008850097656,
      "loss": 0.2592,
      "rewards/chosen": 0.5731912851333618,
      "rewards/margins": 5.2247538566589355,
      "rewards/rejected": -4.651562690734863,
      "step": 1123
    },
    {
      "epoch": 0.29,
      "grad_norm": 34.95411682128906,
      "kl": 0.0,
      "learning_rate": 3.529180842711332e-07,
      "logps/chosen": -268.8537292480469,
      "logps/rejected": -250.743896484375,
      "loss": 0.2416,
      "rewards/chosen": -0.5174262523651123,
      "rewards/margins": 3.3615188598632812,
      "rewards/rejected": -3.8789451122283936,
      "step": 1124
    },
    {
      "epoch": 0.29,
      "grad_norm": 30.07512855529785,
      "kl": 0.0,
      "learning_rate": 3.527872284742214e-07,
      "logps/chosen": -233.08389282226562,
      "logps/rejected": -165.42440795898438,
      "loss": 0.3159,
      "rewards/chosen": 0.16913124918937683,
      "rewards/margins": 3.1799628734588623,
      "rewards/rejected": -3.010831594467163,
      "step": 1125
    },
    {
      "epoch": 0.29,
      "grad_norm": 32.58613586425781,
      "kl": 0.0,
      "learning_rate": 3.5265637267730957e-07,
      "logps/chosen": -214.123779296875,
      "logps/rejected": -241.11907958984375,
      "loss": 0.2387,
      "rewards/chosen": -0.24165599048137665,
      "rewards/margins": 2.814639091491699,
      "rewards/rejected": -3.056295156478882,
      "step": 1126
    },
    {
      "epoch": 0.29,
      "grad_norm": 32.11212158203125,
      "kl": 0.0,
      "learning_rate": 3.5252551688039777e-07,
      "logps/chosen": -187.3433380126953,
      "logps/rejected": -289.6556091308594,
      "loss": 0.277,
      "rewards/chosen": 0.13912898302078247,
      "rewards/margins": 4.586435794830322,
      "rewards/rejected": -4.4473066329956055,
      "step": 1127
    },
    {
      "epoch": 0.3,
      "grad_norm": 28.93172836303711,
      "kl": 0.0,
      "learning_rate": 3.5239466108348596e-07,
      "logps/chosen": -157.70382690429688,
      "logps/rejected": -244.54635620117188,
      "loss": 0.3391,
      "rewards/chosen": 0.6886506080627441,
      "rewards/margins": 4.031140327453613,
      "rewards/rejected": -3.3424899578094482,
      "step": 1128
    },
    {
      "epoch": 0.3,
      "grad_norm": 32.03681945800781,
      "kl": 0.0,
      "learning_rate": 3.5226380528657416e-07,
      "logps/chosen": -256.11553955078125,
      "logps/rejected": -284.6397399902344,
      "loss": 0.2272,
      "rewards/chosen": 0.2986472249031067,
      "rewards/margins": 6.660142421722412,
      "rewards/rejected": -6.361495018005371,
      "step": 1129
    },
    {
      "epoch": 0.3,
      "grad_norm": 39.74080276489258,
      "kl": 0.0,
      "learning_rate": 3.5213294948966236e-07,
      "logps/chosen": -210.30718994140625,
      "logps/rejected": -232.85952758789062,
      "loss": 0.3897,
      "rewards/chosen": 0.4224991798400879,
      "rewards/margins": 2.9178881645202637,
      "rewards/rejected": -2.495388984680176,
      "step": 1130
    },
    {
      "epoch": 0.3,
      "grad_norm": 27.7830753326416,
      "kl": 0.0,
      "learning_rate": 3.520020936927506e-07,
      "logps/chosen": -158.5603790283203,
      "logps/rejected": -174.37652587890625,
      "loss": 0.4527,
      "rewards/chosen": -0.919854998588562,
      "rewards/margins": 1.2761861085891724,
      "rewards/rejected": -2.1960411071777344,
      "step": 1131
    },
    {
      "epoch": 0.3,
      "grad_norm": 50.690189361572266,
      "kl": 0.0,
      "learning_rate": 3.518712378958388e-07,
      "logps/chosen": -222.47323608398438,
      "logps/rejected": -234.22178649902344,
      "loss": 0.3715,
      "rewards/chosen": 0.2417455017566681,
      "rewards/margins": 4.570836544036865,
      "rewards/rejected": -4.3290910720825195,
      "step": 1132
    },
    {
      "epoch": 0.3,
      "grad_norm": 41.496700286865234,
      "kl": 0.0,
      "learning_rate": 3.51740382098927e-07,
      "logps/chosen": -145.2799835205078,
      "logps/rejected": -285.2710266113281,
      "loss": 0.2416,
      "rewards/chosen": -0.3389700949192047,
      "rewards/margins": 2.8799259662628174,
      "rewards/rejected": -3.2188961505889893,
      "step": 1133
    },
    {
      "epoch": 0.3,
      "grad_norm": 26.7552547454834,
      "kl": 0.0,
      "learning_rate": 3.516095263020152e-07,
      "logps/chosen": -165.4346466064453,
      "logps/rejected": -254.95603942871094,
      "loss": 0.3079,
      "rewards/chosen": 0.22509948909282684,
      "rewards/margins": 5.674718379974365,
      "rewards/rejected": -5.449618816375732,
      "step": 1134
    },
    {
      "epoch": 0.3,
      "grad_norm": 36.21974182128906,
      "kl": 0.0,
      "learning_rate": 3.514786705051034e-07,
      "logps/chosen": -269.6980895996094,
      "logps/rejected": -245.57833862304688,
      "loss": 0.3177,
      "rewards/chosen": 1.210508942604065,
      "rewards/margins": 3.6992435455322266,
      "rewards/rejected": -2.488734483718872,
      "step": 1135
    },
    {
      "epoch": 0.3,
      "grad_norm": 40.173545837402344,
      "kl": 0.0,
      "learning_rate": 3.513478147081916e-07,
      "logps/chosen": -208.01304626464844,
      "logps/rejected": -236.66079711914062,
      "loss": 0.2758,
      "rewards/chosen": 0.11015648394823074,
      "rewards/margins": 3.861097812652588,
      "rewards/rejected": -3.750941276550293,
      "step": 1136
    },
    {
      "epoch": 0.3,
      "grad_norm": 45.25703811645508,
      "kl": 0.0,
      "learning_rate": 3.512169589112798e-07,
      "logps/chosen": -177.85910034179688,
      "logps/rejected": -201.8383026123047,
      "loss": 0.3978,
      "rewards/chosen": -0.15296238660812378,
      "rewards/margins": 2.0827102661132812,
      "rewards/rejected": -2.23567271232605,
      "step": 1137
    },
    {
      "epoch": 0.3,
      "grad_norm": 27.257539749145508,
      "kl": 0.0,
      "learning_rate": 3.510861031143679e-07,
      "logps/chosen": -211.14024353027344,
      "logps/rejected": -286.07354736328125,
      "loss": 0.2634,
      "rewards/chosen": -0.6804866790771484,
      "rewards/margins": 2.8411343097686768,
      "rewards/rejected": -3.521620988845825,
      "step": 1138
    },
    {
      "epoch": 0.3,
      "grad_norm": 34.01937484741211,
      "kl": 0.0,
      "learning_rate": 3.509552473174561e-07,
      "logps/chosen": -215.672607421875,
      "logps/rejected": -360.40460205078125,
      "loss": 0.341,
      "rewards/chosen": 0.5583714246749878,
      "rewards/margins": 4.312696933746338,
      "rewards/rejected": -3.7543256282806396,
      "step": 1139
    },
    {
      "epoch": 0.3,
      "grad_norm": 30.42136573791504,
      "kl": 0.0,
      "learning_rate": 3.508243915205443e-07,
      "logps/chosen": -247.34075927734375,
      "logps/rejected": -221.0343475341797,
      "loss": 0.285,
      "rewards/chosen": 0.6125203371047974,
      "rewards/margins": 4.340419292449951,
      "rewards/rejected": -3.7278990745544434,
      "step": 1140
    },
    {
      "epoch": 0.3,
      "grad_norm": 42.30283737182617,
      "kl": 0.0,
      "learning_rate": 3.506935357236325e-07,
      "logps/chosen": -246.60385131835938,
      "logps/rejected": -209.10311889648438,
      "loss": 0.4,
      "rewards/chosen": -1.0343382358551025,
      "rewards/margins": 1.872960090637207,
      "rewards/rejected": -2.9072983264923096,
      "step": 1141
    },
    {
      "epoch": 0.3,
      "grad_norm": 32.05286407470703,
      "kl": 0.0,
      "learning_rate": 3.505626799267207e-07,
      "logps/chosen": -206.08543395996094,
      "logps/rejected": -248.9346923828125,
      "loss": 0.2648,
      "rewards/chosen": -1.6441336870193481,
      "rewards/margins": 0.37358176708221436,
      "rewards/rejected": -2.0177154541015625,
      "step": 1142
    },
    {
      "epoch": 0.3,
      "grad_norm": 34.85634994506836,
      "kl": 0.0,
      "learning_rate": 3.504318241298089e-07,
      "logps/chosen": -234.87477111816406,
      "logps/rejected": -354.82647705078125,
      "loss": 0.3797,
      "rewards/chosen": -0.04308699071407318,
      "rewards/margins": 4.236164569854736,
      "rewards/rejected": -4.279251575469971,
      "step": 1143
    },
    {
      "epoch": 0.3,
      "grad_norm": 38.970787048339844,
      "kl": 0.0,
      "learning_rate": 3.5030096833289715e-07,
      "logps/chosen": -294.22186279296875,
      "logps/rejected": -266.58087158203125,
      "loss": 0.3057,
      "rewards/chosen": 0.0297364741563797,
      "rewards/margins": 2.739408016204834,
      "rewards/rejected": -2.7096714973449707,
      "step": 1144
    },
    {
      "epoch": 0.3,
      "grad_norm": 27.94097328186035,
      "kl": 0.0,
      "learning_rate": 3.5017011253598535e-07,
      "logps/chosen": -166.2245330810547,
      "logps/rejected": -200.76283264160156,
      "loss": 0.3123,
      "rewards/chosen": -0.9906286001205444,
      "rewards/margins": 2.226215362548828,
      "rewards/rejected": -3.216843843460083,
      "step": 1145
    },
    {
      "epoch": 0.3,
      "grad_norm": 39.00582504272461,
      "kl": 0.0,
      "learning_rate": 3.5003925673907355e-07,
      "logps/chosen": -161.9521942138672,
      "logps/rejected": -197.63607788085938,
      "loss": 0.3922,
      "rewards/chosen": -0.6772257089614868,
      "rewards/margins": 1.6258729696273804,
      "rewards/rejected": -2.303098678588867,
      "step": 1146
    },
    {
      "epoch": 0.3,
      "grad_norm": 33.07608413696289,
      "kl": 0.0,
      "learning_rate": 3.4990840094216174e-07,
      "logps/chosen": -215.2793731689453,
      "logps/rejected": -267.1920471191406,
      "loss": 0.2962,
      "rewards/chosen": -0.22169288992881775,
      "rewards/margins": 3.129300832748413,
      "rewards/rejected": -3.3509936332702637,
      "step": 1147
    },
    {
      "epoch": 0.3,
      "grad_norm": 26.845361709594727,
      "kl": 0.0,
      "learning_rate": 3.4977754514524994e-07,
      "logps/chosen": -178.5347900390625,
      "logps/rejected": -216.12083435058594,
      "loss": 0.3127,
      "rewards/chosen": 0.09511661529541016,
      "rewards/margins": 4.102299213409424,
      "rewards/rejected": -4.007182598114014,
      "step": 1148
    },
    {
      "epoch": 0.3,
      "grad_norm": 27.727128982543945,
      "kl": 0.0,
      "learning_rate": 3.4964668934833813e-07,
      "logps/chosen": -416.78363037109375,
      "logps/rejected": -312.7774353027344,
      "loss": 0.307,
      "rewards/chosen": -3.3301751613616943,
      "rewards/margins": 0.24054336547851562,
      "rewards/rejected": -3.57071852684021,
      "step": 1149
    },
    {
      "epoch": 0.3,
      "grad_norm": 39.83740997314453,
      "kl": 0.0,
      "learning_rate": 3.4951583355142633e-07,
      "logps/chosen": -150.38316345214844,
      "logps/rejected": -260.5281066894531,
      "loss": 0.4156,
      "rewards/chosen": -0.8923667073249817,
      "rewards/margins": 1.060227394104004,
      "rewards/rejected": -1.9525940418243408,
      "step": 1150
    },
    {
      "epoch": 0.3,
      "grad_norm": 29.897340774536133,
      "kl": 0.0,
      "learning_rate": 3.4938497775451453e-07,
      "logps/chosen": -259.205810546875,
      "logps/rejected": -210.67149353027344,
      "loss": 0.2528,
      "rewards/chosen": 0.46551254391670227,
      "rewards/margins": 2.836808443069458,
      "rewards/rejected": -2.371295928955078,
      "step": 1151
    },
    {
      "epoch": 0.3,
      "grad_norm": 31.324119567871094,
      "kl": 0.0,
      "learning_rate": 3.492541219576027e-07,
      "logps/chosen": -177.623779296875,
      "logps/rejected": -181.99087524414062,
      "loss": 0.2261,
      "rewards/chosen": 0.7093040347099304,
      "rewards/margins": 3.908512830734253,
      "rewards/rejected": -3.1992087364196777,
      "step": 1152
    },
    {
      "epoch": 0.3,
      "grad_norm": 62.75912094116211,
      "kl": 0.0,
      "learning_rate": 3.4912326616069087e-07,
      "logps/chosen": -243.6322479248047,
      "logps/rejected": -309.91485595703125,
      "loss": 0.4145,
      "rewards/chosen": -1.3495761156082153,
      "rewards/margins": 1.1117101907730103,
      "rewards/rejected": -2.4612863063812256,
      "step": 1153
    },
    {
      "epoch": 0.3,
      "grad_norm": 24.441761016845703,
      "kl": 0.0,
      "learning_rate": 3.4899241036377906e-07,
      "logps/chosen": -137.18801879882812,
      "logps/rejected": -218.70822143554688,
      "loss": 0.4508,
      "rewards/chosen": -1.321584701538086,
      "rewards/margins": 2.427882671356201,
      "rewards/rejected": -3.749467372894287,
      "step": 1154
    },
    {
      "epoch": 0.3,
      "grad_norm": 26.579702377319336,
      "kl": 0.0,
      "learning_rate": 3.4886155456686726e-07,
      "logps/chosen": -259.0023193359375,
      "logps/rejected": -244.285400390625,
      "loss": 0.3294,
      "rewards/chosen": -0.7168277502059937,
      "rewards/margins": 3.6419262886047363,
      "rewards/rejected": -4.3587541580200195,
      "step": 1155
    },
    {
      "epoch": 0.3,
      "grad_norm": 34.10115051269531,
      "kl": 0.0,
      "learning_rate": 3.4873069876995545e-07,
      "logps/chosen": -196.53575134277344,
      "logps/rejected": -263.05865478515625,
      "loss": 0.3352,
      "rewards/chosen": 0.4309160113334656,
      "rewards/margins": 4.168835639953613,
      "rewards/rejected": -3.737919569015503,
      "step": 1156
    },
    {
      "epoch": 0.3,
      "grad_norm": 31.891420364379883,
      "kl": 0.0,
      "learning_rate": 3.485998429730437e-07,
      "logps/chosen": -220.2379608154297,
      "logps/rejected": -201.052734375,
      "loss": 0.3821,
      "rewards/chosen": -2.2106711864471436,
      "rewards/margins": 1.8266870975494385,
      "rewards/rejected": -4.037358283996582,
      "step": 1157
    },
    {
      "epoch": 0.3,
      "grad_norm": 27.060434341430664,
      "kl": 0.0,
      "learning_rate": 3.484689871761319e-07,
      "logps/chosen": -279.9279479980469,
      "logps/rejected": -191.00369262695312,
      "loss": 0.2911,
      "rewards/chosen": -1.332848072052002,
      "rewards/margins": 2.7451157569885254,
      "rewards/rejected": -4.077963829040527,
      "step": 1158
    },
    {
      "epoch": 0.3,
      "grad_norm": 34.95372009277344,
      "kl": 0.0,
      "learning_rate": 3.483381313792201e-07,
      "logps/chosen": -285.4616394042969,
      "logps/rejected": -143.6122283935547,
      "loss": 0.2803,
      "rewards/chosen": -0.22251377999782562,
      "rewards/margins": 3.0675089359283447,
      "rewards/rejected": -3.290022611618042,
      "step": 1159
    },
    {
      "epoch": 0.3,
      "grad_norm": 34.428009033203125,
      "kl": 0.0,
      "learning_rate": 3.482072755823083e-07,
      "logps/chosen": -238.25436401367188,
      "logps/rejected": -227.17388916015625,
      "loss": 0.2845,
      "rewards/chosen": 0.45113271474838257,
      "rewards/margins": 3.127865791320801,
      "rewards/rejected": -2.6767330169677734,
      "step": 1160
    },
    {
      "epoch": 0.3,
      "grad_norm": 29.634763717651367,
      "kl": 0.0,
      "learning_rate": 3.480764197853965e-07,
      "logps/chosen": -255.47523498535156,
      "logps/rejected": -360.51165771484375,
      "loss": 0.2351,
      "rewards/chosen": 0.6718313097953796,
      "rewards/margins": 3.9802281856536865,
      "rewards/rejected": -3.308396816253662,
      "step": 1161
    },
    {
      "epoch": 0.3,
      "grad_norm": 32.4753303527832,
      "kl": 0.0,
      "learning_rate": 3.479455639884847e-07,
      "logps/chosen": -266.46209716796875,
      "logps/rejected": -259.5201110839844,
      "loss": 0.2627,
      "rewards/chosen": -1.0025486946105957,
      "rewards/margins": 2.339106321334839,
      "rewards/rejected": -3.3416550159454346,
      "step": 1162
    },
    {
      "epoch": 0.3,
      "grad_norm": 35.363380432128906,
      "kl": 0.0,
      "learning_rate": 3.478147081915729e-07,
      "logps/chosen": -265.4547424316406,
      "logps/rejected": -208.99771118164062,
      "loss": 0.3466,
      "rewards/chosen": 0.7662509083747864,
      "rewards/margins": 2.833404541015625,
      "rewards/rejected": -2.0671536922454834,
      "step": 1163
    },
    {
      "epoch": 0.3,
      "grad_norm": 37.66944122314453,
      "kl": 0.0,
      "learning_rate": 3.476838523946611e-07,
      "logps/chosen": -204.93299865722656,
      "logps/rejected": -207.90904235839844,
      "loss": 0.3169,
      "rewards/chosen": -1.2671194076538086,
      "rewards/margins": 0.4675861597061157,
      "rewards/rejected": -1.7347055673599243,
      "step": 1164
    },
    {
      "epoch": 0.3,
      "grad_norm": 35.521873474121094,
      "kl": 0.0,
      "learning_rate": 3.4755299659774927e-07,
      "logps/chosen": -196.3144073486328,
      "logps/rejected": -231.2823028564453,
      "loss": 0.3089,
      "rewards/chosen": 1.3642683029174805,
      "rewards/margins": 4.673673629760742,
      "rewards/rejected": -3.309405565261841,
      "step": 1165
    },
    {
      "epoch": 0.31,
      "grad_norm": 37.934539794921875,
      "kl": 0.0,
      "learning_rate": 3.4742214080083747e-07,
      "logps/chosen": -213.7936553955078,
      "logps/rejected": -251.3383026123047,
      "loss": 0.3177,
      "rewards/chosen": 0.11157000809907913,
      "rewards/margins": 3.19706392288208,
      "rewards/rejected": -3.085493803024292,
      "step": 1166
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.21260070800781,
      "kl": 0.0,
      "learning_rate": 3.4729128500392566e-07,
      "logps/chosen": -254.15847778320312,
      "logps/rejected": -162.43576049804688,
      "loss": 0.434,
      "rewards/chosen": -0.8147860765457153,
      "rewards/margins": 2.457930564880371,
      "rewards/rejected": -3.272716760635376,
      "step": 1167
    },
    {
      "epoch": 0.31,
      "grad_norm": 39.650821685791016,
      "kl": 0.0,
      "learning_rate": 3.471604292070138e-07,
      "logps/chosen": -192.7276153564453,
      "logps/rejected": -248.39785766601562,
      "loss": 0.3297,
      "rewards/chosen": -0.7319798469543457,
      "rewards/margins": 2.5251142978668213,
      "rewards/rejected": -3.257094144821167,
      "step": 1168
    },
    {
      "epoch": 0.31,
      "grad_norm": 33.93229675292969,
      "kl": 0.0,
      "learning_rate": 3.47029573410102e-07,
      "logps/chosen": -265.5296936035156,
      "logps/rejected": -164.14395141601562,
      "loss": 0.3404,
      "rewards/chosen": -0.1786927878856659,
      "rewards/margins": 1.8975303173065186,
      "rewards/rejected": -2.076223134994507,
      "step": 1169
    },
    {
      "epoch": 0.31,
      "grad_norm": 33.14167785644531,
      "kl": 0.0,
      "learning_rate": 3.4689871761319025e-07,
      "logps/chosen": -175.51358032226562,
      "logps/rejected": -238.18153381347656,
      "loss": 0.3263,
      "rewards/chosen": -0.6082727909088135,
      "rewards/margins": 2.283324956893921,
      "rewards/rejected": -2.8915977478027344,
      "step": 1170
    },
    {
      "epoch": 0.31,
      "grad_norm": 32.023223876953125,
      "kl": 0.0,
      "learning_rate": 3.4676786181627845e-07,
      "logps/chosen": -231.79791259765625,
      "logps/rejected": -228.96759033203125,
      "loss": 0.4064,
      "rewards/chosen": -0.6112353801727295,
      "rewards/margins": 2.084949016571045,
      "rewards/rejected": -2.6961843967437744,
      "step": 1171
    },
    {
      "epoch": 0.31,
      "grad_norm": 30.645387649536133,
      "kl": 0.0,
      "learning_rate": 3.4663700601936664e-07,
      "logps/chosen": -229.69735717773438,
      "logps/rejected": -265.0238037109375,
      "loss": 0.3055,
      "rewards/chosen": 0.29142194986343384,
      "rewards/margins": 3.7708466053009033,
      "rewards/rejected": -3.4794247150421143,
      "step": 1172
    },
    {
      "epoch": 0.31,
      "grad_norm": 32.59995651245117,
      "kl": 0.0,
      "learning_rate": 3.4650615022245484e-07,
      "logps/chosen": -262.04833984375,
      "logps/rejected": -277.7827453613281,
      "loss": 0.2863,
      "rewards/chosen": -1.32673978805542,
      "rewards/margins": 2.91971492767334,
      "rewards/rejected": -4.24645471572876,
      "step": 1173
    },
    {
      "epoch": 0.31,
      "grad_norm": 35.03938674926758,
      "kl": 0.0,
      "learning_rate": 3.4637529442554304e-07,
      "logps/chosen": -189.8060760498047,
      "logps/rejected": -258.8448181152344,
      "loss": 0.3792,
      "rewards/chosen": -0.6095502376556396,
      "rewards/margins": 1.7947406768798828,
      "rewards/rejected": -2.4042909145355225,
      "step": 1174
    },
    {
      "epoch": 0.31,
      "grad_norm": 31.531982421875,
      "kl": 0.0,
      "learning_rate": 3.4624443862863123e-07,
      "logps/chosen": -160.89028930664062,
      "logps/rejected": -330.67138671875,
      "loss": 0.2391,
      "rewards/chosen": 0.9728485941886902,
      "rewards/margins": 4.4766974449157715,
      "rewards/rejected": -3.5038487911224365,
      "step": 1175
    },
    {
      "epoch": 0.31,
      "grad_norm": 24.545883178710938,
      "kl": 0.0,
      "learning_rate": 3.4611358283171943e-07,
      "logps/chosen": -182.7744598388672,
      "logps/rejected": -288.28857421875,
      "loss": 0.3344,
      "rewards/chosen": -0.42739564180374146,
      "rewards/margins": 4.145176410675049,
      "rewards/rejected": -4.572572231292725,
      "step": 1176
    },
    {
      "epoch": 0.31,
      "grad_norm": 31.24026870727539,
      "kl": 0.0,
      "learning_rate": 3.459827270348076e-07,
      "logps/chosen": -211.71585083007812,
      "logps/rejected": -207.3837890625,
      "loss": 0.3375,
      "rewards/chosen": 0.5522348880767822,
      "rewards/margins": 3.1370089054107666,
      "rewards/rejected": -2.5847740173339844,
      "step": 1177
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.2593879699707,
      "kl": 0.0,
      "learning_rate": 3.458518712378958e-07,
      "logps/chosen": -200.26980590820312,
      "logps/rejected": -212.49972534179688,
      "loss": 0.3796,
      "rewards/chosen": -0.28433290123939514,
      "rewards/margins": 2.662614107131958,
      "rewards/rejected": -2.9469470977783203,
      "step": 1178
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.07558059692383,
      "kl": 0.0,
      "learning_rate": 3.45721015440984e-07,
      "logps/chosen": -190.99557495117188,
      "logps/rejected": -223.3245086669922,
      "loss": 0.2997,
      "rewards/chosen": 1.9847080707550049,
      "rewards/margins": 4.220609664916992,
      "rewards/rejected": -2.235901355743408,
      "step": 1179
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.867740631103516,
      "kl": 0.0,
      "learning_rate": 3.455901596440722e-07,
      "logps/chosen": -261.5715026855469,
      "logps/rejected": -251.77923583984375,
      "loss": 0.2971,
      "rewards/chosen": 0.7804626822471619,
      "rewards/margins": 4.566412925720215,
      "rewards/rejected": -3.7859504222869873,
      "step": 1180
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.3149299621582,
      "kl": 0.0,
      "learning_rate": 3.454593038471604e-07,
      "logps/chosen": -183.1739501953125,
      "logps/rejected": -226.0718231201172,
      "loss": 0.3276,
      "rewards/chosen": -0.10726749897003174,
      "rewards/margins": 2.843820095062256,
      "rewards/rejected": -2.951087713241577,
      "step": 1181
    },
    {
      "epoch": 0.31,
      "grad_norm": 26.37593650817871,
      "kl": 0.0,
      "learning_rate": 3.4532844805024866e-07,
      "logps/chosen": -159.28369140625,
      "logps/rejected": -247.39651489257812,
      "loss": 0.3558,
      "rewards/chosen": -0.06649637222290039,
      "rewards/margins": 3.216395854949951,
      "rewards/rejected": -3.2828922271728516,
      "step": 1182
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.705955505371094,
      "kl": 0.0,
      "learning_rate": 3.4519759225333685e-07,
      "logps/chosen": -212.6090545654297,
      "logps/rejected": -210.63018798828125,
      "loss": 0.3391,
      "rewards/chosen": -0.04410066828131676,
      "rewards/margins": 3.6254005432128906,
      "rewards/rejected": -3.669501304626465,
      "step": 1183
    },
    {
      "epoch": 0.31,
      "grad_norm": 40.53861618041992,
      "kl": 0.0,
      "learning_rate": 3.45066736456425e-07,
      "logps/chosen": -221.90635681152344,
      "logps/rejected": -186.09765625,
      "loss": 0.4034,
      "rewards/chosen": -0.9847675561904907,
      "rewards/margins": 1.2683357000350952,
      "rewards/rejected": -2.253103256225586,
      "step": 1184
    },
    {
      "epoch": 0.31,
      "grad_norm": 35.10524368286133,
      "kl": 0.0,
      "learning_rate": 3.449358806595132e-07,
      "logps/chosen": -174.99594116210938,
      "logps/rejected": -395.3352355957031,
      "loss": 0.4667,
      "rewards/chosen": -0.811806321144104,
      "rewards/margins": 4.3953328132629395,
      "rewards/rejected": -5.207139015197754,
      "step": 1185
    },
    {
      "epoch": 0.31,
      "grad_norm": 29.58946990966797,
      "kl": 0.0,
      "learning_rate": 3.448050248626014e-07,
      "logps/chosen": -188.7567138671875,
      "logps/rejected": -224.7203826904297,
      "loss": 0.3357,
      "rewards/chosen": -0.04228636622428894,
      "rewards/margins": 2.5439279079437256,
      "rewards/rejected": -2.586214303970337,
      "step": 1186
    },
    {
      "epoch": 0.31,
      "grad_norm": 28.963594436645508,
      "kl": 0.0,
      "learning_rate": 3.446741690656896e-07,
      "logps/chosen": -222.30162048339844,
      "logps/rejected": -302.3878173828125,
      "loss": 0.2565,
      "rewards/chosen": -0.7175504565238953,
      "rewards/margins": 2.6879355907440186,
      "rewards/rejected": -3.4054861068725586,
      "step": 1187
    },
    {
      "epoch": 0.31,
      "grad_norm": 35.3771858215332,
      "kl": 0.0,
      "learning_rate": 3.445433132687778e-07,
      "logps/chosen": -156.33737182617188,
      "logps/rejected": -252.5842742919922,
      "loss": 0.251,
      "rewards/chosen": 1.3936952352523804,
      "rewards/margins": 4.000040531158447,
      "rewards/rejected": -2.6063451766967773,
      "step": 1188
    },
    {
      "epoch": 0.31,
      "grad_norm": 29.472457885742188,
      "kl": 0.0,
      "learning_rate": 3.44412457471866e-07,
      "logps/chosen": -253.04592895507812,
      "logps/rejected": -221.87600708007812,
      "loss": 0.2159,
      "rewards/chosen": 1.3840115070343018,
      "rewards/margins": 5.230579376220703,
      "rewards/rejected": -3.8465676307678223,
      "step": 1189
    },
    {
      "epoch": 0.31,
      "grad_norm": 35.125526428222656,
      "kl": 0.0,
      "learning_rate": 3.442816016749542e-07,
      "logps/chosen": -195.70407104492188,
      "logps/rejected": -204.52281188964844,
      "loss": 0.3822,
      "rewards/chosen": 0.1059611365199089,
      "rewards/margins": 2.519016742706299,
      "rewards/rejected": -2.413055658340454,
      "step": 1190
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.76149368286133,
      "kl": 0.0,
      "learning_rate": 3.4415074587804237e-07,
      "logps/chosen": -146.63796997070312,
      "logps/rejected": -223.8983917236328,
      "loss": 0.2817,
      "rewards/chosen": 0.023099077865481377,
      "rewards/margins": 2.1824231147766113,
      "rewards/rejected": -2.1593239307403564,
      "step": 1191
    },
    {
      "epoch": 0.31,
      "grad_norm": 31.83498191833496,
      "kl": 0.0,
      "learning_rate": 3.4401989008113057e-07,
      "logps/chosen": -163.8594970703125,
      "logps/rejected": -251.00015258789062,
      "loss": 0.2421,
      "rewards/chosen": -0.8251597285270691,
      "rewards/margins": 2.2953813076019287,
      "rewards/rejected": -3.1205410957336426,
      "step": 1192
    },
    {
      "epoch": 0.31,
      "grad_norm": 39.38047790527344,
      "kl": 0.0,
      "learning_rate": 3.4388903428421876e-07,
      "logps/chosen": -245.26820373535156,
      "logps/rejected": -191.28118896484375,
      "loss": 0.2675,
      "rewards/chosen": 0.16523434221744537,
      "rewards/margins": 1.8972578048706055,
      "rewards/rejected": -1.7320234775543213,
      "step": 1193
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.019287109375,
      "kl": 0.0,
      "learning_rate": 3.4375817848730696e-07,
      "logps/chosen": -169.6418914794922,
      "logps/rejected": -283.7315368652344,
      "loss": 0.2766,
      "rewards/chosen": 0.9147642850875854,
      "rewards/margins": 3.9462924003601074,
      "rewards/rejected": -3.0315279960632324,
      "step": 1194
    },
    {
      "epoch": 0.31,
      "grad_norm": 35.77202606201172,
      "kl": 0.0,
      "learning_rate": 3.436273226903952e-07,
      "logps/chosen": -218.32711791992188,
      "logps/rejected": -284.1678161621094,
      "loss": 0.4119,
      "rewards/chosen": -0.4475761651992798,
      "rewards/margins": 3.220229148864746,
      "rewards/rejected": -3.6678051948547363,
      "step": 1195
    },
    {
      "epoch": 0.31,
      "grad_norm": 38.87939453125,
      "kl": 0.0,
      "learning_rate": 3.434964668934834e-07,
      "logps/chosen": -267.1624755859375,
      "logps/rejected": -216.53843688964844,
      "loss": 0.382,
      "rewards/chosen": -0.9228044748306274,
      "rewards/margins": 1.4545472860336304,
      "rewards/rejected": -2.377351760864258,
      "step": 1196
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.275413513183594,
      "kl": 0.0,
      "learning_rate": 3.433656110965716e-07,
      "logps/chosen": -179.17579650878906,
      "logps/rejected": -263.31060791015625,
      "loss": 0.3637,
      "rewards/chosen": -1.5491220951080322,
      "rewards/margins": 2.713186502456665,
      "rewards/rejected": -4.262308597564697,
      "step": 1197
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.126121520996094,
      "kl": 0.0,
      "learning_rate": 3.432347552996598e-07,
      "logps/chosen": -212.6999969482422,
      "logps/rejected": -227.0008544921875,
      "loss": 0.4075,
      "rewards/chosen": -0.4199311137199402,
      "rewards/margins": 3.25209903717041,
      "rewards/rejected": -3.672030210494995,
      "step": 1198
    },
    {
      "epoch": 0.31,
      "grad_norm": 34.182960510253906,
      "kl": 0.0,
      "learning_rate": 3.4310389950274794e-07,
      "logps/chosen": -250.26296997070312,
      "logps/rejected": -278.6178894042969,
      "loss": 0.2245,
      "rewards/chosen": 1.2832039594650269,
      "rewards/margins": 4.30702543258667,
      "rewards/rejected": -3.0238215923309326,
      "step": 1199
    },
    {
      "epoch": 0.31,
      "grad_norm": 28.641311645507812,
      "kl": 0.0,
      "learning_rate": 3.4297304370583614e-07,
      "logps/chosen": -316.5002136230469,
      "logps/rejected": -273.9151611328125,
      "loss": 0.2202,
      "rewards/chosen": -1.1112972497940063,
      "rewards/margins": 1.6765261888504028,
      "rewards/rejected": -2.787823438644409,
      "step": 1200
    },
    {
      "epoch": 0.31,
      "grad_norm": 32.865848541259766,
      "kl": 0.0,
      "learning_rate": 3.4284218790892433e-07,
      "logps/chosen": -256.8058166503906,
      "logps/rejected": -209.70956420898438,
      "loss": 0.2936,
      "rewards/chosen": -0.7760433554649353,
      "rewards/margins": 3.5908236503601074,
      "rewards/rejected": -4.3668670654296875,
      "step": 1201
    },
    {
      "epoch": 0.31,
      "grad_norm": 36.68121337890625,
      "kl": 0.0,
      "learning_rate": 3.4271133211201253e-07,
      "logps/chosen": -296.3019104003906,
      "logps/rejected": -201.8861083984375,
      "loss": 0.3134,
      "rewards/chosen": -1.5894598960876465,
      "rewards/margins": 1.1896445751190186,
      "rewards/rejected": -2.779104471206665,
      "step": 1202
    },
    {
      "epoch": 0.31,
      "grad_norm": 31.72292137145996,
      "kl": 0.0,
      "learning_rate": 3.425804763151007e-07,
      "logps/chosen": -211.2059783935547,
      "logps/rejected": -217.85487365722656,
      "loss": 0.1636,
      "rewards/chosen": 1.3678613901138306,
      "rewards/margins": 4.285525321960449,
      "rewards/rejected": -2.917663812637329,
      "step": 1203
    },
    {
      "epoch": 0.32,
      "grad_norm": 30.31131935119629,
      "kl": 0.0,
      "learning_rate": 3.424496205181889e-07,
      "logps/chosen": -167.01673889160156,
      "logps/rejected": -249.53805541992188,
      "loss": 0.2794,
      "rewards/chosen": 0.2843502163887024,
      "rewards/margins": 4.522599697113037,
      "rewards/rejected": -4.2382493019104,
      "step": 1204
    },
    {
      "epoch": 0.32,
      "grad_norm": 35.710227966308594,
      "kl": 0.0,
      "learning_rate": 3.423187647212771e-07,
      "logps/chosen": -276.68084716796875,
      "logps/rejected": -268.0350036621094,
      "loss": 0.3991,
      "rewards/chosen": -1.6733345985412598,
      "rewards/margins": 2.664903163909912,
      "rewards/rejected": -4.338237762451172,
      "step": 1205
    },
    {
      "epoch": 0.32,
      "grad_norm": 30.06000518798828,
      "kl": 0.0,
      "learning_rate": 3.421879089243653e-07,
      "logps/chosen": -129.31488037109375,
      "logps/rejected": -242.54222106933594,
      "loss": 0.3198,
      "rewards/chosen": 0.6263553500175476,
      "rewards/margins": 4.219228267669678,
      "rewards/rejected": -3.5928728580474854,
      "step": 1206
    },
    {
      "epoch": 0.32,
      "grad_norm": 30.132490158081055,
      "kl": 0.0,
      "learning_rate": 3.420570531274535e-07,
      "logps/chosen": -215.542236328125,
      "logps/rejected": -291.1411437988281,
      "loss": 0.334,
      "rewards/chosen": -0.12181363999843597,
      "rewards/margins": 4.176156044006348,
      "rewards/rejected": -4.297969818115234,
      "step": 1207
    },
    {
      "epoch": 0.32,
      "grad_norm": 27.812297821044922,
      "kl": 0.0,
      "learning_rate": 3.4192619733054176e-07,
      "logps/chosen": -249.6283416748047,
      "logps/rejected": -277.0008239746094,
      "loss": 0.2755,
      "rewards/chosen": -0.5821962356567383,
      "rewards/margins": 4.218933582305908,
      "rewards/rejected": -4.8011298179626465,
      "step": 1208
    },
    {
      "epoch": 0.32,
      "grad_norm": 29.9333553314209,
      "kl": 0.0,
      "learning_rate": 3.4179534153362995e-07,
      "logps/chosen": -163.0187530517578,
      "logps/rejected": -219.123046875,
      "loss": 0.3825,
      "rewards/chosen": -0.026593446731567383,
      "rewards/margins": 3.478567361831665,
      "rewards/rejected": -3.5051608085632324,
      "step": 1209
    },
    {
      "epoch": 0.32,
      "grad_norm": 38.39295196533203,
      "kl": 0.0,
      "learning_rate": 3.4166448573671815e-07,
      "logps/chosen": -208.79698181152344,
      "logps/rejected": -254.31642150878906,
      "loss": 0.1957,
      "rewards/chosen": 0.22365638613700867,
      "rewards/margins": 2.581850290298462,
      "rewards/rejected": -2.358193874359131,
      "step": 1210
    },
    {
      "epoch": 0.32,
      "grad_norm": 38.24787139892578,
      "kl": 0.0,
      "learning_rate": 3.4153362993980635e-07,
      "logps/chosen": -168.13674926757812,
      "logps/rejected": -360.1756896972656,
      "loss": 0.3725,
      "rewards/chosen": -0.9074063301086426,
      "rewards/margins": 3.2058396339416504,
      "rewards/rejected": -4.113245964050293,
      "step": 1211
    },
    {
      "epoch": 0.32,
      "grad_norm": 31.460186004638672,
      "kl": 0.0,
      "learning_rate": 3.4140277414289454e-07,
      "logps/chosen": -224.10098266601562,
      "logps/rejected": -221.63059997558594,
      "loss": 0.3513,
      "rewards/chosen": 0.035461753606796265,
      "rewards/margins": 3.605962038040161,
      "rewards/rejected": -3.570500373840332,
      "step": 1212
    },
    {
      "epoch": 0.32,
      "grad_norm": 35.52521896362305,
      "kl": 0.0,
      "learning_rate": 3.4127191834598274e-07,
      "logps/chosen": -181.10682678222656,
      "logps/rejected": -281.4771728515625,
      "loss": 0.2102,
      "rewards/chosen": 1.0580134391784668,
      "rewards/margins": 5.643892765045166,
      "rewards/rejected": -4.585879325866699,
      "step": 1213
    },
    {
      "epoch": 0.32,
      "grad_norm": 39.60414505004883,
      "kl": 0.0,
      "learning_rate": 3.4114106254907093e-07,
      "logps/chosen": -282.92169189453125,
      "logps/rejected": -266.72705078125,
      "loss": 0.41,
      "rewards/chosen": -0.517210066318512,
      "rewards/margins": 2.233038902282715,
      "rewards/rejected": -2.750248908996582,
      "step": 1214
    },
    {
      "epoch": 0.32,
      "grad_norm": 29.621784210205078,
      "kl": 0.0,
      "learning_rate": 3.410102067521591e-07,
      "logps/chosen": -267.070556640625,
      "logps/rejected": -187.371826171875,
      "loss": 0.3786,
      "rewards/chosen": 0.6396648287773132,
      "rewards/margins": 3.1138055324554443,
      "rewards/rejected": -2.4741406440734863,
      "step": 1215
    },
    {
      "epoch": 0.32,
      "grad_norm": 30.271541595458984,
      "kl": 0.0,
      "learning_rate": 3.4087935095524727e-07,
      "logps/chosen": -183.39422607421875,
      "logps/rejected": -191.06455993652344,
      "loss": 0.2562,
      "rewards/chosen": -0.22444909811019897,
      "rewards/margins": 2.9028217792510986,
      "rewards/rejected": -3.1272709369659424,
      "step": 1216
    },
    {
      "epoch": 0.32,
      "grad_norm": 44.55641174316406,
      "kl": 0.0,
      "learning_rate": 3.4074849515833547e-07,
      "logps/chosen": -303.41845703125,
      "logps/rejected": -240.85781860351562,
      "loss": 0.2687,
      "rewards/chosen": 0.5954338908195496,
      "rewards/margins": 4.038144111633301,
      "rewards/rejected": -3.4427103996276855,
      "step": 1217
    },
    {
      "epoch": 0.32,
      "grad_norm": 32.984859466552734,
      "kl": 0.0,
      "learning_rate": 3.4061763936142367e-07,
      "logps/chosen": -242.97837829589844,
      "logps/rejected": -271.81951904296875,
      "loss": 0.3608,
      "rewards/chosen": -0.5196236371994019,
      "rewards/margins": 4.061900615692139,
      "rewards/rejected": -4.58152437210083,
      "step": 1218
    },
    {
      "epoch": 0.32,
      "grad_norm": 33.90766525268555,
      "kl": 0.0,
      "learning_rate": 3.4048678356451186e-07,
      "logps/chosen": -212.87364196777344,
      "logps/rejected": -280.95367431640625,
      "loss": 0.2355,
      "rewards/chosen": -0.5813936591148376,
      "rewards/margins": 4.398910045623779,
      "rewards/rejected": -4.980303764343262,
      "step": 1219
    },
    {
      "epoch": 0.32,
      "grad_norm": 35.093475341796875,
      "kl": 0.0,
      "learning_rate": 3.4035592776760006e-07,
      "logps/chosen": -157.0143280029297,
      "logps/rejected": -259.2233581542969,
      "loss": 0.3154,
      "rewards/chosen": -0.6709714531898499,
      "rewards/margins": 4.702061653137207,
      "rewards/rejected": -5.373033046722412,
      "step": 1220
    },
    {
      "epoch": 0.32,
      "grad_norm": 32.3795051574707,
      "kl": 0.0,
      "learning_rate": 3.402250719706883e-07,
      "logps/chosen": -224.70108032226562,
      "logps/rejected": -208.52987670898438,
      "loss": 0.2729,
      "rewards/chosen": 0.46259552240371704,
      "rewards/margins": 4.860898494720459,
      "rewards/rejected": -4.398303031921387,
      "step": 1221
    },
    {
      "epoch": 0.32,
      "grad_norm": 37.3427848815918,
      "kl": 0.0,
      "learning_rate": 3.400942161737765e-07,
      "logps/chosen": -329.36810302734375,
      "logps/rejected": -238.87973022460938,
      "loss": 0.323,
      "rewards/chosen": -1.7284150123596191,
      "rewards/margins": 1.4195363521575928,
      "rewards/rejected": -3.147951364517212,
      "step": 1222
    },
    {
      "epoch": 0.32,
      "grad_norm": 32.58262634277344,
      "kl": 0.0,
      "learning_rate": 3.399633603768647e-07,
      "logps/chosen": -324.0147399902344,
      "logps/rejected": -252.42791748046875,
      "loss": 0.3242,
      "rewards/chosen": 0.827644944190979,
      "rewards/margins": 4.047224521636963,
      "rewards/rejected": -3.2195796966552734,
      "step": 1223
    },
    {
      "epoch": 0.32,
      "grad_norm": 41.58140563964844,
      "kl": 0.0,
      "learning_rate": 3.398325045799529e-07,
      "logps/chosen": -160.6015625,
      "logps/rejected": -184.90505981445312,
      "loss": 0.4001,
      "rewards/chosen": -0.4772702157497406,
      "rewards/margins": 2.276743173599243,
      "rewards/rejected": -2.7540132999420166,
      "step": 1224
    },
    {
      "epoch": 0.32,
      "grad_norm": 34.357051849365234,
      "kl": 0.0,
      "learning_rate": 3.397016487830411e-07,
      "logps/chosen": -260.9039001464844,
      "logps/rejected": -277.7361145019531,
      "loss": 0.3566,
      "rewards/chosen": -0.6111627221107483,
      "rewards/margins": 3.346715211868286,
      "rewards/rejected": -3.9578778743743896,
      "step": 1225
    },
    {
      "epoch": 0.32,
      "grad_norm": 29.176429748535156,
      "kl": 0.0,
      "learning_rate": 3.395707929861293e-07,
      "logps/chosen": -180.9840850830078,
      "logps/rejected": -256.6327209472656,
      "loss": 0.2435,
      "rewards/chosen": 1.7163417339324951,
      "rewards/margins": 4.8167266845703125,
      "rewards/rejected": -3.1003851890563965,
      "step": 1226
    },
    {
      "epoch": 0.32,
      "grad_norm": 32.2241096496582,
      "kl": 0.0,
      "learning_rate": 3.394399371892175e-07,
      "logps/chosen": -216.3617706298828,
      "logps/rejected": -303.110595703125,
      "loss": 0.2862,
      "rewards/chosen": 0.3831513524055481,
      "rewards/margins": 5.223682880401611,
      "rewards/rejected": -4.840531349182129,
      "step": 1227
    },
    {
      "epoch": 0.32,
      "grad_norm": 31.990962982177734,
      "kl": 0.0,
      "learning_rate": 3.393090813923057e-07,
      "logps/chosen": -193.42356872558594,
      "logps/rejected": -281.7864685058594,
      "loss": 0.2524,
      "rewards/chosen": 0.4198690950870514,
      "rewards/margins": 3.9765639305114746,
      "rewards/rejected": -3.556694746017456,
      "step": 1228
    },
    {
      "epoch": 0.32,
      "grad_norm": 35.88570785522461,
      "kl": 0.0,
      "learning_rate": 3.391782255953939e-07,
      "logps/chosen": -199.9151611328125,
      "logps/rejected": -377.8648986816406,
      "loss": 0.2915,
      "rewards/chosen": -0.035070642828941345,
      "rewards/margins": 4.737933158874512,
      "rewards/rejected": -4.773003578186035,
      "step": 1229
    },
    {
      "epoch": 0.32,
      "grad_norm": 34.06972122192383,
      "kl": 0.0,
      "learning_rate": 3.39047369798482e-07,
      "logps/chosen": -178.60128784179688,
      "logps/rejected": -202.55992126464844,
      "loss": 0.3516,
      "rewards/chosen": -0.07218974083662033,
      "rewards/margins": 2.193382978439331,
      "rewards/rejected": -2.2655727863311768,
      "step": 1230
    },
    {
      "epoch": 0.32,
      "grad_norm": 25.618955612182617,
      "kl": 0.0,
      "learning_rate": 3.389165140015702e-07,
      "logps/chosen": -183.26873779296875,
      "logps/rejected": -279.2188415527344,
      "loss": 0.2036,
      "rewards/chosen": 0.7737140655517578,
      "rewards/margins": 4.777088642120361,
      "rewards/rejected": -4.0033745765686035,
      "step": 1231
    },
    {
      "epoch": 0.32,
      "grad_norm": 35.26194763183594,
      "kl": 0.0,
      "learning_rate": 3.387856582046584e-07,
      "logps/chosen": -228.4512481689453,
      "logps/rejected": -207.5843048095703,
      "loss": 0.2103,
      "rewards/chosen": -0.6176943778991699,
      "rewards/margins": 2.538363218307495,
      "rewards/rejected": -3.156057596206665,
      "step": 1232
    },
    {
      "epoch": 0.32,
      "grad_norm": 31.332338333129883,
      "kl": 0.0,
      "learning_rate": 3.386548024077466e-07,
      "logps/chosen": -224.06138610839844,
      "logps/rejected": -265.4245910644531,
      "loss": 0.2006,
      "rewards/chosen": 1.4042233228683472,
      "rewards/margins": 6.1257853507995605,
      "rewards/rejected": -4.721561908721924,
      "step": 1233
    },
    {
      "epoch": 0.32,
      "grad_norm": 31.065032958984375,
      "kl": 0.0,
      "learning_rate": 3.3852394661083486e-07,
      "logps/chosen": -166.82833862304688,
      "logps/rejected": -242.60174560546875,
      "loss": 0.2613,
      "rewards/chosen": 1.596419095993042,
      "rewards/margins": 5.903311729431152,
      "rewards/rejected": -4.306892395019531,
      "step": 1234
    },
    {
      "epoch": 0.32,
      "grad_norm": 46.043609619140625,
      "kl": 0.0,
      "learning_rate": 3.3839309081392305e-07,
      "logps/chosen": -263.305419921875,
      "logps/rejected": -230.71295166015625,
      "loss": 0.257,
      "rewards/chosen": 0.45542898774147034,
      "rewards/margins": 3.6104214191436768,
      "rewards/rejected": -3.1549923419952393,
      "step": 1235
    },
    {
      "epoch": 0.32,
      "grad_norm": 23.832841873168945,
      "kl": 0.0,
      "learning_rate": 3.3826223501701125e-07,
      "logps/chosen": -253.24847412109375,
      "logps/rejected": -249.77139282226562,
      "loss": 0.2069,
      "rewards/chosen": 1.0765002965927124,
      "rewards/margins": 5.492335796356201,
      "rewards/rejected": -4.415835380554199,
      "step": 1236
    },
    {
      "epoch": 0.32,
      "grad_norm": 30.16594123840332,
      "kl": 0.0,
      "learning_rate": 3.3813137922009944e-07,
      "logps/chosen": -155.04786682128906,
      "logps/rejected": -249.30087280273438,
      "loss": 0.2036,
      "rewards/chosen": 0.18218417465686798,
      "rewards/margins": 4.265267372131348,
      "rewards/rejected": -4.083083152770996,
      "step": 1237
    },
    {
      "epoch": 0.32,
      "grad_norm": 28.164379119873047,
      "kl": 0.0,
      "learning_rate": 3.3800052342318764e-07,
      "logps/chosen": -170.76309204101562,
      "logps/rejected": -257.8999328613281,
      "loss": 0.2369,
      "rewards/chosen": -0.30757319927215576,
      "rewards/margins": 3.608126163482666,
      "rewards/rejected": -3.9156994819641113,
      "step": 1238
    },
    {
      "epoch": 0.32,
      "grad_norm": 26.276927947998047,
      "kl": 0.0,
      "learning_rate": 3.3786966762627584e-07,
      "logps/chosen": -264.63726806640625,
      "logps/rejected": -238.72987365722656,
      "loss": 0.2766,
      "rewards/chosen": -0.26351839303970337,
      "rewards/margins": 4.711465358734131,
      "rewards/rejected": -4.9749836921691895,
      "step": 1239
    },
    {
      "epoch": 0.32,
      "grad_norm": 37.53226089477539,
      "kl": 0.0,
      "learning_rate": 3.3773881182936403e-07,
      "logps/chosen": -281.6136779785156,
      "logps/rejected": -226.8959503173828,
      "loss": 0.2406,
      "rewards/chosen": 0.10490907728672028,
      "rewards/margins": 3.665008783340454,
      "rewards/rejected": -3.5600996017456055,
      "step": 1240
    },
    {
      "epoch": 0.32,
      "grad_norm": 31.892934799194336,
      "kl": 0.0,
      "learning_rate": 3.3760795603245223e-07,
      "logps/chosen": -195.93685913085938,
      "logps/rejected": -157.26161193847656,
      "loss": 0.2046,
      "rewards/chosen": 1.778195858001709,
      "rewards/margins": 4.448366165161133,
      "rewards/rejected": -2.670170545578003,
      "step": 1241
    },
    {
      "epoch": 0.33,
      "grad_norm": 30.623804092407227,
      "kl": 0.0,
      "learning_rate": 3.374771002355404e-07,
      "logps/chosen": -290.7236328125,
      "logps/rejected": -267.4052734375,
      "loss": 0.1627,
      "rewards/chosen": 1.2010747194290161,
      "rewards/margins": 3.951550006866455,
      "rewards/rejected": -2.7504751682281494,
      "step": 1242
    },
    {
      "epoch": 0.33,
      "grad_norm": 32.45186996459961,
      "kl": 0.0,
      "learning_rate": 3.373462444386286e-07,
      "logps/chosen": -163.9646759033203,
      "logps/rejected": -299.09814453125,
      "loss": 0.3236,
      "rewards/chosen": 0.41362541913986206,
      "rewards/margins": 4.08070182800293,
      "rewards/rejected": -3.667076587677002,
      "step": 1243
    },
    {
      "epoch": 0.33,
      "grad_norm": 30.40380859375,
      "kl": 0.0,
      "learning_rate": 3.372153886417168e-07,
      "logps/chosen": -238.871337890625,
      "logps/rejected": -240.1171417236328,
      "loss": 0.3127,
      "rewards/chosen": -0.03489166498184204,
      "rewards/margins": 2.8372724056243896,
      "rewards/rejected": -2.872164011001587,
      "step": 1244
    },
    {
      "epoch": 0.33,
      "grad_norm": 35.650115966796875,
      "kl": 0.0,
      "learning_rate": 3.37084532844805e-07,
      "logps/chosen": -289.92877197265625,
      "logps/rejected": -228.46633911132812,
      "loss": 0.3788,
      "rewards/chosen": -0.9537517428398132,
      "rewards/margins": 1.8257262706756592,
      "rewards/rejected": -2.779478073120117,
      "step": 1245
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.27054977416992,
      "kl": 0.0,
      "learning_rate": 3.3695367704789316e-07,
      "logps/chosen": -249.22886657714844,
      "logps/rejected": -367.12518310546875,
      "loss": 0.1397,
      "rewards/chosen": -1.49248206615448,
      "rewards/margins": 1.9466301202774048,
      "rewards/rejected": -3.4391121864318848,
      "step": 1246
    },
    {
      "epoch": 0.33,
      "grad_norm": 37.12660598754883,
      "kl": 0.0,
      "learning_rate": 3.368228212509814e-07,
      "logps/chosen": -214.15615844726562,
      "logps/rejected": -298.9302673339844,
      "loss": 0.3329,
      "rewards/chosen": 0.08646465837955475,
      "rewards/margins": 2.5704357624053955,
      "rewards/rejected": -2.483971118927002,
      "step": 1247
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.774932861328125,
      "kl": 0.0,
      "learning_rate": 3.366919654540696e-07,
      "logps/chosen": -171.0416259765625,
      "logps/rejected": -205.38577270507812,
      "loss": 0.3242,
      "rewards/chosen": -0.309177964925766,
      "rewards/margins": 3.087322235107422,
      "rewards/rejected": -3.3965001106262207,
      "step": 1248
    },
    {
      "epoch": 0.33,
      "grad_norm": 28.201087951660156,
      "kl": 0.0,
      "learning_rate": 3.365611096571578e-07,
      "logps/chosen": -197.46839904785156,
      "logps/rejected": -156.85015869140625,
      "loss": 0.3639,
      "rewards/chosen": -0.526721715927124,
      "rewards/margins": 2.991612672805786,
      "rewards/rejected": -3.51833438873291,
      "step": 1249
    },
    {
      "epoch": 0.33,
      "grad_norm": 40.35578918457031,
      "kl": 0.0,
      "learning_rate": 3.36430253860246e-07,
      "logps/chosen": -216.8326416015625,
      "logps/rejected": -204.6576385498047,
      "loss": 0.2314,
      "rewards/chosen": 0.758324921131134,
      "rewards/margins": 3.4638853073120117,
      "rewards/rejected": -2.7055604457855225,
      "step": 1250
    },
    {
      "epoch": 0.33,
      "grad_norm": 49.18851089477539,
      "kl": 0.0,
      "learning_rate": 3.362993980633342e-07,
      "logps/chosen": -201.96607971191406,
      "logps/rejected": -231.14260864257812,
      "loss": 0.4535,
      "rewards/chosen": -0.826595664024353,
      "rewards/margins": 2.144320487976074,
      "rewards/rejected": -2.9709160327911377,
      "step": 1251
    },
    {
      "epoch": 0.33,
      "grad_norm": 28.66099739074707,
      "kl": 0.0,
      "learning_rate": 3.361685422664224e-07,
      "logps/chosen": -226.81207275390625,
      "logps/rejected": -262.93695068359375,
      "loss": 0.2788,
      "rewards/chosen": -0.13777820765972137,
      "rewards/margins": 3.4477951526641846,
      "rewards/rejected": -3.585573434829712,
      "step": 1252
    },
    {
      "epoch": 0.33,
      "grad_norm": 31.51655387878418,
      "kl": 0.0,
      "learning_rate": 3.360376864695106e-07,
      "logps/chosen": -211.86929321289062,
      "logps/rejected": -180.88951110839844,
      "loss": 0.3393,
      "rewards/chosen": 1.378541111946106,
      "rewards/margins": 3.041508674621582,
      "rewards/rejected": -1.662967562675476,
      "step": 1253
    },
    {
      "epoch": 0.33,
      "grad_norm": 39.14763641357422,
      "kl": 0.0,
      "learning_rate": 3.359068306725988e-07,
      "logps/chosen": -187.16439819335938,
      "logps/rejected": -303.744873046875,
      "loss": 0.273,
      "rewards/chosen": 0.921317994594574,
      "rewards/margins": 3.8566207885742188,
      "rewards/rejected": -2.935302734375,
      "step": 1254
    },
    {
      "epoch": 0.33,
      "grad_norm": 35.68745040893555,
      "kl": 0.0,
      "learning_rate": 3.35775974875687e-07,
      "logps/chosen": -155.5998992919922,
      "logps/rejected": -257.8075256347656,
      "loss": 0.2658,
      "rewards/chosen": 0.012111015617847443,
      "rewards/margins": 5.436872482299805,
      "rewards/rejected": -5.4247612953186035,
      "step": 1255
    },
    {
      "epoch": 0.33,
      "grad_norm": 29.839157104492188,
      "kl": 0.0,
      "learning_rate": 3.3564511907877517e-07,
      "logps/chosen": -293.30322265625,
      "logps/rejected": -283.6425476074219,
      "loss": 0.254,
      "rewards/chosen": 1.0832270383834839,
      "rewards/margins": 3.56386137008667,
      "rewards/rejected": -2.4806342124938965,
      "step": 1256
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.9878044128418,
      "kl": 0.0,
      "learning_rate": 3.3551426328186337e-07,
      "logps/chosen": -195.77926635742188,
      "logps/rejected": -237.57620239257812,
      "loss": 0.2727,
      "rewards/chosen": 0.3099658787250519,
      "rewards/margins": 3.34023118019104,
      "rewards/rejected": -3.0302653312683105,
      "step": 1257
    },
    {
      "epoch": 0.33,
      "grad_norm": 40.25716018676758,
      "kl": 0.0,
      "learning_rate": 3.3538340748495156e-07,
      "logps/chosen": -231.48263549804688,
      "logps/rejected": -191.4035186767578,
      "loss": 0.354,
      "rewards/chosen": 0.30240654945373535,
      "rewards/margins": 3.148895502090454,
      "rewards/rejected": -2.8464889526367188,
      "step": 1258
    },
    {
      "epoch": 0.33,
      "grad_norm": 47.265533447265625,
      "kl": 0.0,
      "learning_rate": 3.352525516880398e-07,
      "logps/chosen": -204.80039978027344,
      "logps/rejected": -298.1878967285156,
      "loss": 0.3726,
      "rewards/chosen": -0.2800760865211487,
      "rewards/margins": 2.6696648597717285,
      "rewards/rejected": -2.9497408866882324,
      "step": 1259
    },
    {
      "epoch": 0.33,
      "grad_norm": 21.470439910888672,
      "kl": 0.0,
      "learning_rate": 3.35121695891128e-07,
      "logps/chosen": -235.9132843017578,
      "logps/rejected": -189.3260955810547,
      "loss": 0.1427,
      "rewards/chosen": 2.9290521144866943,
      "rewards/margins": 5.725638389587402,
      "rewards/rejected": -2.796586513519287,
      "step": 1260
    },
    {
      "epoch": 0.33,
      "grad_norm": 30.650962829589844,
      "kl": 0.0,
      "learning_rate": 3.3499084009421615e-07,
      "logps/chosen": -199.38571166992188,
      "logps/rejected": -224.87171936035156,
      "loss": 0.2969,
      "rewards/chosen": 0.1919896900653839,
      "rewards/margins": 2.7235448360443115,
      "rewards/rejected": -2.53155517578125,
      "step": 1261
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.12488555908203,
      "kl": 0.0,
      "learning_rate": 3.3485998429730435e-07,
      "logps/chosen": -232.99105834960938,
      "logps/rejected": -231.17054748535156,
      "loss": 0.2222,
      "rewards/chosen": 1.200358271598816,
      "rewards/margins": 5.405186176300049,
      "rewards/rejected": -4.204827785491943,
      "step": 1262
    },
    {
      "epoch": 0.33,
      "grad_norm": 27.685657501220703,
      "kl": 0.0,
      "learning_rate": 3.3472912850039254e-07,
      "logps/chosen": -186.10411071777344,
      "logps/rejected": -265.6203918457031,
      "loss": 0.2324,
      "rewards/chosen": -0.5338296890258789,
      "rewards/margins": 4.319558620452881,
      "rewards/rejected": -4.85338830947876,
      "step": 1263
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.48194122314453,
      "kl": 0.0,
      "learning_rate": 3.3459827270348074e-07,
      "logps/chosen": -229.52662658691406,
      "logps/rejected": -180.10003662109375,
      "loss": 0.3356,
      "rewards/chosen": -0.27033787965774536,
      "rewards/margins": 2.364189386367798,
      "rewards/rejected": -2.6345272064208984,
      "step": 1264
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.939762115478516,
      "kl": 0.0,
      "learning_rate": 3.3446741690656893e-07,
      "logps/chosen": -208.50833129882812,
      "logps/rejected": -187.72377014160156,
      "loss": 0.3683,
      "rewards/chosen": -0.9061264395713806,
      "rewards/margins": 1.2528698444366455,
      "rewards/rejected": -2.158996343612671,
      "step": 1265
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.020294189453125,
      "kl": 0.0,
      "learning_rate": 3.3433656110965713e-07,
      "logps/chosen": -325.7091064453125,
      "logps/rejected": -205.07147216796875,
      "loss": 0.3448,
      "rewards/chosen": -0.6361463069915771,
      "rewards/margins": 2.527127504348755,
      "rewards/rejected": -3.163273811340332,
      "step": 1266
    },
    {
      "epoch": 0.33,
      "grad_norm": 34.751121520996094,
      "kl": 0.0,
      "learning_rate": 3.3420570531274533e-07,
      "logps/chosen": -207.54547119140625,
      "logps/rejected": -274.9513854980469,
      "loss": 0.3026,
      "rewards/chosen": 1.1950047016143799,
      "rewards/margins": 5.298422813415527,
      "rewards/rejected": -4.103418350219727,
      "step": 1267
    },
    {
      "epoch": 0.33,
      "grad_norm": 29.884418487548828,
      "kl": 0.0,
      "learning_rate": 3.340748495158335e-07,
      "logps/chosen": -208.39561462402344,
      "logps/rejected": -266.50848388671875,
      "loss": 0.2547,
      "rewards/chosen": -0.3481028974056244,
      "rewards/margins": 3.2418465614318848,
      "rewards/rejected": -3.589949369430542,
      "step": 1268
    },
    {
      "epoch": 0.33,
      "grad_norm": 31.327274322509766,
      "kl": 0.0,
      "learning_rate": 3.339439937189217e-07,
      "logps/chosen": -122.44271087646484,
      "logps/rejected": -207.86700439453125,
      "loss": 0.2258,
      "rewards/chosen": -0.5743510127067566,
      "rewards/margins": 2.366105318069458,
      "rewards/rejected": -2.9404563903808594,
      "step": 1269
    },
    {
      "epoch": 0.33,
      "grad_norm": 32.57371520996094,
      "kl": 0.0,
      "learning_rate": 3.338131379220099e-07,
      "logps/chosen": -242.4744873046875,
      "logps/rejected": -311.08563232421875,
      "loss": 0.4399,
      "rewards/chosen": 0.11520695686340332,
      "rewards/margins": 2.5848286151885986,
      "rewards/rejected": -2.4696216583251953,
      "step": 1270
    },
    {
      "epoch": 0.33,
      "grad_norm": 31.31317901611328,
      "kl": 0.0,
      "learning_rate": 3.336822821250981e-07,
      "logps/chosen": -240.73184204101562,
      "logps/rejected": -262.8088073730469,
      "loss": 0.26,
      "rewards/chosen": 0.389142781496048,
      "rewards/margins": 2.956101417541504,
      "rewards/rejected": -2.5669586658477783,
      "step": 1271
    },
    {
      "epoch": 0.33,
      "grad_norm": 31.12925910949707,
      "kl": 0.0,
      "learning_rate": 3.3355142632818636e-07,
      "logps/chosen": -196.77252197265625,
      "logps/rejected": -157.89349365234375,
      "loss": 0.2793,
      "rewards/chosen": 1.9008492231369019,
      "rewards/margins": 3.7623605728149414,
      "rewards/rejected": -1.8615113496780396,
      "step": 1272
    },
    {
      "epoch": 0.33,
      "grad_norm": 30.95001983642578,
      "kl": 0.0,
      "learning_rate": 3.3342057053127456e-07,
      "logps/chosen": -212.56983947753906,
      "logps/rejected": -295.71539306640625,
      "loss": 0.2432,
      "rewards/chosen": 0.1810222566127777,
      "rewards/margins": 5.87899112701416,
      "rewards/rejected": -5.69796895980835,
      "step": 1273
    },
    {
      "epoch": 0.33,
      "grad_norm": 28.166805267333984,
      "kl": 0.0,
      "learning_rate": 3.3328971473436275e-07,
      "logps/chosen": -232.2152862548828,
      "logps/rejected": -165.13128662109375,
      "loss": 0.2392,
      "rewards/chosen": 2.376075506210327,
      "rewards/margins": 5.288122177124023,
      "rewards/rejected": -2.912046432495117,
      "step": 1274
    },
    {
      "epoch": 0.33,
      "grad_norm": 25.603988647460938,
      "kl": 0.0,
      "learning_rate": 3.3315885893745095e-07,
      "logps/chosen": -156.23667907714844,
      "logps/rejected": -237.1687469482422,
      "loss": 0.2811,
      "rewards/chosen": -1.1099730730056763,
      "rewards/margins": 3.728555679321289,
      "rewards/rejected": -4.838528633117676,
      "step": 1275
    },
    {
      "epoch": 0.33,
      "grad_norm": 33.69292068481445,
      "kl": 0.0,
      "learning_rate": 3.3302800314053914e-07,
      "logps/chosen": -239.6951904296875,
      "logps/rejected": -345.7257995605469,
      "loss": 0.2289,
      "rewards/chosen": 0.387544721364975,
      "rewards/margins": 4.686365604400635,
      "rewards/rejected": -4.298820972442627,
      "step": 1276
    },
    {
      "epoch": 0.33,
      "grad_norm": 41.52256774902344,
      "kl": 0.0,
      "learning_rate": 3.328971473436273e-07,
      "logps/chosen": -274.7806701660156,
      "logps/rejected": -259.1031494140625,
      "loss": 0.2678,
      "rewards/chosen": 3.6112773418426514,
      "rewards/margins": 6.836021423339844,
      "rewards/rejected": -3.2247438430786133,
      "step": 1277
    },
    {
      "epoch": 0.33,
      "grad_norm": 36.408992767333984,
      "kl": 0.0,
      "learning_rate": 3.327662915467155e-07,
      "logps/chosen": -235.43978881835938,
      "logps/rejected": -193.00564575195312,
      "loss": 0.3697,
      "rewards/chosen": -0.9631791710853577,
      "rewards/margins": 1.1764774322509766,
      "rewards/rejected": -2.1396565437316895,
      "step": 1278
    },
    {
      "epoch": 0.33,
      "grad_norm": 30.44072151184082,
      "kl": 0.0,
      "learning_rate": 3.326354357498037e-07,
      "logps/chosen": -218.36167907714844,
      "logps/rejected": -300.410400390625,
      "loss": 0.3474,
      "rewards/chosen": -0.4119375944137573,
      "rewards/margins": 2.955416679382324,
      "rewards/rejected": -3.367354393005371,
      "step": 1279
    },
    {
      "epoch": 0.33,
      "grad_norm": 32.16078567504883,
      "kl": 0.0,
      "learning_rate": 3.325045799528919e-07,
      "logps/chosen": -241.3597412109375,
      "logps/rejected": -221.12881469726562,
      "loss": 0.2101,
      "rewards/chosen": -0.3120342791080475,
      "rewards/margins": 3.0008411407470703,
      "rewards/rejected": -3.312875509262085,
      "step": 1280
    },
    {
      "epoch": 0.34,
      "grad_norm": 27.598134994506836,
      "kl": 0.0,
      "learning_rate": 3.3237372415598007e-07,
      "logps/chosen": -299.2360534667969,
      "logps/rejected": -197.92079162597656,
      "loss": 0.2723,
      "rewards/chosen": -2.645521879196167,
      "rewards/margins": 2.080192804336548,
      "rewards/rejected": -4.725714683532715,
      "step": 1281
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.310110092163086,
      "kl": 0.0,
      "learning_rate": 3.3224286835906827e-07,
      "logps/chosen": -248.118408203125,
      "logps/rejected": -244.019775390625,
      "loss": 0.2386,
      "rewards/chosen": 0.5699302554130554,
      "rewards/margins": 4.555118560791016,
      "rewards/rejected": -3.9851884841918945,
      "step": 1282
    },
    {
      "epoch": 0.34,
      "grad_norm": 32.69074630737305,
      "kl": 0.0,
      "learning_rate": 3.3211201256215646e-07,
      "logps/chosen": -220.3896484375,
      "logps/rejected": -378.6432800292969,
      "loss": 0.3685,
      "rewards/chosen": -0.45064517855644226,
      "rewards/margins": 4.8300395011901855,
      "rewards/rejected": -5.280684471130371,
      "step": 1283
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.25950813293457,
      "kl": 0.0,
      "learning_rate": 3.3198115676524466e-07,
      "logps/chosen": -211.71487426757812,
      "logps/rejected": -291.50787353515625,
      "loss": 0.3666,
      "rewards/chosen": -0.06342494487762451,
      "rewards/margins": 3.4773359298706055,
      "rewards/rejected": -3.5407607555389404,
      "step": 1284
    },
    {
      "epoch": 0.34,
      "grad_norm": 31.38227653503418,
      "kl": 0.0,
      "learning_rate": 3.318503009683329e-07,
      "logps/chosen": -215.906005859375,
      "logps/rejected": -224.23704528808594,
      "loss": 0.3256,
      "rewards/chosen": 0.7493216395378113,
      "rewards/margins": 4.575001239776611,
      "rewards/rejected": -3.8256797790527344,
      "step": 1285
    },
    {
      "epoch": 0.34,
      "grad_norm": 34.842342376708984,
      "kl": 0.0,
      "learning_rate": 3.317194451714211e-07,
      "logps/chosen": -220.79994201660156,
      "logps/rejected": -257.6636962890625,
      "loss": 0.3224,
      "rewards/chosen": 0.5526933670043945,
      "rewards/margins": 4.311444282531738,
      "rewards/rejected": -3.7587506771087646,
      "step": 1286
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.686933517456055,
      "kl": 0.0,
      "learning_rate": 3.315885893745093e-07,
      "logps/chosen": -167.66883850097656,
      "logps/rejected": -153.66209411621094,
      "loss": 0.3503,
      "rewards/chosen": -0.23966237902641296,
      "rewards/margins": 1.8854963779449463,
      "rewards/rejected": -2.1251587867736816,
      "step": 1287
    },
    {
      "epoch": 0.34,
      "grad_norm": 31.066295623779297,
      "kl": 0.0,
      "learning_rate": 3.314577335775975e-07,
      "logps/chosen": -240.11083984375,
      "logps/rejected": -255.7864532470703,
      "loss": 0.3307,
      "rewards/chosen": -0.976582944393158,
      "rewards/margins": 1.937021017074585,
      "rewards/rejected": -2.9136040210723877,
      "step": 1288
    },
    {
      "epoch": 0.34,
      "grad_norm": 35.197872161865234,
      "kl": 0.0,
      "learning_rate": 3.313268777806857e-07,
      "logps/chosen": -232.86688232421875,
      "logps/rejected": -194.73477172851562,
      "loss": 0.2723,
      "rewards/chosen": 2.010469913482666,
      "rewards/margins": 4.8714823722839355,
      "rewards/rejected": -2.8610124588012695,
      "step": 1289
    },
    {
      "epoch": 0.34,
      "grad_norm": 31.749187469482422,
      "kl": 0.0,
      "learning_rate": 3.311960219837739e-07,
      "logps/chosen": -243.93431091308594,
      "logps/rejected": -220.49685668945312,
      "loss": 0.3194,
      "rewards/chosen": -1.9240546226501465,
      "rewards/margins": 1.2547504901885986,
      "rewards/rejected": -3.178805112838745,
      "step": 1290
    },
    {
      "epoch": 0.34,
      "grad_norm": 34.85056686401367,
      "kl": 0.0,
      "learning_rate": 3.310651661868621e-07,
      "logps/chosen": -213.20204162597656,
      "logps/rejected": -278.9600830078125,
      "loss": 0.3964,
      "rewards/chosen": -0.25566422939300537,
      "rewards/margins": 3.075680732727051,
      "rewards/rejected": -3.3313448429107666,
      "step": 1291
    },
    {
      "epoch": 0.34,
      "grad_norm": 24.75638198852539,
      "kl": 0.0,
      "learning_rate": 3.3093431038995023e-07,
      "logps/chosen": -206.9791717529297,
      "logps/rejected": -248.0471649169922,
      "loss": 0.4092,
      "rewards/chosen": -0.372029185295105,
      "rewards/margins": 3.3591151237487793,
      "rewards/rejected": -3.7311441898345947,
      "step": 1292
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.22563171386719,
      "kl": 0.0,
      "learning_rate": 3.308034545930384e-07,
      "logps/chosen": -156.0008544921875,
      "logps/rejected": -316.6332092285156,
      "loss": 0.3212,
      "rewards/chosen": -0.07225000858306885,
      "rewards/margins": 4.35306453704834,
      "rewards/rejected": -4.425314426422119,
      "step": 1293
    },
    {
      "epoch": 0.34,
      "grad_norm": 34.623626708984375,
      "kl": 0.0,
      "learning_rate": 3.306725987961266e-07,
      "logps/chosen": -279.859130859375,
      "logps/rejected": -233.71697998046875,
      "loss": 0.3227,
      "rewards/chosen": 1.6375625133514404,
      "rewards/margins": 4.174957752227783,
      "rewards/rejected": -2.5373952388763428,
      "step": 1294
    },
    {
      "epoch": 0.34,
      "grad_norm": 45.627933502197266,
      "kl": 0.0,
      "learning_rate": 3.305417429992148e-07,
      "logps/chosen": -160.09315490722656,
      "logps/rejected": -212.97886657714844,
      "loss": 0.3397,
      "rewards/chosen": -0.6607236862182617,
      "rewards/margins": 3.3477578163146973,
      "rewards/rejected": -4.008481502532959,
      "step": 1295
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.33060073852539,
      "kl": 0.0,
      "learning_rate": 3.30410887202303e-07,
      "logps/chosen": -139.9228973388672,
      "logps/rejected": -234.82342529296875,
      "loss": 0.2886,
      "rewards/chosen": -0.6339284777641296,
      "rewards/margins": 2.746208429336548,
      "rewards/rejected": -3.3801369667053223,
      "step": 1296
    },
    {
      "epoch": 0.34,
      "grad_norm": 38.89460754394531,
      "kl": 0.0,
      "learning_rate": 3.302800314053912e-07,
      "logps/chosen": -195.2257843017578,
      "logps/rejected": -192.89590454101562,
      "loss": 0.4372,
      "rewards/chosen": -0.28713446855545044,
      "rewards/margins": 2.1057236194610596,
      "rewards/rejected": -2.3928580284118652,
      "step": 1297
    },
    {
      "epoch": 0.34,
      "grad_norm": 36.500423431396484,
      "kl": 0.0,
      "learning_rate": 3.3014917560847946e-07,
      "logps/chosen": -210.28921508789062,
      "logps/rejected": -235.51065063476562,
      "loss": 0.3719,
      "rewards/chosen": -1.0210708379745483,
      "rewards/margins": 1.5469309091567993,
      "rewards/rejected": -2.5680017471313477,
      "step": 1298
    },
    {
      "epoch": 0.34,
      "grad_norm": 22.017080307006836,
      "kl": 0.0,
      "learning_rate": 3.3001831981156765e-07,
      "logps/chosen": -259.6287841796875,
      "logps/rejected": -318.98065185546875,
      "loss": 0.3813,
      "rewards/chosen": -0.38963747024536133,
      "rewards/margins": 3.8554883003234863,
      "rewards/rejected": -4.245125770568848,
      "step": 1299
    },
    {
      "epoch": 0.34,
      "grad_norm": 39.884857177734375,
      "kl": 0.0,
      "learning_rate": 3.2988746401465585e-07,
      "logps/chosen": -259.2381896972656,
      "logps/rejected": -260.0971374511719,
      "loss": 0.2882,
      "rewards/chosen": -0.054092198610305786,
      "rewards/margins": 2.6869301795959473,
      "rewards/rejected": -2.7410223484039307,
      "step": 1300
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.03709411621094,
      "kl": 0.0,
      "learning_rate": 3.2975660821774405e-07,
      "logps/chosen": -166.88330078125,
      "logps/rejected": -223.3334503173828,
      "loss": 0.2703,
      "rewards/chosen": 0.3510906994342804,
      "rewards/margins": 4.768466949462891,
      "rewards/rejected": -4.4173760414123535,
      "step": 1301
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.759967803955078,
      "kl": 0.0,
      "learning_rate": 3.2962575242083224e-07,
      "logps/chosen": -252.9071044921875,
      "logps/rejected": -229.6643829345703,
      "loss": 0.2931,
      "rewards/chosen": 1.2659831047058105,
      "rewards/margins": 3.1950631141662598,
      "rewards/rejected": -1.9290800094604492,
      "step": 1302
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.443843841552734,
      "kl": 0.0,
      "learning_rate": 3.2949489662392044e-07,
      "logps/chosen": -242.87637329101562,
      "logps/rejected": -233.8732147216797,
      "loss": 0.1907,
      "rewards/chosen": 0.24500472843647003,
      "rewards/margins": 3.0496490001678467,
      "rewards/rejected": -2.8046443462371826,
      "step": 1303
    },
    {
      "epoch": 0.34,
      "grad_norm": 35.365848541259766,
      "kl": 0.0,
      "learning_rate": 3.2936404082700864e-07,
      "logps/chosen": -231.86984252929688,
      "logps/rejected": -308.7398681640625,
      "loss": 0.2432,
      "rewards/chosen": 0.4850401282310486,
      "rewards/margins": 4.330360412597656,
      "rewards/rejected": -3.845320224761963,
      "step": 1304
    },
    {
      "epoch": 0.34,
      "grad_norm": 37.317588806152344,
      "kl": 0.0,
      "learning_rate": 3.2923318503009683e-07,
      "logps/chosen": -150.95700073242188,
      "logps/rejected": -274.0843505859375,
      "loss": 0.3491,
      "rewards/chosen": 0.7635312676429749,
      "rewards/margins": 3.192793846130371,
      "rewards/rejected": -2.429262638092041,
      "step": 1305
    },
    {
      "epoch": 0.34,
      "grad_norm": 48.94740295410156,
      "kl": 0.0,
      "learning_rate": 3.2910232923318503e-07,
      "logps/chosen": -346.37908935546875,
      "logps/rejected": -209.9012908935547,
      "loss": 0.2711,
      "rewards/chosen": 2.334627628326416,
      "rewards/margins": 6.195981025695801,
      "rewards/rejected": -3.8613533973693848,
      "step": 1306
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.677465438842773,
      "kl": 0.0,
      "learning_rate": 3.289714734362732e-07,
      "logps/chosen": -222.22210693359375,
      "logps/rejected": -294.2380676269531,
      "loss": 0.2369,
      "rewards/chosen": 0.19016534090042114,
      "rewards/margins": 3.609548807144165,
      "rewards/rejected": -3.4193835258483887,
      "step": 1307
    },
    {
      "epoch": 0.34,
      "grad_norm": 35.83417892456055,
      "kl": 0.0,
      "learning_rate": 3.2884061763936137e-07,
      "logps/chosen": -196.81204223632812,
      "logps/rejected": -219.13180541992188,
      "loss": 0.3425,
      "rewards/chosen": 0.48529767990112305,
      "rewards/margins": 3.888932704925537,
      "rewards/rejected": -3.403635025024414,
      "step": 1308
    },
    {
      "epoch": 0.34,
      "grad_norm": 29.441770553588867,
      "kl": 0.0,
      "learning_rate": 3.2870976184244956e-07,
      "logps/chosen": -184.06394958496094,
      "logps/rejected": -191.65087890625,
      "loss": 0.313,
      "rewards/chosen": -1.2080618143081665,
      "rewards/margins": 1.6941922903060913,
      "rewards/rejected": -2.902254104614258,
      "step": 1309
    },
    {
      "epoch": 0.34,
      "grad_norm": 41.78122329711914,
      "kl": 0.0,
      "learning_rate": 3.2857890604553776e-07,
      "logps/chosen": -259.63751220703125,
      "logps/rejected": -190.84457397460938,
      "loss": 0.3762,
      "rewards/chosen": 0.5477985143661499,
      "rewards/margins": 3.737063407897949,
      "rewards/rejected": -3.1892647743225098,
      "step": 1310
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.553565979003906,
      "kl": 0.0,
      "learning_rate": 3.28448050248626e-07,
      "logps/chosen": -198.94801330566406,
      "logps/rejected": -414.20306396484375,
      "loss": 0.347,
      "rewards/chosen": -0.033007245510816574,
      "rewards/margins": 3.7820427417755127,
      "rewards/rejected": -3.815049886703491,
      "step": 1311
    },
    {
      "epoch": 0.34,
      "grad_norm": 43.8389778137207,
      "kl": 0.0,
      "learning_rate": 3.283171944517142e-07,
      "logps/chosen": -165.3663330078125,
      "logps/rejected": -285.16845703125,
      "loss": 0.4081,
      "rewards/chosen": -1.201613187789917,
      "rewards/margins": 1.7473170757293701,
      "rewards/rejected": -2.948930263519287,
      "step": 1312
    },
    {
      "epoch": 0.34,
      "grad_norm": 30.555959701538086,
      "kl": 0.0,
      "learning_rate": 3.281863386548024e-07,
      "logps/chosen": -217.09132385253906,
      "logps/rejected": -234.91665649414062,
      "loss": 0.3565,
      "rewards/chosen": 1.054399013519287,
      "rewards/margins": 4.382190704345703,
      "rewards/rejected": -3.327791452407837,
      "step": 1313
    },
    {
      "epoch": 0.34,
      "grad_norm": 33.22416687011719,
      "kl": 0.0,
      "learning_rate": 3.280554828578906e-07,
      "logps/chosen": -217.23284912109375,
      "logps/rejected": -216.6754150390625,
      "loss": 0.3091,
      "rewards/chosen": 0.3072512447834015,
      "rewards/margins": 1.8591370582580566,
      "rewards/rejected": -1.5518858432769775,
      "step": 1314
    },
    {
      "epoch": 0.34,
      "grad_norm": 29.256071090698242,
      "kl": 0.0,
      "learning_rate": 3.279246270609788e-07,
      "logps/chosen": -211.4664306640625,
      "logps/rejected": -235.34336853027344,
      "loss": 0.2241,
      "rewards/chosen": 1.3942919969558716,
      "rewards/margins": 4.8556413650512695,
      "rewards/rejected": -3.4613492488861084,
      "step": 1315
    },
    {
      "epoch": 0.34,
      "grad_norm": 45.10913848876953,
      "kl": 0.0,
      "learning_rate": 3.27793771264067e-07,
      "logps/chosen": -262.1194152832031,
      "logps/rejected": -236.89015197753906,
      "loss": 0.4242,
      "rewards/chosen": -0.6220903396606445,
      "rewards/margins": 1.0707972049713135,
      "rewards/rejected": -1.692887544631958,
      "step": 1316
    },
    {
      "epoch": 0.34,
      "grad_norm": 43.61226272583008,
      "kl": 0.0,
      "learning_rate": 3.276629154671552e-07,
      "logps/chosen": -205.19381713867188,
      "logps/rejected": -300.3736267089844,
      "loss": 0.3916,
      "rewards/chosen": -0.4625236392021179,
      "rewards/margins": 2.3358194828033447,
      "rewards/rejected": -2.7983431816101074,
      "step": 1317
    },
    {
      "epoch": 0.34,
      "grad_norm": 35.57749557495117,
      "kl": 0.0,
      "learning_rate": 3.275320596702434e-07,
      "logps/chosen": -215.95855712890625,
      "logps/rejected": -211.11325073242188,
      "loss": 0.2919,
      "rewards/chosen": -0.27239271998405457,
      "rewards/margins": 3.561589002609253,
      "rewards/rejected": -3.83398175239563,
      "step": 1318
    },
    {
      "epoch": 0.35,
      "grad_norm": 30.0214900970459,
      "kl": 0.0,
      "learning_rate": 3.274012038733316e-07,
      "logps/chosen": -162.21311950683594,
      "logps/rejected": -231.0467987060547,
      "loss": 0.2538,
      "rewards/chosen": -0.42144662141799927,
      "rewards/margins": 3.6177022457122803,
      "rewards/rejected": -4.039148807525635,
      "step": 1319
    },
    {
      "epoch": 0.35,
      "grad_norm": 35.25520706176758,
      "kl": 0.0,
      "learning_rate": 3.2727034807641977e-07,
      "logps/chosen": -214.12205505371094,
      "logps/rejected": -236.38314819335938,
      "loss": 0.4485,
      "rewards/chosen": -1.0461833477020264,
      "rewards/margins": 4.4344940185546875,
      "rewards/rejected": -5.480677604675293,
      "step": 1320
    },
    {
      "epoch": 0.35,
      "grad_norm": 27.39512825012207,
      "kl": 0.0,
      "learning_rate": 3.2713949227950797e-07,
      "logps/chosen": -230.88999938964844,
      "logps/rejected": -208.17617797851562,
      "loss": 0.2967,
      "rewards/chosen": -0.46113088726997375,
      "rewards/margins": 3.70143723487854,
      "rewards/rejected": -4.162568092346191,
      "step": 1321
    },
    {
      "epoch": 0.35,
      "grad_norm": 49.641719818115234,
      "kl": 0.0,
      "learning_rate": 3.2700863648259616e-07,
      "logps/chosen": -278.8284912109375,
      "logps/rejected": -205.60911560058594,
      "loss": 0.3747,
      "rewards/chosen": -0.01196346990764141,
      "rewards/margins": 2.739752769470215,
      "rewards/rejected": -2.751716136932373,
      "step": 1322
    },
    {
      "epoch": 0.35,
      "grad_norm": 33.947898864746094,
      "kl": 0.0,
      "learning_rate": 3.2687778068568436e-07,
      "logps/chosen": -194.21571350097656,
      "logps/rejected": -252.41253662109375,
      "loss": 0.2927,
      "rewards/chosen": -0.2291528880596161,
      "rewards/margins": 3.9392712116241455,
      "rewards/rejected": -4.168424129486084,
      "step": 1323
    },
    {
      "epoch": 0.35,
      "grad_norm": 38.89460372924805,
      "kl": 0.0,
      "learning_rate": 3.2674692488877256e-07,
      "logps/chosen": -178.58995056152344,
      "logps/rejected": -286.2423400878906,
      "loss": 0.4326,
      "rewards/chosen": -0.6443737745285034,
      "rewards/margins": 2.9597129821777344,
      "rewards/rejected": -3.6040866374969482,
      "step": 1324
    },
    {
      "epoch": 0.35,
      "grad_norm": 28.817676544189453,
      "kl": 0.0,
      "learning_rate": 3.2661606909186075e-07,
      "logps/chosen": -181.80223083496094,
      "logps/rejected": -182.43310546875,
      "loss": 0.3204,
      "rewards/chosen": -0.6919834613800049,
      "rewards/margins": 2.9899115562438965,
      "rewards/rejected": -3.6818950176239014,
      "step": 1325
    },
    {
      "epoch": 0.35,
      "grad_norm": 37.1805534362793,
      "kl": 0.0,
      "learning_rate": 3.2648521329494895e-07,
      "logps/chosen": -130.12796020507812,
      "logps/rejected": -197.9749755859375,
      "loss": 0.4522,
      "rewards/chosen": -0.9502408504486084,
      "rewards/margins": 0.5358531475067139,
      "rewards/rejected": -1.4860939979553223,
      "step": 1326
    },
    {
      "epoch": 0.35,
      "grad_norm": 42.603267669677734,
      "kl": 0.0,
      "learning_rate": 3.2635435749803715e-07,
      "logps/chosen": -154.728271484375,
      "logps/rejected": -188.37063598632812,
      "loss": 0.3996,
      "rewards/chosen": -0.20417237281799316,
      "rewards/margins": 1.0332392454147339,
      "rewards/rejected": -1.237411618232727,
      "step": 1327
    },
    {
      "epoch": 0.35,
      "grad_norm": 23.203495025634766,
      "kl": 0.0,
      "learning_rate": 3.2622350170112534e-07,
      "logps/chosen": -237.64964294433594,
      "logps/rejected": -216.24838256835938,
      "loss": 0.3329,
      "rewards/chosen": -0.4951002895832062,
      "rewards/margins": 2.8161814212799072,
      "rewards/rejected": -3.311281681060791,
      "step": 1328
    },
    {
      "epoch": 0.35,
      "grad_norm": 32.98259353637695,
      "kl": 0.0,
      "learning_rate": 3.2609264590421354e-07,
      "logps/chosen": -287.97393798828125,
      "logps/rejected": -265.22216796875,
      "loss": 0.2701,
      "rewards/chosen": 1.7656995058059692,
      "rewards/margins": 5.856935977935791,
      "rewards/rejected": -4.091236591339111,
      "step": 1329
    },
    {
      "epoch": 0.35,
      "grad_norm": 31.110828399658203,
      "kl": 0.0,
      "learning_rate": 3.2596179010730173e-07,
      "logps/chosen": -168.94876098632812,
      "logps/rejected": -231.80029296875,
      "loss": 0.2973,
      "rewards/chosen": 0.5853991508483887,
      "rewards/margins": 4.364181041717529,
      "rewards/rejected": -3.7787818908691406,
      "step": 1330
    },
    {
      "epoch": 0.35,
      "grad_norm": 28.867490768432617,
      "kl": 0.0,
      "learning_rate": 3.2583093431038993e-07,
      "logps/chosen": -153.498779296875,
      "logps/rejected": -174.12374877929688,
      "loss": 0.3801,
      "rewards/chosen": -0.5442581176757812,
      "rewards/margins": 1.9553544521331787,
      "rewards/rejected": -2.49961256980896,
      "step": 1331
    },
    {
      "epoch": 0.35,
      "grad_norm": 34.34536361694336,
      "kl": 0.0,
      "learning_rate": 3.257000785134781e-07,
      "logps/chosen": -166.44349670410156,
      "logps/rejected": -246.9544677734375,
      "loss": 0.2097,
      "rewards/chosen": 0.37404146790504456,
      "rewards/margins": 4.729082107543945,
      "rewards/rejected": -4.355040550231934,
      "step": 1332
    },
    {
      "epoch": 0.35,
      "grad_norm": 34.6766471862793,
      "kl": 0.0,
      "learning_rate": 3.255692227165663e-07,
      "logps/chosen": -306.0629577636719,
      "logps/rejected": -280.69903564453125,
      "loss": 0.2987,
      "rewards/chosen": 0.3299980163574219,
      "rewards/margins": 6.534651279449463,
      "rewards/rejected": -6.204653263092041,
      "step": 1333
    },
    {
      "epoch": 0.35,
      "grad_norm": 39.86823654174805,
      "kl": 0.0,
      "learning_rate": 3.254383669196545e-07,
      "logps/chosen": -244.7062530517578,
      "logps/rejected": -260.9649658203125,
      "loss": 0.3421,
      "rewards/chosen": 0.7138856649398804,
      "rewards/margins": 2.7415547370910645,
      "rewards/rejected": -2.0276689529418945,
      "step": 1334
    },
    {
      "epoch": 0.35,
      "grad_norm": 36.28447341918945,
      "kl": 0.0,
      "learning_rate": 3.253075111227427e-07,
      "logps/chosen": -216.41928100585938,
      "logps/rejected": -272.4765930175781,
      "loss": 0.282,
      "rewards/chosen": 1.007466435432434,
      "rewards/margins": 4.307610988616943,
      "rewards/rejected": -3.300144672393799,
      "step": 1335
    },
    {
      "epoch": 0.35,
      "grad_norm": 37.88795471191406,
      "kl": 0.0,
      "learning_rate": 3.2517665532583096e-07,
      "logps/chosen": -272.2811584472656,
      "logps/rejected": -198.16000366210938,
      "loss": 0.3557,
      "rewards/chosen": 0.30693429708480835,
      "rewards/margins": 2.8492815494537354,
      "rewards/rejected": -2.5423471927642822,
      "step": 1336
    },
    {
      "epoch": 0.35,
      "grad_norm": 37.08567810058594,
      "kl": 0.0,
      "learning_rate": 3.2504579952891916e-07,
      "logps/chosen": -127.075439453125,
      "logps/rejected": -191.78448486328125,
      "loss": 0.316,
      "rewards/chosen": -1.1991398334503174,
      "rewards/margins": 2.050868272781372,
      "rewards/rejected": -3.2500081062316895,
      "step": 1337
    },
    {
      "epoch": 0.35,
      "grad_norm": 28.48150062561035,
      "kl": 0.0,
      "learning_rate": 3.2491494373200736e-07,
      "logps/chosen": -139.986572265625,
      "logps/rejected": -309.28363037109375,
      "loss": 0.2604,
      "rewards/chosen": 0.7744281888008118,
      "rewards/margins": 3.9714620113372803,
      "rewards/rejected": -3.1970338821411133,
      "step": 1338
    },
    {
      "epoch": 0.35,
      "grad_norm": 32.136024475097656,
      "kl": 0.0,
      "learning_rate": 3.247840879350955e-07,
      "logps/chosen": -233.92196655273438,
      "logps/rejected": -303.3210754394531,
      "loss": 0.3133,
      "rewards/chosen": -1.318963885307312,
      "rewards/margins": 2.054591655731201,
      "rewards/rejected": -3.3735556602478027,
      "step": 1339
    },
    {
      "epoch": 0.35,
      "grad_norm": 39.1546630859375,
      "kl": 0.0,
      "learning_rate": 3.246532321381837e-07,
      "logps/chosen": -158.99038696289062,
      "logps/rejected": -258.7633972167969,
      "loss": 0.41,
      "rewards/chosen": -0.913709819316864,
      "rewards/margins": 3.426400899887085,
      "rewards/rejected": -4.340110778808594,
      "step": 1340
    },
    {
      "epoch": 0.35,
      "grad_norm": 31.439769744873047,
      "kl": 0.0,
      "learning_rate": 3.245223763412719e-07,
      "logps/chosen": -248.65176391601562,
      "logps/rejected": -187.52197265625,
      "loss": 0.1849,
      "rewards/chosen": -0.4633801281452179,
      "rewards/margins": 3.0590274333953857,
      "rewards/rejected": -3.5224075317382812,
      "step": 1341
    },
    {
      "epoch": 0.35,
      "grad_norm": 24.785945892333984,
      "kl": 0.0,
      "learning_rate": 3.243915205443601e-07,
      "logps/chosen": -163.9761199951172,
      "logps/rejected": -241.343994140625,
      "loss": 0.2388,
      "rewards/chosen": -0.21575519442558289,
      "rewards/margins": 3.2993061542510986,
      "rewards/rejected": -3.515061378479004,
      "step": 1342
    },
    {
      "epoch": 0.35,
      "grad_norm": 38.66981887817383,
      "kl": 0.0,
      "learning_rate": 3.242606647474483e-07,
      "logps/chosen": -245.6348114013672,
      "logps/rejected": -206.09735107421875,
      "loss": 0.3331,
      "rewards/chosen": 0.7461506128311157,
      "rewards/margins": 2.7707104682922363,
      "rewards/rejected": -2.02455997467041,
      "step": 1343
    },
    {
      "epoch": 0.35,
      "grad_norm": 31.069034576416016,
      "kl": 0.0,
      "learning_rate": 3.241298089505365e-07,
      "logps/chosen": -217.89781188964844,
      "logps/rejected": -283.12060546875,
      "loss": 0.3239,
      "rewards/chosen": 0.8098434209823608,
      "rewards/margins": 4.698688983917236,
      "rewards/rejected": -3.888845682144165,
      "step": 1344
    },
    {
      "epoch": 0.35,
      "grad_norm": 33.27344512939453,
      "kl": 0.0,
      "learning_rate": 3.239989531536247e-07,
      "logps/chosen": -275.2389831542969,
      "logps/rejected": -250.8555145263672,
      "loss": 0.3039,
      "rewards/chosen": -0.4530223309993744,
      "rewards/margins": 3.8131110668182373,
      "rewards/rejected": -4.2661333084106445,
      "step": 1345
    },
    {
      "epoch": 0.35,
      "grad_norm": 38.127281188964844,
      "kl": 0.0,
      "learning_rate": 3.2386809735671287e-07,
      "logps/chosen": -169.89163208007812,
      "logps/rejected": -284.6907653808594,
      "loss": 0.4194,
      "rewards/chosen": -0.27509641647338867,
      "rewards/margins": 3.6556601524353027,
      "rewards/rejected": -3.9307565689086914,
      "step": 1346
    },
    {
      "epoch": 0.35,
      "grad_norm": 29.15488052368164,
      "kl": 0.0,
      "learning_rate": 3.2373724155980107e-07,
      "logps/chosen": -197.29884338378906,
      "logps/rejected": -242.72018432617188,
      "loss": 0.3905,
      "rewards/chosen": 0.06425243616104126,
      "rewards/margins": 3.8348217010498047,
      "rewards/rejected": -3.770569324493408,
      "step": 1347
    },
    {
      "epoch": 0.35,
      "grad_norm": 30.490800857543945,
      "kl": 0.0,
      "learning_rate": 3.2360638576288926e-07,
      "logps/chosen": -191.22560119628906,
      "logps/rejected": -229.977783203125,
      "loss": 0.3334,
      "rewards/chosen": -0.10668891668319702,
      "rewards/margins": 2.346184253692627,
      "rewards/rejected": -2.4528732299804688,
      "step": 1348
    },
    {
      "epoch": 0.35,
      "grad_norm": 34.293060302734375,
      "kl": 0.0,
      "learning_rate": 3.234755299659775e-07,
      "logps/chosen": -214.0876922607422,
      "logps/rejected": -265.4975891113281,
      "loss": 0.1624,
      "rewards/chosen": 1.9567590951919556,
      "rewards/margins": 5.267825603485107,
      "rewards/rejected": -3.3110663890838623,
      "step": 1349
    },
    {
      "epoch": 0.35,
      "grad_norm": 39.73982238769531,
      "kl": 0.0,
      "learning_rate": 3.233446741690657e-07,
      "logps/chosen": -167.7155303955078,
      "logps/rejected": -342.3310546875,
      "loss": 0.1932,
      "rewards/chosen": 1.1075279712677002,
      "rewards/margins": 5.490621566772461,
      "rewards/rejected": -4.383093357086182,
      "step": 1350
    },
    {
      "epoch": 0.35,
      "grad_norm": 38.64570236206055,
      "kl": 0.0,
      "learning_rate": 3.232138183721539e-07,
      "logps/chosen": -183.162109375,
      "logps/rejected": -280.5931091308594,
      "loss": 0.2884,
      "rewards/chosen": 1.5205305814743042,
      "rewards/margins": 4.8635640144348145,
      "rewards/rejected": -3.3430333137512207,
      "step": 1351
    },
    {
      "epoch": 0.35,
      "grad_norm": 34.20791244506836,
      "kl": 0.0,
      "learning_rate": 3.230829625752421e-07,
      "logps/chosen": -206.82135009765625,
      "logps/rejected": -225.54063415527344,
      "loss": 0.3409,
      "rewards/chosen": 0.016800865530967712,
      "rewards/margins": 2.840919017791748,
      "rewards/rejected": -2.824118137359619,
      "step": 1352
    },
    {
      "epoch": 0.35,
      "grad_norm": 35.95807647705078,
      "kl": 0.0,
      "learning_rate": 3.229521067783303e-07,
      "logps/chosen": -185.8177490234375,
      "logps/rejected": -196.95664978027344,
      "loss": 0.2837,
      "rewards/chosen": 0.334745854139328,
      "rewards/margins": 1.8283900022506714,
      "rewards/rejected": -1.493644118309021,
      "step": 1353
    },
    {
      "epoch": 0.35,
      "grad_norm": 34.594947814941406,
      "kl": 0.0,
      "learning_rate": 3.2282125098141844e-07,
      "logps/chosen": -233.8248291015625,
      "logps/rejected": -209.21939086914062,
      "loss": 0.3388,
      "rewards/chosen": 0.3423979878425598,
      "rewards/margins": 2.1111137866973877,
      "rewards/rejected": -1.768715739250183,
      "step": 1354
    },
    {
      "epoch": 0.35,
      "grad_norm": 35.39228057861328,
      "kl": 0.0,
      "learning_rate": 3.2269039518450664e-07,
      "logps/chosen": -188.0096435546875,
      "logps/rejected": -226.69932556152344,
      "loss": 0.287,
      "rewards/chosen": 0.7518016695976257,
      "rewards/margins": 4.171019077301025,
      "rewards/rejected": -3.419217348098755,
      "step": 1355
    },
    {
      "epoch": 0.35,
      "grad_norm": 37.23146057128906,
      "kl": 0.0,
      "learning_rate": 3.2255953938759483e-07,
      "logps/chosen": -220.07754516601562,
      "logps/rejected": -296.0440979003906,
      "loss": 0.2496,
      "rewards/chosen": 0.1624390184879303,
      "rewards/margins": 4.445444107055664,
      "rewards/rejected": -4.283005237579346,
      "step": 1356
    },
    {
      "epoch": 0.36,
      "grad_norm": 37.137718200683594,
      "kl": 0.0,
      "learning_rate": 3.2242868359068303e-07,
      "logps/chosen": -179.314453125,
      "logps/rejected": -221.1763916015625,
      "loss": 0.2737,
      "rewards/chosen": 0.19471484422683716,
      "rewards/margins": 2.744776964187622,
      "rewards/rejected": -2.5500621795654297,
      "step": 1357
    },
    {
      "epoch": 0.36,
      "grad_norm": 38.724143981933594,
      "kl": 0.0,
      "learning_rate": 3.222978277937712e-07,
      "logps/chosen": -208.41488647460938,
      "logps/rejected": -224.366455078125,
      "loss": 0.237,
      "rewards/chosen": 1.1091750860214233,
      "rewards/margins": 5.58030366897583,
      "rewards/rejected": -4.471128463745117,
      "step": 1358
    },
    {
      "epoch": 0.36,
      "grad_norm": 38.86799621582031,
      "kl": 0.0,
      "learning_rate": 3.221669719968594e-07,
      "logps/chosen": -221.882568359375,
      "logps/rejected": -244.26026916503906,
      "loss": 0.3496,
      "rewards/chosen": 0.35998934507369995,
      "rewards/margins": 3.726022958755493,
      "rewards/rejected": -3.3660335540771484,
      "step": 1359
    },
    {
      "epoch": 0.36,
      "grad_norm": 28.907777786254883,
      "kl": 0.0,
      "learning_rate": 3.220361161999476e-07,
      "logps/chosen": -229.4911651611328,
      "logps/rejected": -188.29910278320312,
      "loss": 0.2566,
      "rewards/chosen": 0.28083348274230957,
      "rewards/margins": 2.9774253368377686,
      "rewards/rejected": -2.696591854095459,
      "step": 1360
    },
    {
      "epoch": 0.36,
      "grad_norm": 36.41794967651367,
      "kl": 0.0,
      "learning_rate": 3.219052604030358e-07,
      "logps/chosen": -156.39662170410156,
      "logps/rejected": -194.1817626953125,
      "loss": 0.3464,
      "rewards/chosen": 0.58488529920578,
      "rewards/margins": 2.58624529838562,
      "rewards/rejected": -2.0013599395751953,
      "step": 1361
    },
    {
      "epoch": 0.36,
      "grad_norm": 30.247217178344727,
      "kl": 0.0,
      "learning_rate": 3.2177440460612406e-07,
      "logps/chosen": -207.5975799560547,
      "logps/rejected": -271.22698974609375,
      "loss": 0.2429,
      "rewards/chosen": 2.0984890460968018,
      "rewards/margins": 6.212061882019043,
      "rewards/rejected": -4.11357307434082,
      "step": 1362
    },
    {
      "epoch": 0.36,
      "grad_norm": 36.8974723815918,
      "kl": 0.0,
      "learning_rate": 3.2164354880921226e-07,
      "logps/chosen": -251.12509155273438,
      "logps/rejected": -222.53085327148438,
      "loss": 0.3591,
      "rewards/chosen": -0.7981698513031006,
      "rewards/margins": 2.2022039890289307,
      "rewards/rejected": -3.0003738403320312,
      "step": 1363
    },
    {
      "epoch": 0.36,
      "grad_norm": 31.882705688476562,
      "kl": 0.0,
      "learning_rate": 3.2151269301230045e-07,
      "logps/chosen": -241.10394287109375,
      "logps/rejected": -229.85205078125,
      "loss": 0.256,
      "rewards/chosen": -1.05807626247406,
      "rewards/margins": 2.32078218460083,
      "rewards/rejected": -3.3788583278656006,
      "step": 1364
    },
    {
      "epoch": 0.36,
      "grad_norm": 32.5128288269043,
      "kl": 0.0,
      "learning_rate": 3.2138183721538865e-07,
      "logps/chosen": -293.7722473144531,
      "logps/rejected": -266.4582214355469,
      "loss": 0.3502,
      "rewards/chosen": -0.9223523139953613,
      "rewards/margins": 2.8367271423339844,
      "rewards/rejected": -3.7590794563293457,
      "step": 1365
    },
    {
      "epoch": 0.36,
      "grad_norm": 31.339826583862305,
      "kl": 0.0,
      "learning_rate": 3.2125098141847685e-07,
      "logps/chosen": -186.4215545654297,
      "logps/rejected": -194.24964904785156,
      "loss": 0.309,
      "rewards/chosen": -0.3455151915550232,
      "rewards/margins": 2.1275269985198975,
      "rewards/rejected": -2.4730422496795654,
      "step": 1366
    },
    {
      "epoch": 0.36,
      "grad_norm": 27.793371200561523,
      "kl": 0.0,
      "learning_rate": 3.2112012562156504e-07,
      "logps/chosen": -207.349609375,
      "logps/rejected": -221.176025390625,
      "loss": 0.3284,
      "rewards/chosen": -1.0674750804901123,
      "rewards/margins": 1.9950244426727295,
      "rewards/rejected": -3.062499523162842,
      "step": 1367
    },
    {
      "epoch": 0.36,
      "grad_norm": 25.472999572753906,
      "kl": 0.0,
      "learning_rate": 3.2098926982465324e-07,
      "logps/chosen": -225.4566192626953,
      "logps/rejected": -209.94873046875,
      "loss": 0.2498,
      "rewards/chosen": -0.36175042390823364,
      "rewards/margins": 2.9907066822052,
      "rewards/rejected": -3.352457046508789,
      "step": 1368
    },
    {
      "epoch": 0.36,
      "grad_norm": 31.11713981628418,
      "kl": 0.0,
      "learning_rate": 3.2085841402774143e-07,
      "logps/chosen": -164.0789031982422,
      "logps/rejected": -194.20265197753906,
      "loss": 0.3384,
      "rewards/chosen": 0.4543860852718353,
      "rewards/margins": 2.6434974670410156,
      "rewards/rejected": -2.1891114711761475,
      "step": 1369
    },
    {
      "epoch": 0.36,
      "grad_norm": 34.00928497314453,
      "kl": 0.0,
      "learning_rate": 3.207275582308296e-07,
      "logps/chosen": -212.17886352539062,
      "logps/rejected": -181.32400512695312,
      "loss": 0.2691,
      "rewards/chosen": 0.9818140864372253,
      "rewards/margins": 3.594285011291504,
      "rewards/rejected": -2.612470865249634,
      "step": 1370
    },
    {
      "epoch": 0.36,
      "grad_norm": 29.166141510009766,
      "kl": 0.0,
      "learning_rate": 3.2059670243391777e-07,
      "logps/chosen": -249.57412719726562,
      "logps/rejected": -267.99700927734375,
      "loss": 0.3086,
      "rewards/chosen": 0.5101392269134521,
      "rewards/margins": 4.556434631347656,
      "rewards/rejected": -4.046295642852783,
      "step": 1371
    },
    {
      "epoch": 0.36,
      "grad_norm": 31.815664291381836,
      "kl": 0.0,
      "learning_rate": 3.2046584663700597e-07,
      "logps/chosen": -231.37120056152344,
      "logps/rejected": -383.26568603515625,
      "loss": 0.2616,
      "rewards/chosen": 0.07025027275085449,
      "rewards/margins": 8.557687759399414,
      "rewards/rejected": -8.48743724822998,
      "step": 1372
    },
    {
      "epoch": 0.36,
      "grad_norm": 36.147239685058594,
      "kl": 0.0,
      "learning_rate": 3.2033499084009417e-07,
      "logps/chosen": -177.21237182617188,
      "logps/rejected": -266.6687316894531,
      "loss": 0.3493,
      "rewards/chosen": -0.384672611951828,
      "rewards/margins": 2.363210439682007,
      "rewards/rejected": -2.7478830814361572,
      "step": 1373
    },
    {
      "epoch": 0.36,
      "grad_norm": 37.070411682128906,
      "kl": 0.0,
      "learning_rate": 3.202041350431824e-07,
      "logps/chosen": -232.02838134765625,
      "logps/rejected": -181.37887573242188,
      "loss": 0.3831,
      "rewards/chosen": -0.11297231912612915,
      "rewards/margins": 2.5317389965057373,
      "rewards/rejected": -2.6447112560272217,
      "step": 1374
    },
    {
      "epoch": 0.36,
      "grad_norm": 42.010162353515625,
      "kl": 0.0,
      "learning_rate": 3.200732792462706e-07,
      "logps/chosen": -185.11367797851562,
      "logps/rejected": -175.94879150390625,
      "loss": 0.3412,
      "rewards/chosen": 0.004318729043006897,
      "rewards/margins": 2.7210121154785156,
      "rewards/rejected": -2.71669340133667,
      "step": 1375
    },
    {
      "epoch": 0.36,
      "grad_norm": 32.87958908081055,
      "kl": 0.0,
      "learning_rate": 3.199424234493588e-07,
      "logps/chosen": -230.2548065185547,
      "logps/rejected": -321.4191589355469,
      "loss": 0.3384,
      "rewards/chosen": -0.9217988848686218,
      "rewards/margins": 1.7525115013122559,
      "rewards/rejected": -2.6743104457855225,
      "step": 1376
    },
    {
      "epoch": 0.36,
      "grad_norm": 36.438838958740234,
      "kl": 0.0,
      "learning_rate": 3.19811567652447e-07,
      "logps/chosen": -219.46987915039062,
      "logps/rejected": -230.5147247314453,
      "loss": 0.2815,
      "rewards/chosen": 0.13436929881572723,
      "rewards/margins": 2.3530349731445312,
      "rewards/rejected": -2.218665599822998,
      "step": 1377
    },
    {
      "epoch": 0.36,
      "grad_norm": 31.880538940429688,
      "kl": 0.0,
      "learning_rate": 3.196807118555352e-07,
      "logps/chosen": -138.39146423339844,
      "logps/rejected": -214.18983459472656,
      "loss": 0.2752,
      "rewards/chosen": 1.1650010347366333,
      "rewards/margins": 3.4112720489501953,
      "rewards/rejected": -2.2462708950042725,
      "step": 1378
    },
    {
      "epoch": 0.36,
      "grad_norm": 24.90720558166504,
      "kl": 0.0,
      "learning_rate": 3.195498560586234e-07,
      "logps/chosen": -242.847412109375,
      "logps/rejected": -178.04931640625,
      "loss": 0.3436,
      "rewards/chosen": -0.43071645498275757,
      "rewards/margins": 2.444901943206787,
      "rewards/rejected": -2.8756184577941895,
      "step": 1379
    },
    {
      "epoch": 0.36,
      "grad_norm": 42.002777099609375,
      "kl": 0.0,
      "learning_rate": 3.194190002617116e-07,
      "logps/chosen": -255.03749084472656,
      "logps/rejected": -203.60467529296875,
      "loss": 0.3988,
      "rewards/chosen": -0.029839009046554565,
      "rewards/margins": 1.9282221794128418,
      "rewards/rejected": -1.9580612182617188,
      "step": 1380
    },
    {
      "epoch": 0.36,
      "grad_norm": 34.07427215576172,
      "kl": 0.0,
      "learning_rate": 3.192881444647998e-07,
      "logps/chosen": -143.0186767578125,
      "logps/rejected": -223.99244689941406,
      "loss": 0.3164,
      "rewards/chosen": 0.14076310396194458,
      "rewards/margins": 3.031914472579956,
      "rewards/rejected": -2.8911514282226562,
      "step": 1381
    },
    {
      "epoch": 0.36,
      "grad_norm": 29.833765029907227,
      "kl": 0.0,
      "learning_rate": 3.19157288667888e-07,
      "logps/chosen": -210.23605346679688,
      "logps/rejected": -235.79354858398438,
      "loss": 0.3206,
      "rewards/chosen": -0.04645270109176636,
      "rewards/margins": 4.771703720092773,
      "rewards/rejected": -4.8181562423706055,
      "step": 1382
    },
    {
      "epoch": 0.36,
      "grad_norm": 35.3935546875,
      "kl": 0.0,
      "learning_rate": 3.190264328709762e-07,
      "logps/chosen": -313.9126892089844,
      "logps/rejected": -148.69590759277344,
      "loss": 0.3035,
      "rewards/chosen": -0.025645911693572998,
      "rewards/margins": 2.0939083099365234,
      "rewards/rejected": -2.119554281234741,
      "step": 1383
    },
    {
      "epoch": 0.36,
      "grad_norm": 33.280887603759766,
      "kl": 0.0,
      "learning_rate": 3.188955770740644e-07,
      "logps/chosen": -209.35018920898438,
      "logps/rejected": -344.81787109375,
      "loss": 0.2728,
      "rewards/chosen": 0.5633559226989746,
      "rewards/margins": 11.289161682128906,
      "rewards/rejected": -10.72580623626709,
      "step": 1384
    },
    {
      "epoch": 0.36,
      "grad_norm": 29.389549255371094,
      "kl": 0.0,
      "learning_rate": 3.187647212771525e-07,
      "logps/chosen": -143.76699829101562,
      "logps/rejected": -267.39080810546875,
      "loss": 0.3333,
      "rewards/chosen": -0.05079573392868042,
      "rewards/margins": 3.496812582015991,
      "rewards/rejected": -3.5476083755493164,
      "step": 1385
    },
    {
      "epoch": 0.36,
      "grad_norm": 27.688262939453125,
      "kl": 0.0,
      "learning_rate": 3.186338654802407e-07,
      "logps/chosen": -173.18344116210938,
      "logps/rejected": -175.218017578125,
      "loss": 0.2628,
      "rewards/chosen": 0.21559491753578186,
      "rewards/margins": 1.7542529106140137,
      "rewards/rejected": -1.5386580228805542,
      "step": 1386
    },
    {
      "epoch": 0.36,
      "grad_norm": 26.76343536376953,
      "kl": 0.0,
      "learning_rate": 3.1850300968332896e-07,
      "logps/chosen": -156.0584259033203,
      "logps/rejected": -314.71502685546875,
      "loss": 0.1997,
      "rewards/chosen": 1.3652375936508179,
      "rewards/margins": 7.026503562927246,
      "rewards/rejected": -5.661265850067139,
      "step": 1387
    },
    {
      "epoch": 0.36,
      "grad_norm": 33.4283332824707,
      "kl": 0.0,
      "learning_rate": 3.1837215388641716e-07,
      "logps/chosen": -196.38980102539062,
      "logps/rejected": -226.2455291748047,
      "loss": 0.153,
      "rewards/chosen": 3.0454821586608887,
      "rewards/margins": 6.030291557312012,
      "rewards/rejected": -2.984809160232544,
      "step": 1388
    },
    {
      "epoch": 0.36,
      "grad_norm": 38.99360656738281,
      "kl": 0.0,
      "learning_rate": 3.1824129808950536e-07,
      "logps/chosen": -282.0292663574219,
      "logps/rejected": -421.4825439453125,
      "loss": 0.2815,
      "rewards/chosen": 0.8324300646781921,
      "rewards/margins": 3.952428102493286,
      "rewards/rejected": -3.119997978210449,
      "step": 1389
    },
    {
      "epoch": 0.36,
      "grad_norm": 43.50868606567383,
      "kl": 0.0,
      "learning_rate": 3.1811044229259355e-07,
      "logps/chosen": -124.22837829589844,
      "logps/rejected": -212.58775329589844,
      "loss": 0.3436,
      "rewards/chosen": 0.3585464656352997,
      "rewards/margins": 2.7998228073120117,
      "rewards/rejected": -2.4412763118743896,
      "step": 1390
    },
    {
      "epoch": 0.36,
      "grad_norm": 44.76431655883789,
      "kl": 0.0,
      "learning_rate": 3.1797958649568175e-07,
      "logps/chosen": -224.62722778320312,
      "logps/rejected": -197.28936767578125,
      "loss": 0.4756,
      "rewards/chosen": -1.3198111057281494,
      "rewards/margins": -0.12586522102355957,
      "rewards/rejected": -1.1939458847045898,
      "step": 1391
    },
    {
      "epoch": 0.36,
      "grad_norm": 33.97430419921875,
      "kl": 0.0,
      "learning_rate": 3.1784873069876994e-07,
      "logps/chosen": -203.94140625,
      "logps/rejected": -204.45584106445312,
      "loss": 0.3566,
      "rewards/chosen": 0.5176467895507812,
      "rewards/margins": 1.5917928218841553,
      "rewards/rejected": -1.074146032333374,
      "step": 1392
    },
    {
      "epoch": 0.36,
      "grad_norm": 34.34186553955078,
      "kl": 0.0,
      "learning_rate": 3.1771787490185814e-07,
      "logps/chosen": -232.93101501464844,
      "logps/rejected": -186.81085205078125,
      "loss": 0.366,
      "rewards/chosen": 0.39034122228622437,
      "rewards/margins": 1.8955557346343994,
      "rewards/rejected": -1.5052144527435303,
      "step": 1393
    },
    {
      "epoch": 0.36,
      "grad_norm": 34.1844367980957,
      "kl": 0.0,
      "learning_rate": 3.1758701910494634e-07,
      "logps/chosen": -183.45620727539062,
      "logps/rejected": -236.90887451171875,
      "loss": 0.2629,
      "rewards/chosen": 0.44305187463760376,
      "rewards/margins": 4.251132488250732,
      "rewards/rejected": -3.8080806732177734,
      "step": 1394
    },
    {
      "epoch": 0.37,
      "grad_norm": 43.349403381347656,
      "kl": 0.0,
      "learning_rate": 3.1745616330803453e-07,
      "logps/chosen": -195.5172882080078,
      "logps/rejected": -221.8050537109375,
      "loss": 0.2069,
      "rewards/chosen": 1.6076291799545288,
      "rewards/margins": 4.366086006164551,
      "rewards/rejected": -2.7584567070007324,
      "step": 1395
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.73597717285156,
      "kl": 0.0,
      "learning_rate": 3.1732530751112273e-07,
      "logps/chosen": -155.1674041748047,
      "logps/rejected": -318.8786315917969,
      "loss": 0.3455,
      "rewards/chosen": -0.8497037887573242,
      "rewards/margins": 2.198971748352051,
      "rewards/rejected": -3.048675537109375,
      "step": 1396
    },
    {
      "epoch": 0.37,
      "grad_norm": 33.25680160522461,
      "kl": 0.0,
      "learning_rate": 3.171944517142109e-07,
      "logps/chosen": -204.45106506347656,
      "logps/rejected": -172.80410766601562,
      "loss": 0.2137,
      "rewards/chosen": 1.4785555601119995,
      "rewards/margins": 3.709897041320801,
      "rewards/rejected": -2.231341600418091,
      "step": 1397
    },
    {
      "epoch": 0.37,
      "grad_norm": 28.279253005981445,
      "kl": 0.0,
      "learning_rate": 3.170635959172991e-07,
      "logps/chosen": -173.07913208007812,
      "logps/rejected": -171.8015594482422,
      "loss": 0.3284,
      "rewards/chosen": -0.3849499225616455,
      "rewards/margins": 1.7102558612823486,
      "rewards/rejected": -2.095205783843994,
      "step": 1398
    },
    {
      "epoch": 0.37,
      "grad_norm": 29.989870071411133,
      "kl": 0.0,
      "learning_rate": 3.169327401203873e-07,
      "logps/chosen": -340.0752868652344,
      "logps/rejected": -181.56031799316406,
      "loss": 0.2352,
      "rewards/chosen": 3.104139804840088,
      "rewards/margins": 6.055088996887207,
      "rewards/rejected": -2.95094895362854,
      "step": 1399
    },
    {
      "epoch": 0.37,
      "grad_norm": 36.66115188598633,
      "kl": 0.0,
      "learning_rate": 3.1680188432347557e-07,
      "logps/chosen": -242.0625,
      "logps/rejected": -340.12542724609375,
      "loss": 0.3103,
      "rewards/chosen": 0.20509715378284454,
      "rewards/margins": 3.771449089050293,
      "rewards/rejected": -3.566351890563965,
      "step": 1400
    },
    {
      "epoch": 0.37,
      "grad_norm": 30.042667388916016,
      "kl": 0.0,
      "learning_rate": 3.166710285265637e-07,
      "logps/chosen": -234.99282836914062,
      "logps/rejected": -247.1533203125,
      "loss": 0.2896,
      "rewards/chosen": -0.4193776547908783,
      "rewards/margins": 4.4713287353515625,
      "rewards/rejected": -4.890706539154053,
      "step": 1401
    },
    {
      "epoch": 0.37,
      "grad_norm": 40.00863265991211,
      "kl": 0.0,
      "learning_rate": 3.165401727296519e-07,
      "logps/chosen": -206.06680297851562,
      "logps/rejected": -228.10067749023438,
      "loss": 0.4289,
      "rewards/chosen": -0.21271708607673645,
      "rewards/margins": 2.1718521118164062,
      "rewards/rejected": -2.3845691680908203,
      "step": 1402
    },
    {
      "epoch": 0.37,
      "grad_norm": 31.949710845947266,
      "kl": 0.0,
      "learning_rate": 3.164093169327401e-07,
      "logps/chosen": -302.5041198730469,
      "logps/rejected": -295.21661376953125,
      "loss": 0.2195,
      "rewards/chosen": 0.8447118401527405,
      "rewards/margins": 4.163388252258301,
      "rewards/rejected": -3.318676471710205,
      "step": 1403
    },
    {
      "epoch": 0.37,
      "grad_norm": 30.53171157836914,
      "kl": 0.0,
      "learning_rate": 3.162784611358283e-07,
      "logps/chosen": -221.29522705078125,
      "logps/rejected": -240.13375854492188,
      "loss": 0.3361,
      "rewards/chosen": 0.3290228247642517,
      "rewards/margins": 3.6443843841552734,
      "rewards/rejected": -3.315361499786377,
      "step": 1404
    },
    {
      "epoch": 0.37,
      "grad_norm": 29.357757568359375,
      "kl": 0.0,
      "learning_rate": 3.161476053389165e-07,
      "logps/chosen": -222.23890686035156,
      "logps/rejected": -256.6720275878906,
      "loss": 0.3401,
      "rewards/chosen": -0.36933091282844543,
      "rewards/margins": 2.098792314529419,
      "rewards/rejected": -2.468123197555542,
      "step": 1405
    },
    {
      "epoch": 0.37,
      "grad_norm": 38.137054443359375,
      "kl": 0.0,
      "learning_rate": 3.160167495420047e-07,
      "logps/chosen": -224.1837615966797,
      "logps/rejected": -277.242919921875,
      "loss": 0.218,
      "rewards/chosen": 0.6834139227867126,
      "rewards/margins": 3.4380838871002197,
      "rewards/rejected": -2.7546699047088623,
      "step": 1406
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.083553314208984,
      "kl": 0.0,
      "learning_rate": 3.158858937450929e-07,
      "logps/chosen": -161.6896209716797,
      "logps/rejected": -279.6732177734375,
      "loss": 0.2805,
      "rewards/chosen": 0.5990890860557556,
      "rewards/margins": 5.596165180206299,
      "rewards/rejected": -4.997076034545898,
      "step": 1407
    },
    {
      "epoch": 0.37,
      "grad_norm": 30.056991577148438,
      "kl": 0.0,
      "learning_rate": 3.157550379481811e-07,
      "logps/chosen": -184.24851989746094,
      "logps/rejected": -239.89739990234375,
      "loss": 0.2644,
      "rewards/chosen": 0.6428428888320923,
      "rewards/margins": 3.2653703689575195,
      "rewards/rejected": -2.622527599334717,
      "step": 1408
    },
    {
      "epoch": 0.37,
      "grad_norm": 38.32441711425781,
      "kl": 0.0,
      "learning_rate": 3.156241821512693e-07,
      "logps/chosen": -213.1717071533203,
      "logps/rejected": -268.25689697265625,
      "loss": 0.1622,
      "rewards/chosen": 2.8008837699890137,
      "rewards/margins": 6.9828104972839355,
      "rewards/rejected": -4.181926727294922,
      "step": 1409
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.889060974121094,
      "kl": 0.0,
      "learning_rate": 3.154933263543575e-07,
      "logps/chosen": -222.70004272460938,
      "logps/rejected": -199.03073120117188,
      "loss": 0.4285,
      "rewards/chosen": -0.5681661367416382,
      "rewards/margins": 1.663326382637024,
      "rewards/rejected": -2.231492519378662,
      "step": 1410
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.03827667236328,
      "kl": 0.0,
      "learning_rate": 3.1536247055744567e-07,
      "logps/chosen": -199.5806427001953,
      "logps/rejected": -234.4297637939453,
      "loss": 0.2155,
      "rewards/chosen": 1.4614603519439697,
      "rewards/margins": 4.551479339599609,
      "rewards/rejected": -3.0900187492370605,
      "step": 1411
    },
    {
      "epoch": 0.37,
      "grad_norm": 28.646682739257812,
      "kl": 0.0,
      "learning_rate": 3.152316147605339e-07,
      "logps/chosen": -185.1912384033203,
      "logps/rejected": -371.14825439453125,
      "loss": 0.3057,
      "rewards/chosen": -0.053407806903123856,
      "rewards/margins": 4.185282230377197,
      "rewards/rejected": -4.23868989944458,
      "step": 1412
    },
    {
      "epoch": 0.37,
      "grad_norm": 36.860076904296875,
      "kl": 0.0,
      "learning_rate": 3.151007589636221e-07,
      "logps/chosen": -191.5133514404297,
      "logps/rejected": -253.78466796875,
      "loss": 0.2214,
      "rewards/chosen": 2.807755708694458,
      "rewards/margins": 5.15962028503418,
      "rewards/rejected": -2.3518643379211426,
      "step": 1413
    },
    {
      "epoch": 0.37,
      "grad_norm": 29.74126625061035,
      "kl": 0.0,
      "learning_rate": 3.149699031667103e-07,
      "logps/chosen": -219.7583770751953,
      "logps/rejected": -290.28253173828125,
      "loss": 0.2206,
      "rewards/chosen": 0.3516443073749542,
      "rewards/margins": 4.822754859924316,
      "rewards/rejected": -4.4711103439331055,
      "step": 1414
    },
    {
      "epoch": 0.37,
      "grad_norm": 33.049774169921875,
      "kl": 0.0,
      "learning_rate": 3.148390473697985e-07,
      "logps/chosen": -243.57969665527344,
      "logps/rejected": -400.4478454589844,
      "loss": 0.2593,
      "rewards/chosen": 1.0341131687164307,
      "rewards/margins": 7.01644229888916,
      "rewards/rejected": -5.982329368591309,
      "step": 1415
    },
    {
      "epoch": 0.37,
      "grad_norm": 27.80577278137207,
      "kl": 0.0,
      "learning_rate": 3.1470819157288665e-07,
      "logps/chosen": -198.178955078125,
      "logps/rejected": -237.63461303710938,
      "loss": 0.2456,
      "rewards/chosen": 0.9747270941734314,
      "rewards/margins": 4.922013282775879,
      "rewards/rejected": -3.9472861289978027,
      "step": 1416
    },
    {
      "epoch": 0.37,
      "grad_norm": 37.619537353515625,
      "kl": 0.0,
      "learning_rate": 3.1457733577597485e-07,
      "logps/chosen": -212.923583984375,
      "logps/rejected": -216.25405883789062,
      "loss": 0.3014,
      "rewards/chosen": -0.093365877866745,
      "rewards/margins": 2.6899845600128174,
      "rewards/rejected": -2.7833504676818848,
      "step": 1417
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.109161376953125,
      "kl": 0.0,
      "learning_rate": 3.1444647997906304e-07,
      "logps/chosen": -199.86090087890625,
      "logps/rejected": -230.55740356445312,
      "loss": 0.2963,
      "rewards/chosen": 0.27489233016967773,
      "rewards/margins": 2.1686692237854004,
      "rewards/rejected": -1.8937770128250122,
      "step": 1418
    },
    {
      "epoch": 0.37,
      "grad_norm": 35.83468246459961,
      "kl": 0.0,
      "learning_rate": 3.1431562418215124e-07,
      "logps/chosen": -225.78807067871094,
      "logps/rejected": -251.05801391601562,
      "loss": 0.3557,
      "rewards/chosen": 0.7054110765457153,
      "rewards/margins": 3.20662260055542,
      "rewards/rejected": -2.501211643218994,
      "step": 1419
    },
    {
      "epoch": 0.37,
      "grad_norm": 40.44602966308594,
      "kl": 0.0,
      "learning_rate": 3.1418476838523943e-07,
      "logps/chosen": -268.45751953125,
      "logps/rejected": -209.78469848632812,
      "loss": 0.3089,
      "rewards/chosen": 1.3857576847076416,
      "rewards/margins": 3.0642151832580566,
      "rewards/rejected": -1.678457498550415,
      "step": 1420
    },
    {
      "epoch": 0.37,
      "grad_norm": 33.158668518066406,
      "kl": 0.0,
      "learning_rate": 3.1405391258832763e-07,
      "logps/chosen": -183.05435180664062,
      "logps/rejected": -246.34567260742188,
      "loss": 0.3375,
      "rewards/chosen": 0.30931970477104187,
      "rewards/margins": 3.5352070331573486,
      "rewards/rejected": -3.2258872985839844,
      "step": 1421
    },
    {
      "epoch": 0.37,
      "grad_norm": 37.4815559387207,
      "kl": 0.0,
      "learning_rate": 3.1392305679141583e-07,
      "logps/chosen": -197.08316040039062,
      "logps/rejected": -252.58250427246094,
      "loss": 0.2512,
      "rewards/chosen": 1.102900743484497,
      "rewards/margins": 3.982532501220703,
      "rewards/rejected": -2.879631757736206,
      "step": 1422
    },
    {
      "epoch": 0.37,
      "grad_norm": 33.83235168457031,
      "kl": 0.0,
      "learning_rate": 3.13792200994504e-07,
      "logps/chosen": -212.65773010253906,
      "logps/rejected": -306.92974853515625,
      "loss": 0.2468,
      "rewards/chosen": 0.312684565782547,
      "rewards/margins": 3.8462066650390625,
      "rewards/rejected": -3.533522129058838,
      "step": 1423
    },
    {
      "epoch": 0.37,
      "grad_norm": 32.37043380737305,
      "kl": 0.0,
      "learning_rate": 3.136613451975922e-07,
      "logps/chosen": -202.6951904296875,
      "logps/rejected": -128.4596405029297,
      "loss": 0.2921,
      "rewards/chosen": -0.11523666977882385,
      "rewards/margins": 1.6435047388076782,
      "rewards/rejected": -1.7587413787841797,
      "step": 1424
    },
    {
      "epoch": 0.37,
      "grad_norm": 28.053985595703125,
      "kl": 0.0,
      "learning_rate": 3.1353048940068047e-07,
      "logps/chosen": -216.8121795654297,
      "logps/rejected": -221.2266387939453,
      "loss": 0.1942,
      "rewards/chosen": 0.6265994310379028,
      "rewards/margins": 4.50843620300293,
      "rewards/rejected": -3.8818366527557373,
      "step": 1425
    },
    {
      "epoch": 0.37,
      "grad_norm": 39.29544448852539,
      "kl": 0.0,
      "learning_rate": 3.1339963360376866e-07,
      "logps/chosen": -183.68960571289062,
      "logps/rejected": -299.44647216796875,
      "loss": 0.2205,
      "rewards/chosen": 0.5857275128364563,
      "rewards/margins": 3.8476221561431885,
      "rewards/rejected": -3.261894702911377,
      "step": 1426
    },
    {
      "epoch": 0.37,
      "grad_norm": 41.37693786621094,
      "kl": 0.0,
      "learning_rate": 3.1326877780685686e-07,
      "logps/chosen": -191.17079162597656,
      "logps/rejected": -236.34121704101562,
      "loss": 0.2157,
      "rewards/chosen": 0.22563637793064117,
      "rewards/margins": 3.2330408096313477,
      "rewards/rejected": -3.007404327392578,
      "step": 1427
    },
    {
      "epoch": 0.37,
      "grad_norm": 31.09884262084961,
      "kl": 0.0,
      "learning_rate": 3.1313792200994506e-07,
      "logps/chosen": -195.26025390625,
      "logps/rejected": -295.53948974609375,
      "loss": 0.3539,
      "rewards/chosen": -0.5408454537391663,
      "rewards/margins": 2.64743971824646,
      "rewards/rejected": -3.1882851123809814,
      "step": 1428
    },
    {
      "epoch": 0.37,
      "grad_norm": 31.855932235717773,
      "kl": 0.0,
      "learning_rate": 3.1300706621303325e-07,
      "logps/chosen": -185.37232971191406,
      "logps/rejected": -397.4236145019531,
      "loss": 0.2468,
      "rewards/chosen": 0.33413439989089966,
      "rewards/margins": 4.738489151000977,
      "rewards/rejected": -4.404354572296143,
      "step": 1429
    },
    {
      "epoch": 0.37,
      "grad_norm": 26.01105499267578,
      "kl": 0.0,
      "learning_rate": 3.1287621041612145e-07,
      "logps/chosen": -191.81692504882812,
      "logps/rejected": -241.3495635986328,
      "loss": 0.1754,
      "rewards/chosen": 0.1391974687576294,
      "rewards/margins": 3.8511571884155273,
      "rewards/rejected": -3.7119598388671875,
      "step": 1430
    },
    {
      "epoch": 0.37,
      "grad_norm": 33.91474151611328,
      "kl": 0.0,
      "learning_rate": 3.127453546192096e-07,
      "logps/chosen": -314.10211181640625,
      "logps/rejected": -266.388671875,
      "loss": 0.2915,
      "rewards/chosen": 0.3027981221675873,
      "rewards/margins": 4.647043228149414,
      "rewards/rejected": -4.344244956970215,
      "step": 1431
    },
    {
      "epoch": 0.37,
      "grad_norm": 29.76590347290039,
      "kl": 0.0,
      "learning_rate": 3.126144988222978e-07,
      "logps/chosen": -160.12327575683594,
      "logps/rejected": -245.9610137939453,
      "loss": 0.3936,
      "rewards/chosen": -0.5193794965744019,
      "rewards/margins": 2.19705867767334,
      "rewards/rejected": -2.716438055038452,
      "step": 1432
    },
    {
      "epoch": 0.38,
      "grad_norm": 32.13069534301758,
      "kl": 0.0,
      "learning_rate": 3.12483643025386e-07,
      "logps/chosen": -240.64173889160156,
      "logps/rejected": -193.10438537597656,
      "loss": 0.2948,
      "rewards/chosen": 0.27804023027420044,
      "rewards/margins": 3.268437385559082,
      "rewards/rejected": -2.9903972148895264,
      "step": 1433
    },
    {
      "epoch": 0.38,
      "grad_norm": 27.502201080322266,
      "kl": 0.0,
      "learning_rate": 3.123527872284742e-07,
      "logps/chosen": -235.20938110351562,
      "logps/rejected": -323.3491516113281,
      "loss": 0.2264,
      "rewards/chosen": 1.5059895515441895,
      "rewards/margins": 5.093905448913574,
      "rewards/rejected": -3.587916135787964,
      "step": 1434
    },
    {
      "epoch": 0.38,
      "grad_norm": 35.56685256958008,
      "kl": 0.0,
      "learning_rate": 3.122219314315624e-07,
      "logps/chosen": -182.975830078125,
      "logps/rejected": -133.8419189453125,
      "loss": 0.2527,
      "rewards/chosen": 0.5755879282951355,
      "rewards/margins": 3.9616384506225586,
      "rewards/rejected": -3.3860504627227783,
      "step": 1435
    },
    {
      "epoch": 0.38,
      "grad_norm": 29.447450637817383,
      "kl": 0.0,
      "learning_rate": 3.1209107563465057e-07,
      "logps/chosen": -191.61111450195312,
      "logps/rejected": -312.0847473144531,
      "loss": 0.2663,
      "rewards/chosen": -0.32649320363998413,
      "rewards/margins": 4.456490993499756,
      "rewards/rejected": -4.782984256744385,
      "step": 1436
    },
    {
      "epoch": 0.38,
      "grad_norm": 39.114532470703125,
      "kl": 0.0,
      "learning_rate": 3.1196021983773877e-07,
      "logps/chosen": -248.420654296875,
      "logps/rejected": -363.3230285644531,
      "loss": 0.2007,
      "rewards/chosen": 1.6235179901123047,
      "rewards/margins": 5.3430280685424805,
      "rewards/rejected": -3.719510078430176,
      "step": 1437
    },
    {
      "epoch": 0.38,
      "grad_norm": 26.85320281982422,
      "kl": 0.0,
      "learning_rate": 3.11829364040827e-07,
      "logps/chosen": -168.63673400878906,
      "logps/rejected": -158.07162475585938,
      "loss": 0.3226,
      "rewards/chosen": 0.7311351299285889,
      "rewards/margins": 3.2565670013427734,
      "rewards/rejected": -2.5254318714141846,
      "step": 1438
    },
    {
      "epoch": 0.38,
      "grad_norm": 35.99043273925781,
      "kl": 0.0,
      "learning_rate": 3.116985082439152e-07,
      "logps/chosen": -275.37347412109375,
      "logps/rejected": -253.6946563720703,
      "loss": 0.2417,
      "rewards/chosen": 0.5541010499000549,
      "rewards/margins": 4.043929576873779,
      "rewards/rejected": -3.48982834815979,
      "step": 1439
    },
    {
      "epoch": 0.38,
      "grad_norm": 39.426124572753906,
      "kl": 0.0,
      "learning_rate": 3.115676524470034e-07,
      "logps/chosen": -251.13905334472656,
      "logps/rejected": -192.73565673828125,
      "loss": 0.3679,
      "rewards/chosen": 0.6304320096969604,
      "rewards/margins": 2.555609941482544,
      "rewards/rejected": -1.9251779317855835,
      "step": 1440
    },
    {
      "epoch": 0.38,
      "grad_norm": 27.877065658569336,
      "kl": 0.0,
      "learning_rate": 3.114367966500916e-07,
      "logps/chosen": -208.22756958007812,
      "logps/rejected": -168.75772094726562,
      "loss": 0.319,
      "rewards/chosen": -0.7844001650810242,
      "rewards/margins": 2.3113536834716797,
      "rewards/rejected": -3.0957539081573486,
      "step": 1441
    },
    {
      "epoch": 0.38,
      "grad_norm": 42.60310745239258,
      "kl": 0.0,
      "learning_rate": 3.113059408531798e-07,
      "logps/chosen": -238.57073974609375,
      "logps/rejected": -259.5094909667969,
      "loss": 0.4606,
      "rewards/chosen": -0.1071540042757988,
      "rewards/margins": 1.33914053440094,
      "rewards/rejected": -1.4462945461273193,
      "step": 1442
    },
    {
      "epoch": 0.38,
      "grad_norm": 31.70445442199707,
      "kl": 0.0,
      "learning_rate": 3.11175085056268e-07,
      "logps/chosen": -188.24851989746094,
      "logps/rejected": -266.7518310546875,
      "loss": 0.3175,
      "rewards/chosen": 0.5412964820861816,
      "rewards/margins": 3.8777060508728027,
      "rewards/rejected": -3.336409568786621,
      "step": 1443
    },
    {
      "epoch": 0.38,
      "grad_norm": 42.61405944824219,
      "kl": 0.0,
      "learning_rate": 3.110442292593562e-07,
      "logps/chosen": -159.64474487304688,
      "logps/rejected": -288.9511413574219,
      "loss": 0.2807,
      "rewards/chosen": 0.07189539819955826,
      "rewards/margins": 4.065637588500977,
      "rewards/rejected": -3.993741989135742,
      "step": 1444
    },
    {
      "epoch": 0.38,
      "grad_norm": 33.7481689453125,
      "kl": 0.0,
      "learning_rate": 3.109133734624444e-07,
      "logps/chosen": -176.5513916015625,
      "logps/rejected": -195.16612243652344,
      "loss": 0.3098,
      "rewards/chosen": -0.15705662965774536,
      "rewards/margins": 2.0167901515960693,
      "rewards/rejected": -2.17384672164917,
      "step": 1445
    },
    {
      "epoch": 0.38,
      "grad_norm": 31.605884552001953,
      "kl": 0.0,
      "learning_rate": 3.107825176655326e-07,
      "logps/chosen": -246.95982360839844,
      "logps/rejected": -293.61956787109375,
      "loss": 0.2951,
      "rewards/chosen": 0.4098077118396759,
      "rewards/margins": 4.766162395477295,
      "rewards/rejected": -4.356354713439941,
      "step": 1446
    },
    {
      "epoch": 0.38,
      "grad_norm": 37.16173553466797,
      "kl": 0.0,
      "learning_rate": 3.1065166186862073e-07,
      "logps/chosen": -241.196044921875,
      "logps/rejected": -211.0733642578125,
      "loss": 0.2888,
      "rewards/chosen": -0.24421033263206482,
      "rewards/margins": 4.125661373138428,
      "rewards/rejected": -4.369871616363525,
      "step": 1447
    },
    {
      "epoch": 0.38,
      "grad_norm": 39.998992919921875,
      "kl": 0.0,
      "learning_rate": 3.105208060717089e-07,
      "logps/chosen": -293.6846618652344,
      "logps/rejected": -206.80398559570312,
      "loss": 0.2356,
      "rewards/chosen": -0.6223787665367126,
      "rewards/margins": 2.1629295349121094,
      "rewards/rejected": -2.785308361053467,
      "step": 1448
    },
    {
      "epoch": 0.38,
      "grad_norm": 52.20500183105469,
      "kl": 0.0,
      "learning_rate": 3.103899502747971e-07,
      "logps/chosen": -192.3514404296875,
      "logps/rejected": -288.9823913574219,
      "loss": 0.2946,
      "rewards/chosen": -0.11612102389335632,
      "rewards/margins": 3.5733249187469482,
      "rewards/rejected": -3.689445972442627,
      "step": 1449
    },
    {
      "epoch": 0.38,
      "grad_norm": 31.664487838745117,
      "kl": 0.0,
      "learning_rate": 3.102590944778853e-07,
      "logps/chosen": -199.2932891845703,
      "logps/rejected": -276.72418212890625,
      "loss": 0.2768,
      "rewards/chosen": 0.11254112422466278,
      "rewards/margins": 3.0545997619628906,
      "rewards/rejected": -2.942058563232422,
      "step": 1450
    },
    {
      "epoch": 0.38,
      "grad_norm": 38.79372787475586,
      "kl": 0.0,
      "learning_rate": 3.1012823868097357e-07,
      "logps/chosen": -289.4332275390625,
      "logps/rejected": -180.63380432128906,
      "loss": 0.4345,
      "rewards/chosen": -0.293634831905365,
      "rewards/margins": 0.8389768004417419,
      "rewards/rejected": -1.132611632347107,
      "step": 1451
    },
    {
      "epoch": 0.38,
      "grad_norm": 34.717159271240234,
      "kl": 0.0,
      "learning_rate": 3.0999738288406176e-07,
      "logps/chosen": -122.23812103271484,
      "logps/rejected": -292.23309326171875,
      "loss": 0.2905,
      "rewards/chosen": -0.4397892653942108,
      "rewards/margins": 2.1818888187408447,
      "rewards/rejected": -2.621678113937378,
      "step": 1452
    },
    {
      "epoch": 0.38,
      "grad_norm": 26.660770416259766,
      "kl": 0.0,
      "learning_rate": 3.0986652708714996e-07,
      "logps/chosen": -157.556640625,
      "logps/rejected": -295.0622863769531,
      "loss": 0.2446,
      "rewards/chosen": 1.0527524948120117,
      "rewards/margins": 3.977877616882324,
      "rewards/rejected": -2.9251251220703125,
      "step": 1453
    },
    {
      "epoch": 0.38,
      "grad_norm": 29.619903564453125,
      "kl": 0.0,
      "learning_rate": 3.0973567129023815e-07,
      "logps/chosen": -192.88714599609375,
      "logps/rejected": -202.2433624267578,
      "loss": 0.2015,
      "rewards/chosen": 0.7536472678184509,
      "rewards/margins": 3.491544246673584,
      "rewards/rejected": -2.7378969192504883,
      "step": 1454
    },
    {
      "epoch": 0.38,
      "grad_norm": 34.550262451171875,
      "kl": 0.0,
      "learning_rate": 3.0960481549332635e-07,
      "logps/chosen": -225.03179931640625,
      "logps/rejected": -210.81072998046875,
      "loss": 0.3427,
      "rewards/chosen": 2.0873336791992188,
      "rewards/margins": 4.194190979003906,
      "rewards/rejected": -2.1068575382232666,
      "step": 1455
    },
    {
      "epoch": 0.38,
      "grad_norm": 39.06039047241211,
      "kl": 0.0,
      "learning_rate": 3.0947395969641455e-07,
      "logps/chosen": -235.35470581054688,
      "logps/rejected": -283.54583740234375,
      "loss": 0.2915,
      "rewards/chosen": 0.08544189482927322,
      "rewards/margins": 3.5063464641571045,
      "rewards/rejected": -3.4209046363830566,
      "step": 1456
    },
    {
      "epoch": 0.38,
      "grad_norm": 37.075130462646484,
      "kl": 0.0,
      "learning_rate": 3.0934310389950274e-07,
      "logps/chosen": -140.18020629882812,
      "logps/rejected": -242.32400512695312,
      "loss": 0.3857,
      "rewards/chosen": -0.3754658102989197,
      "rewards/margins": 1.4958202838897705,
      "rewards/rejected": -1.8712860345840454,
      "step": 1457
    },
    {
      "epoch": 0.38,
      "grad_norm": 30.9268741607666,
      "kl": 0.0,
      "learning_rate": 3.0921224810259094e-07,
      "logps/chosen": -211.39682006835938,
      "logps/rejected": -248.64735412597656,
      "loss": 0.3061,
      "rewards/chosen": 0.48306411504745483,
      "rewards/margins": 3.4601712226867676,
      "rewards/rejected": -2.977107048034668,
      "step": 1458
    },
    {
      "epoch": 0.38,
      "grad_norm": 27.856122970581055,
      "kl": 0.0,
      "learning_rate": 3.0908139230567914e-07,
      "logps/chosen": -225.61920166015625,
      "logps/rejected": -236.6941680908203,
      "loss": 0.3685,
      "rewards/chosen": -0.30282920598983765,
      "rewards/margins": 2.0515077114105225,
      "rewards/rejected": -2.354336977005005,
      "step": 1459
    },
    {
      "epoch": 0.38,
      "grad_norm": 29.09127426147461,
      "kl": 0.0,
      "learning_rate": 3.0895053650876733e-07,
      "logps/chosen": -157.025390625,
      "logps/rejected": -247.02685546875,
      "loss": 0.3274,
      "rewards/chosen": 0.052327901124954224,
      "rewards/margins": 2.787691354751587,
      "rewards/rejected": -2.735363483428955,
      "step": 1460
    },
    {
      "epoch": 0.38,
      "grad_norm": 31.07024383544922,
      "kl": 0.0,
      "learning_rate": 3.0881968071185553e-07,
      "logps/chosen": -137.29736328125,
      "logps/rejected": -241.9585723876953,
      "loss": 0.1837,
      "rewards/chosen": 0.11936046183109283,
      "rewards/margins": 3.824509859085083,
      "rewards/rejected": -3.7051494121551514,
      "step": 1461
    },
    {
      "epoch": 0.38,
      "grad_norm": 36.02007293701172,
      "kl": 0.0,
      "learning_rate": 3.0868882491494367e-07,
      "logps/chosen": -166.37362670898438,
      "logps/rejected": -258.8514099121094,
      "loss": 0.1196,
      "rewards/chosen": 1.0134023427963257,
      "rewards/margins": 4.809149742126465,
      "rewards/rejected": -3.7957472801208496,
      "step": 1462
    },
    {
      "epoch": 0.38,
      "grad_norm": 30.613178253173828,
      "kl": 0.0,
      "learning_rate": 3.0855796911803187e-07,
      "logps/chosen": -274.941650390625,
      "logps/rejected": -276.4577941894531,
      "loss": 0.3452,
      "rewards/chosen": 0.25566530227661133,
      "rewards/margins": 5.204866409301758,
      "rewards/rejected": -4.9492011070251465,
      "step": 1463
    },
    {
      "epoch": 0.38,
      "grad_norm": 28.9886417388916,
      "kl": 0.0,
      "learning_rate": 3.084271133211201e-07,
      "logps/chosen": -115.51466369628906,
      "logps/rejected": -223.60037231445312,
      "loss": 0.2394,
      "rewards/chosen": 1.5236237049102783,
      "rewards/margins": 5.302260875701904,
      "rewards/rejected": -3.778637170791626,
      "step": 1464
    },
    {
      "epoch": 0.38,
      "grad_norm": 27.671178817749023,
      "kl": 0.0,
      "learning_rate": 3.082962575242083e-07,
      "logps/chosen": -185.50547790527344,
      "logps/rejected": -286.89459228515625,
      "loss": 0.3222,
      "rewards/chosen": 0.7556943297386169,
      "rewards/margins": 3.5602149963378906,
      "rewards/rejected": -2.804520606994629,
      "step": 1465
    },
    {
      "epoch": 0.38,
      "grad_norm": 29.94826889038086,
      "kl": 0.0,
      "learning_rate": 3.081654017272965e-07,
      "logps/chosen": -247.38006591796875,
      "logps/rejected": -202.25840759277344,
      "loss": 0.1946,
      "rewards/chosen": 1.0844428539276123,
      "rewards/margins": 5.710987091064453,
      "rewards/rejected": -4.626543998718262,
      "step": 1466
    },
    {
      "epoch": 0.38,
      "grad_norm": 25.045530319213867,
      "kl": 0.0,
      "learning_rate": 3.080345459303847e-07,
      "logps/chosen": -220.7410125732422,
      "logps/rejected": -191.14923095703125,
      "loss": 0.3061,
      "rewards/chosen": -1.2042438983917236,
      "rewards/margins": 0.7004872560501099,
      "rewards/rejected": -1.9047311544418335,
      "step": 1467
    },
    {
      "epoch": 0.38,
      "grad_norm": 35.02234649658203,
      "kl": 0.0,
      "learning_rate": 3.079036901334729e-07,
      "logps/chosen": -198.464599609375,
      "logps/rejected": -260.91424560546875,
      "loss": 0.3501,
      "rewards/chosen": 0.9540805220603943,
      "rewards/margins": 3.882875919342041,
      "rewards/rejected": -2.928795337677002,
      "step": 1468
    },
    {
      "epoch": 0.38,
      "grad_norm": 33.84938430786133,
      "kl": 0.0,
      "learning_rate": 3.077728343365611e-07,
      "logps/chosen": -152.50367736816406,
      "logps/rejected": -298.82403564453125,
      "loss": 0.3167,
      "rewards/chosen": -0.22875815629959106,
      "rewards/margins": 4.95879602432251,
      "rewards/rejected": -5.187554359436035,
      "step": 1469
    },
    {
      "epoch": 0.38,
      "grad_norm": 35.08100128173828,
      "kl": 0.0,
      "learning_rate": 3.076419785396493e-07,
      "logps/chosen": -295.23663330078125,
      "logps/rejected": -158.22804260253906,
      "loss": 0.3239,
      "rewards/chosen": 1.505220651626587,
      "rewards/margins": 4.030706405639648,
      "rewards/rejected": -2.5254859924316406,
      "step": 1470
    },
    {
      "epoch": 0.38,
      "grad_norm": 33.56315231323242,
      "kl": 0.0,
      "learning_rate": 3.075111227427375e-07,
      "logps/chosen": -125.35154724121094,
      "logps/rejected": -183.9165802001953,
      "loss": 0.2298,
      "rewards/chosen": 1.0760307312011719,
      "rewards/margins": 4.734567642211914,
      "rewards/rejected": -3.6585371494293213,
      "step": 1471
    },
    {
      "epoch": 0.39,
      "grad_norm": 46.655113220214844,
      "kl": 0.0,
      "learning_rate": 3.073802669458257e-07,
      "logps/chosen": -321.3465576171875,
      "logps/rejected": -221.3088836669922,
      "loss": 0.4142,
      "rewards/chosen": 0.4878466725349426,
      "rewards/margins": 2.085667371749878,
      "rewards/rejected": -1.5978206396102905,
      "step": 1472
    },
    {
      "epoch": 0.39,
      "grad_norm": 35.99465560913086,
      "kl": 0.0,
      "learning_rate": 3.072494111489139e-07,
      "logps/chosen": -208.22398376464844,
      "logps/rejected": -208.73878479003906,
      "loss": 0.2003,
      "rewards/chosen": 1.324711561203003,
      "rewards/margins": 4.5703043937683105,
      "rewards/rejected": -3.2455928325653076,
      "step": 1473
    },
    {
      "epoch": 0.39,
      "grad_norm": 32.40290451049805,
      "kl": 0.0,
      "learning_rate": 3.071185553520021e-07,
      "logps/chosen": -204.934814453125,
      "logps/rejected": -203.62197875976562,
      "loss": 0.3736,
      "rewards/chosen": -0.7168243527412415,
      "rewards/margins": 2.0529723167419434,
      "rewards/rejected": -2.76979660987854,
      "step": 1474
    },
    {
      "epoch": 0.39,
      "grad_norm": 28.98914909362793,
      "kl": 0.0,
      "learning_rate": 3.0698769955509027e-07,
      "logps/chosen": -138.81582641601562,
      "logps/rejected": -237.28030395507812,
      "loss": 0.279,
      "rewards/chosen": 0.6532577276229858,
      "rewards/margins": 3.616654396057129,
      "rewards/rejected": -2.9633965492248535,
      "step": 1475
    },
    {
      "epoch": 0.39,
      "grad_norm": 34.76724624633789,
      "kl": 0.0,
      "learning_rate": 3.068568437581785e-07,
      "logps/chosen": -257.87762451171875,
      "logps/rejected": -243.04725646972656,
      "loss": 0.219,
      "rewards/chosen": 0.8108853697776794,
      "rewards/margins": 3.9558494091033936,
      "rewards/rejected": -3.1449639797210693,
      "step": 1476
    },
    {
      "epoch": 0.39,
      "grad_norm": 22.456140518188477,
      "kl": 0.0,
      "learning_rate": 3.067259879612667e-07,
      "logps/chosen": -159.96438598632812,
      "logps/rejected": -202.48800659179688,
      "loss": 0.2281,
      "rewards/chosen": 0.7046743035316467,
      "rewards/margins": 4.733370780944824,
      "rewards/rejected": -4.028696537017822,
      "step": 1477
    },
    {
      "epoch": 0.39,
      "grad_norm": 28.346275329589844,
      "kl": 0.0,
      "learning_rate": 3.0659513216435486e-07,
      "logps/chosen": -163.13380432128906,
      "logps/rejected": -276.5238952636719,
      "loss": 0.2247,
      "rewards/chosen": 1.4744484424591064,
      "rewards/margins": 5.781862258911133,
      "rewards/rejected": -4.3074140548706055,
      "step": 1478
    },
    {
      "epoch": 0.39,
      "grad_norm": 26.092206954956055,
      "kl": 0.0,
      "learning_rate": 3.0646427636744306e-07,
      "logps/chosen": -268.532470703125,
      "logps/rejected": -272.1234436035156,
      "loss": 0.2369,
      "rewards/chosen": 1.1939743757247925,
      "rewards/margins": 5.351585388183594,
      "rewards/rejected": -4.157610893249512,
      "step": 1479
    },
    {
      "epoch": 0.39,
      "grad_norm": 36.25248718261719,
      "kl": 0.0,
      "learning_rate": 3.0633342057053125e-07,
      "logps/chosen": -230.22296142578125,
      "logps/rejected": -373.62445068359375,
      "loss": 0.2736,
      "rewards/chosen": -1.2396442890167236,
      "rewards/margins": 4.840928077697754,
      "rewards/rejected": -6.080572605133057,
      "step": 1480
    },
    {
      "epoch": 0.39,
      "grad_norm": 30.960378646850586,
      "kl": 0.0,
      "learning_rate": 3.0620256477361945e-07,
      "logps/chosen": -168.9617462158203,
      "logps/rejected": -217.56732177734375,
      "loss": 0.3062,
      "rewards/chosen": 0.2559138834476471,
      "rewards/margins": 3.4393367767333984,
      "rewards/rejected": -3.183422803878784,
      "step": 1481
    },
    {
      "epoch": 0.39,
      "grad_norm": 27.86515998840332,
      "kl": 0.0,
      "learning_rate": 3.0607170897670765e-07,
      "logps/chosen": -219.33834838867188,
      "logps/rejected": -238.18690490722656,
      "loss": 0.1837,
      "rewards/chosen": -0.1820228099822998,
      "rewards/margins": 2.337970733642578,
      "rewards/rejected": -2.519993543624878,
      "step": 1482
    },
    {
      "epoch": 0.39,
      "grad_norm": 30.410737991333008,
      "kl": 0.0,
      "learning_rate": 3.0594085317979584e-07,
      "logps/chosen": -152.96058654785156,
      "logps/rejected": -216.08642578125,
      "loss": 0.2986,
      "rewards/chosen": -0.333772748708725,
      "rewards/margins": 3.3381154537200928,
      "rewards/rejected": -3.6718881130218506,
      "step": 1483
    },
    {
      "epoch": 0.39,
      "grad_norm": 45.52098846435547,
      "kl": 0.0,
      "learning_rate": 3.0580999738288404e-07,
      "logps/chosen": -235.53439331054688,
      "logps/rejected": -242.63815307617188,
      "loss": 0.2718,
      "rewards/chosen": 1.6429567337036133,
      "rewards/margins": 4.876601219177246,
      "rewards/rejected": -3.233644485473633,
      "step": 1484
    },
    {
      "epoch": 0.39,
      "grad_norm": 33.77592849731445,
      "kl": 0.0,
      "learning_rate": 3.0567914158597223e-07,
      "logps/chosen": -222.42430114746094,
      "logps/rejected": -269.167724609375,
      "loss": 0.3382,
      "rewards/chosen": 0.28296148777008057,
      "rewards/margins": 3.6499133110046387,
      "rewards/rejected": -3.3669517040252686,
      "step": 1485
    },
    {
      "epoch": 0.39,
      "grad_norm": 34.32917785644531,
      "kl": 0.0,
      "learning_rate": 3.0554828578906043e-07,
      "logps/chosen": -245.42189025878906,
      "logps/rejected": -234.22384643554688,
      "loss": 0.2435,
      "rewards/chosen": 1.0255693197250366,
      "rewards/margins": 2.8453550338745117,
      "rewards/rejected": -1.8197858333587646,
      "step": 1486
    },
    {
      "epoch": 0.39,
      "grad_norm": 40.217166900634766,
      "kl": 0.0,
      "learning_rate": 3.054174299921486e-07,
      "logps/chosen": -209.5061798095703,
      "logps/rejected": -204.7925262451172,
      "loss": 0.347,
      "rewards/chosen": 0.07800575345754623,
      "rewards/margins": 3.161608934402466,
      "rewards/rejected": -3.0836031436920166,
      "step": 1487
    },
    {
      "epoch": 0.39,
      "grad_norm": 34.6190185546875,
      "kl": 0.0,
      "learning_rate": 3.052865741952368e-07,
      "logps/chosen": -317.90216064453125,
      "logps/rejected": -208.20858764648438,
      "loss": 0.3854,
      "rewards/chosen": -0.6948710680007935,
      "rewards/margins": 1.9840534925460815,
      "rewards/rejected": -2.678924560546875,
      "step": 1488
    },
    {
      "epoch": 0.39,
      "grad_norm": 33.66693115234375,
      "kl": 0.0,
      "learning_rate": 3.0515571839832507e-07,
      "logps/chosen": -131.39944458007812,
      "logps/rejected": -219.7893524169922,
      "loss": 0.272,
      "rewards/chosen": 1.1535712480545044,
      "rewards/margins": 3.9656424522399902,
      "rewards/rejected": -2.8120710849761963,
      "step": 1489
    },
    {
      "epoch": 0.39,
      "grad_norm": 27.822031021118164,
      "kl": 0.0,
      "learning_rate": 3.0502486260141327e-07,
      "logps/chosen": -205.55569458007812,
      "logps/rejected": -261.0909729003906,
      "loss": 0.4186,
      "rewards/chosen": -0.6630886793136597,
      "rewards/margins": 1.5766831636428833,
      "rewards/rejected": -2.239771842956543,
      "step": 1490
    },
    {
      "epoch": 0.39,
      "grad_norm": 43.77429962158203,
      "kl": 0.0,
      "learning_rate": 3.0489400680450146e-07,
      "logps/chosen": -233.81439208984375,
      "logps/rejected": -243.33828735351562,
      "loss": 0.3327,
      "rewards/chosen": -0.9625567197799683,
      "rewards/margins": 4.0890398025512695,
      "rewards/rejected": -5.051596641540527,
      "step": 1491
    },
    {
      "epoch": 0.39,
      "grad_norm": 35.22560501098633,
      "kl": 0.0,
      "learning_rate": 3.0476315100758966e-07,
      "logps/chosen": -270.0503234863281,
      "logps/rejected": -254.5584259033203,
      "loss": 0.2098,
      "rewards/chosen": -0.6624004244804382,
      "rewards/margins": 5.66503381729126,
      "rewards/rejected": -6.327434062957764,
      "step": 1492
    },
    {
      "epoch": 0.39,
      "grad_norm": 38.9492073059082,
      "kl": 0.0,
      "learning_rate": 3.046322952106778e-07,
      "logps/chosen": -220.76625061035156,
      "logps/rejected": -245.36216735839844,
      "loss": 0.3059,
      "rewards/chosen": 0.005003486294299364,
      "rewards/margins": 3.092819929122925,
      "rewards/rejected": -3.0878164768218994,
      "step": 1493
    },
    {
      "epoch": 0.39,
      "grad_norm": 48.36807632446289,
      "kl": 0.0,
      "learning_rate": 3.04501439413766e-07,
      "logps/chosen": -230.12994384765625,
      "logps/rejected": -240.77676391601562,
      "loss": 0.4136,
      "rewards/chosen": 1.2152693271636963,
      "rewards/margins": 1.9342682361602783,
      "rewards/rejected": -0.718998908996582,
      "step": 1494
    },
    {
      "epoch": 0.39,
      "grad_norm": 38.448631286621094,
      "kl": 0.0,
      "learning_rate": 3.043705836168542e-07,
      "logps/chosen": -296.76776123046875,
      "logps/rejected": -231.88865661621094,
      "loss": 0.2302,
      "rewards/chosen": 1.8158690929412842,
      "rewards/margins": 4.516867637634277,
      "rewards/rejected": -2.700998544692993,
      "step": 1495
    },
    {
      "epoch": 0.39,
      "grad_norm": 34.014060974121094,
      "kl": 0.0,
      "learning_rate": 3.042397278199424e-07,
      "logps/chosen": -133.33018493652344,
      "logps/rejected": -316.0521240234375,
      "loss": 0.2851,
      "rewards/chosen": 1.3611581325531006,
      "rewards/margins": 4.80728006362915,
      "rewards/rejected": -3.44612193107605,
      "step": 1496
    },
    {
      "epoch": 0.39,
      "grad_norm": 24.460899353027344,
      "kl": 0.0,
      "learning_rate": 3.041088720230306e-07,
      "logps/chosen": -225.04161071777344,
      "logps/rejected": -238.4369354248047,
      "loss": 0.1244,
      "rewards/chosen": 2.800896406173706,
      "rewards/margins": 6.36253547668457,
      "rewards/rejected": -3.5616390705108643,
      "step": 1497
    },
    {
      "epoch": 0.39,
      "grad_norm": 28.581289291381836,
      "kl": 0.0,
      "learning_rate": 3.039780162261188e-07,
      "logps/chosen": -245.81613159179688,
      "logps/rejected": -208.173095703125,
      "loss": 0.3861,
      "rewards/chosen": -0.5592122673988342,
      "rewards/margins": 2.6268491744995117,
      "rewards/rejected": -3.186061382293701,
      "step": 1498
    },
    {
      "epoch": 0.39,
      "grad_norm": 29.391639709472656,
      "kl": 0.0,
      "learning_rate": 3.03847160429207e-07,
      "logps/chosen": -277.779541015625,
      "logps/rejected": -284.0148620605469,
      "loss": 0.3022,
      "rewards/chosen": -0.8414028882980347,
      "rewards/margins": 1.9396697282791138,
      "rewards/rejected": -2.7810726165771484,
      "step": 1499
    },
    {
      "epoch": 0.39,
      "grad_norm": 40.777618408203125,
      "kl": 0.0,
      "learning_rate": 3.037163046322952e-07,
      "logps/chosen": -203.26284790039062,
      "logps/rejected": -250.16270446777344,
      "loss": 0.3193,
      "rewards/chosen": 0.002718701958656311,
      "rewards/margins": 2.7231438159942627,
      "rewards/rejected": -2.7204251289367676,
      "step": 1500
    },
    {
      "epoch": 0.39,
      "grad_norm": 27.155214309692383,
      "kl": 0.0,
      "learning_rate": 3.0358544883538337e-07,
      "logps/chosen": -213.1066131591797,
      "logps/rejected": -231.6353759765625,
      "loss": 0.1685,
      "rewards/chosen": 0.4949774444103241,
      "rewards/margins": 4.511791229248047,
      "rewards/rejected": -4.0168137550354,
      "step": 1501
    },
    {
      "epoch": 0.39,
      "grad_norm": 29.21578598022461,
      "kl": 0.0,
      "learning_rate": 3.034545930384716e-07,
      "logps/chosen": -184.6173553466797,
      "logps/rejected": -220.74658203125,
      "loss": 0.2779,
      "rewards/chosen": -0.2819909155368805,
      "rewards/margins": 3.6761438846588135,
      "rewards/rejected": -3.958134889602661,
      "step": 1502
    },
    {
      "epoch": 0.39,
      "grad_norm": 38.4527702331543,
      "kl": 0.0,
      "learning_rate": 3.033237372415598e-07,
      "logps/chosen": -261.9404296875,
      "logps/rejected": -197.33087158203125,
      "loss": 0.357,
      "rewards/chosen": 0.11832320690155029,
      "rewards/margins": 2.584242820739746,
      "rewards/rejected": -2.4659194946289062,
      "step": 1503
    },
    {
      "epoch": 0.39,
      "grad_norm": 28.692373275756836,
      "kl": 0.0,
      "learning_rate": 3.03192881444648e-07,
      "logps/chosen": -186.4076690673828,
      "logps/rejected": -177.31590270996094,
      "loss": 0.1979,
      "rewards/chosen": -0.825257420539856,
      "rewards/margins": 2.0939512252807617,
      "rewards/rejected": -2.919208526611328,
      "step": 1504
    },
    {
      "epoch": 0.39,
      "grad_norm": 39.02003479003906,
      "kl": 0.0,
      "learning_rate": 3.030620256477362e-07,
      "logps/chosen": -253.4713134765625,
      "logps/rejected": -160.2345733642578,
      "loss": 0.407,
      "rewards/chosen": -0.8305866122245789,
      "rewards/margins": 0.5052904486656189,
      "rewards/rejected": -1.3358770608901978,
      "step": 1505
    },
    {
      "epoch": 0.39,
      "grad_norm": 41.82636642456055,
      "kl": 0.0,
      "learning_rate": 3.029311698508244e-07,
      "logps/chosen": -75.3058853149414,
      "logps/rejected": -257.3697814941406,
      "loss": 0.2283,
      "rewards/chosen": 1.5601531267166138,
      "rewards/margins": 3.760997772216797,
      "rewards/rejected": -2.2008447647094727,
      "step": 1506
    },
    {
      "epoch": 0.39,
      "grad_norm": 37.105777740478516,
      "kl": 0.0,
      "learning_rate": 3.028003140539126e-07,
      "logps/chosen": -292.6695556640625,
      "logps/rejected": -244.73265075683594,
      "loss": 0.3224,
      "rewards/chosen": -1.939176321029663,
      "rewards/margins": 3.3143680095672607,
      "rewards/rejected": -5.253544330596924,
      "step": 1507
    },
    {
      "epoch": 0.39,
      "grad_norm": 58.40880584716797,
      "kl": 0.0,
      "learning_rate": 3.026694582570008e-07,
      "logps/chosen": -144.26760864257812,
      "logps/rejected": -260.7916259765625,
      "loss": 0.2336,
      "rewards/chosen": 0.3313431739807129,
      "rewards/margins": 4.2500715255737305,
      "rewards/rejected": -3.9187281131744385,
      "step": 1508
    },
    {
      "epoch": 0.39,
      "grad_norm": 36.10960006713867,
      "kl": 0.0,
      "learning_rate": 3.0253860246008894e-07,
      "logps/chosen": -216.76353454589844,
      "logps/rejected": -235.7379150390625,
      "loss": 0.2212,
      "rewards/chosen": 0.9106661677360535,
      "rewards/margins": 4.330042839050293,
      "rewards/rejected": -3.419376850128174,
      "step": 1509
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.00156784057617,
      "kl": 0.0,
      "learning_rate": 3.0240774666317714e-07,
      "logps/chosen": -247.1843719482422,
      "logps/rejected": -234.66162109375,
      "loss": 0.3298,
      "rewards/chosen": 0.6620055437088013,
      "rewards/margins": 2.648651599884033,
      "rewards/rejected": -1.986646056175232,
      "step": 1510
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.013389587402344,
      "kl": 0.0,
      "learning_rate": 3.0227689086626533e-07,
      "logps/chosen": -267.90496826171875,
      "logps/rejected": -251.13446044921875,
      "loss": 0.2749,
      "rewards/chosen": 1.9397257566452026,
      "rewards/margins": 5.9958577156066895,
      "rewards/rejected": -4.056131839752197,
      "step": 1511
    },
    {
      "epoch": 0.4,
      "grad_norm": 24.649215698242188,
      "kl": 0.0,
      "learning_rate": 3.0214603506935353e-07,
      "logps/chosen": -128.24273681640625,
      "logps/rejected": -199.8554229736328,
      "loss": 0.247,
      "rewards/chosen": 0.5000770688056946,
      "rewards/margins": 4.609408378601074,
      "rewards/rejected": -4.109331130981445,
      "step": 1512
    },
    {
      "epoch": 0.4,
      "grad_norm": 49.26161193847656,
      "kl": 0.0,
      "learning_rate": 3.020151792724417e-07,
      "logps/chosen": -228.639892578125,
      "logps/rejected": -158.06222534179688,
      "loss": 0.4441,
      "rewards/chosen": -0.19630053639411926,
      "rewards/margins": 1.6684128046035767,
      "rewards/rejected": -1.8647133111953735,
      "step": 1513
    },
    {
      "epoch": 0.4,
      "grad_norm": 34.833702087402344,
      "kl": 0.0,
      "learning_rate": 3.018843234755299e-07,
      "logps/chosen": -253.34954833984375,
      "logps/rejected": -217.84603881835938,
      "loss": 0.4131,
      "rewards/chosen": -1.3816742897033691,
      "rewards/margins": 1.0617821216583252,
      "rewards/rejected": -2.4434564113616943,
      "step": 1514
    },
    {
      "epoch": 0.4,
      "grad_norm": 28.9056396484375,
      "kl": 0.0,
      "learning_rate": 3.0175346767861817e-07,
      "logps/chosen": -172.053955078125,
      "logps/rejected": -256.8983154296875,
      "loss": 0.2978,
      "rewards/chosen": -1.2212709188461304,
      "rewards/margins": 1.9896234273910522,
      "rewards/rejected": -3.2108943462371826,
      "step": 1515
    },
    {
      "epoch": 0.4,
      "grad_norm": 28.236330032348633,
      "kl": 0.0,
      "learning_rate": 3.0162261188170637e-07,
      "logps/chosen": -263.5512390136719,
      "logps/rejected": -252.2602996826172,
      "loss": 0.2911,
      "rewards/chosen": -0.8032323718070984,
      "rewards/margins": 2.6989030838012695,
      "rewards/rejected": -3.5021355152130127,
      "step": 1516
    },
    {
      "epoch": 0.4,
      "grad_norm": 27.29069709777832,
      "kl": 0.0,
      "learning_rate": 3.0149175608479456e-07,
      "logps/chosen": -195.29360961914062,
      "logps/rejected": -219.7592010498047,
      "loss": 0.2787,
      "rewards/chosen": -0.900688648223877,
      "rewards/margins": 3.03352952003479,
      "rewards/rejected": -3.934218168258667,
      "step": 1517
    },
    {
      "epoch": 0.4,
      "grad_norm": 37.780662536621094,
      "kl": 0.0,
      "learning_rate": 3.0136090028788276e-07,
      "logps/chosen": -224.90914916992188,
      "logps/rejected": -257.0303955078125,
      "loss": 0.3451,
      "rewards/chosen": -0.3784347176551819,
      "rewards/margins": 4.28721284866333,
      "rewards/rejected": -4.665647506713867,
      "step": 1518
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.100135803222656,
      "kl": 0.0,
      "learning_rate": 3.0123004449097095e-07,
      "logps/chosen": -210.57568359375,
      "logps/rejected": -216.138916015625,
      "loss": 0.3389,
      "rewards/chosen": 0.2311960607767105,
      "rewards/margins": 3.171760320663452,
      "rewards/rejected": -2.9405641555786133,
      "step": 1519
    },
    {
      "epoch": 0.4,
      "grad_norm": 29.088823318481445,
      "kl": 0.0,
      "learning_rate": 3.0109918869405915e-07,
      "logps/chosen": -164.3828125,
      "logps/rejected": -214.0646514892578,
      "loss": 0.2862,
      "rewards/chosen": 0.353228896856308,
      "rewards/margins": 3.9613986015319824,
      "rewards/rejected": -3.6081697940826416,
      "step": 1520
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.2971305847168,
      "kl": 0.0,
      "learning_rate": 3.0096833289714735e-07,
      "logps/chosen": -220.14793395996094,
      "logps/rejected": -321.53192138671875,
      "loss": 0.2287,
      "rewards/chosen": 0.3785732686519623,
      "rewards/margins": 5.689975738525391,
      "rewards/rejected": -5.311402320861816,
      "step": 1521
    },
    {
      "epoch": 0.4,
      "grad_norm": 42.90241622924805,
      "kl": 0.0,
      "learning_rate": 3.0083747710023554e-07,
      "logps/chosen": -195.2041778564453,
      "logps/rejected": -241.295654296875,
      "loss": 0.2787,
      "rewards/chosen": 1.4876168966293335,
      "rewards/margins": 4.329094409942627,
      "rewards/rejected": -2.841477632522583,
      "step": 1522
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.62684631347656,
      "kl": 0.0,
      "learning_rate": 3.0070662130332374e-07,
      "logps/chosen": -272.9668273925781,
      "logps/rejected": -261.4366760253906,
      "loss": 0.3369,
      "rewards/chosen": -0.7848519086837769,
      "rewards/margins": 3.7503604888916016,
      "rewards/rejected": -4.535212516784668,
      "step": 1523
    },
    {
      "epoch": 0.4,
      "grad_norm": 34.81737518310547,
      "kl": 0.0,
      "learning_rate": 3.005757655064119e-07,
      "logps/chosen": -173.27880859375,
      "logps/rejected": -293.0011901855469,
      "loss": 0.3702,
      "rewards/chosen": -0.3220140337944031,
      "rewards/margins": 5.201068878173828,
      "rewards/rejected": -5.523082733154297,
      "step": 1524
    },
    {
      "epoch": 0.4,
      "grad_norm": 39.21683120727539,
      "kl": 0.0,
      "learning_rate": 3.004449097095001e-07,
      "logps/chosen": -164.2547607421875,
      "logps/rejected": -258.0382080078125,
      "loss": 0.3019,
      "rewards/chosen": 0.24068419635295868,
      "rewards/margins": 2.7888166904449463,
      "rewards/rejected": -2.5481324195861816,
      "step": 1525
    },
    {
      "epoch": 0.4,
      "grad_norm": 25.111703872680664,
      "kl": 0.0,
      "learning_rate": 3.0031405391258827e-07,
      "logps/chosen": -181.2828369140625,
      "logps/rejected": -229.8302001953125,
      "loss": 0.1429,
      "rewards/chosen": 3.282607316970825,
      "rewards/margins": 7.605630874633789,
      "rewards/rejected": -4.323023796081543,
      "step": 1526
    },
    {
      "epoch": 0.4,
      "grad_norm": 31.849605560302734,
      "kl": 0.0,
      "learning_rate": 3.0018319811567647e-07,
      "logps/chosen": -239.89051818847656,
      "logps/rejected": -295.36663818359375,
      "loss": 0.397,
      "rewards/chosen": -0.9494417905807495,
      "rewards/margins": 1.8495646715164185,
      "rewards/rejected": -2.799006462097168,
      "step": 1527
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.73810958862305,
      "kl": 0.0,
      "learning_rate": 3.000523423187647e-07,
      "logps/chosen": -194.4100341796875,
      "logps/rejected": -226.74893188476562,
      "loss": 0.3235,
      "rewards/chosen": 0.38581228256225586,
      "rewards/margins": 3.156494140625,
      "rewards/rejected": -2.770681858062744,
      "step": 1528
    },
    {
      "epoch": 0.4,
      "grad_norm": 34.644859313964844,
      "kl": 0.0,
      "learning_rate": 2.999214865218529e-07,
      "logps/chosen": -202.15171813964844,
      "logps/rejected": -245.19308471679688,
      "loss": 0.2077,
      "rewards/chosen": 1.9559669494628906,
      "rewards/margins": 4.376315116882324,
      "rewards/rejected": -2.4203479290008545,
      "step": 1529
    },
    {
      "epoch": 0.4,
      "grad_norm": 34.94104766845703,
      "kl": 0.0,
      "learning_rate": 2.997906307249411e-07,
      "logps/chosen": -188.9501190185547,
      "logps/rejected": -271.6665344238281,
      "loss": 0.3114,
      "rewards/chosen": -0.6005575060844421,
      "rewards/margins": 3.01674485206604,
      "rewards/rejected": -3.617302417755127,
      "step": 1530
    },
    {
      "epoch": 0.4,
      "grad_norm": 38.38316345214844,
      "kl": 0.0,
      "learning_rate": 2.996597749280293e-07,
      "logps/chosen": -214.96990966796875,
      "logps/rejected": -289.9461975097656,
      "loss": 0.3475,
      "rewards/chosen": -0.06996187567710876,
      "rewards/margins": 2.5924227237701416,
      "rewards/rejected": -2.662384510040283,
      "step": 1531
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.79578399658203,
      "kl": 0.0,
      "learning_rate": 2.995289191311175e-07,
      "logps/chosen": -281.37237548828125,
      "logps/rejected": -356.375,
      "loss": 0.3213,
      "rewards/chosen": 0.9624449014663696,
      "rewards/margins": 6.715054988861084,
      "rewards/rejected": -5.752610206604004,
      "step": 1532
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.78695297241211,
      "kl": 0.0,
      "learning_rate": 2.993980633342057e-07,
      "logps/chosen": -208.66139221191406,
      "logps/rejected": -307.55810546875,
      "loss": 0.1853,
      "rewards/chosen": 2.171360492706299,
      "rewards/margins": 4.887650489807129,
      "rewards/rejected": -2.71628999710083,
      "step": 1533
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.36848831176758,
      "kl": 0.0,
      "learning_rate": 2.992672075372939e-07,
      "logps/chosen": -276.29547119140625,
      "logps/rejected": -322.4742431640625,
      "loss": 0.2057,
      "rewards/chosen": 1.1996976137161255,
      "rewards/margins": 4.880166053771973,
      "rewards/rejected": -3.6804683208465576,
      "step": 1534
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.62516784667969,
      "kl": 0.0,
      "learning_rate": 2.991363517403821e-07,
      "logps/chosen": -186.53976440429688,
      "logps/rejected": -294.9364013671875,
      "loss": 0.3682,
      "rewards/chosen": -0.6345914006233215,
      "rewards/margins": 3.435218572616577,
      "rewards/rejected": -4.069809913635254,
      "step": 1535
    },
    {
      "epoch": 0.4,
      "grad_norm": 35.93933868408203,
      "kl": 0.0,
      "learning_rate": 2.990054959434703e-07,
      "logps/chosen": -292.94549560546875,
      "logps/rejected": -264.8148498535156,
      "loss": 0.2498,
      "rewards/chosen": 1.626547932624817,
      "rewards/margins": 4.3725175857543945,
      "rewards/rejected": -2.745969772338867,
      "step": 1536
    },
    {
      "epoch": 0.4,
      "grad_norm": 66.67236328125,
      "kl": 0.0,
      "learning_rate": 2.988746401465585e-07,
      "logps/chosen": -252.86598205566406,
      "logps/rejected": -177.5896759033203,
      "loss": 0.3376,
      "rewards/chosen": -0.02513676881790161,
      "rewards/margins": 2.472538471221924,
      "rewards/rejected": -2.4976751804351807,
      "step": 1537
    },
    {
      "epoch": 0.4,
      "grad_norm": 32.29557418823242,
      "kl": 0.0,
      "learning_rate": 2.987437843496467e-07,
      "logps/chosen": -196.1752166748047,
      "logps/rejected": -275.33575439453125,
      "loss": 0.2844,
      "rewards/chosen": 1.8063740730285645,
      "rewards/margins": 4.412812232971191,
      "rewards/rejected": -2.606438398361206,
      "step": 1538
    },
    {
      "epoch": 0.4,
      "grad_norm": 29.864439010620117,
      "kl": 0.0,
      "learning_rate": 2.986129285527349e-07,
      "logps/chosen": -245.36810302734375,
      "logps/rejected": -219.60891723632812,
      "loss": 0.2431,
      "rewards/chosen": 1.5060614347457886,
      "rewards/margins": 5.396974086761475,
      "rewards/rejected": -3.8909127712249756,
      "step": 1539
    },
    {
      "epoch": 0.4,
      "grad_norm": 34.409236907958984,
      "kl": 0.0,
      "learning_rate": 2.98482072755823e-07,
      "logps/chosen": -183.43905639648438,
      "logps/rejected": -157.3790740966797,
      "loss": 0.2875,
      "rewards/chosen": 1.2466129064559937,
      "rewards/margins": 2.63576078414917,
      "rewards/rejected": -1.3891478776931763,
      "step": 1540
    },
    {
      "epoch": 0.4,
      "grad_norm": 46.68579864501953,
      "kl": 0.0,
      "learning_rate": 2.9835121695891127e-07,
      "logps/chosen": -272.5766906738281,
      "logps/rejected": -255.7216339111328,
      "loss": 0.2431,
      "rewards/chosen": 0.9345579147338867,
      "rewards/margins": 3.078080892562866,
      "rewards/rejected": -2.1435229778289795,
      "step": 1541
    },
    {
      "epoch": 0.4,
      "grad_norm": 24.752788543701172,
      "kl": 0.0,
      "learning_rate": 2.9822036116199946e-07,
      "logps/chosen": -180.98768615722656,
      "logps/rejected": -276.1407775878906,
      "loss": 0.2213,
      "rewards/chosen": 0.8837758302688599,
      "rewards/margins": 4.952155113220215,
      "rewards/rejected": -4.0683794021606445,
      "step": 1542
    },
    {
      "epoch": 0.4,
      "grad_norm": 57.56275939941406,
      "kl": 0.0,
      "learning_rate": 2.9808950536508766e-07,
      "logps/chosen": -176.87188720703125,
      "logps/rejected": -266.2235107421875,
      "loss": 0.3233,
      "rewards/chosen": -0.016633659601211548,
      "rewards/margins": 3.9170687198638916,
      "rewards/rejected": -3.9337024688720703,
      "step": 1543
    },
    {
      "epoch": 0.4,
      "grad_norm": 32.71670150756836,
      "kl": 0.0,
      "learning_rate": 2.9795864956817586e-07,
      "logps/chosen": -222.072265625,
      "logps/rejected": -275.0723876953125,
      "loss": 0.297,
      "rewards/chosen": 0.750746488571167,
      "rewards/margins": 7.154358863830566,
      "rewards/rejected": -6.40361213684082,
      "step": 1544
    },
    {
      "epoch": 0.4,
      "grad_norm": 33.7441520690918,
      "kl": 0.0,
      "learning_rate": 2.9782779377126405e-07,
      "logps/chosen": -183.69985961914062,
      "logps/rejected": -291.5790100097656,
      "loss": 0.2655,
      "rewards/chosen": -0.3251463770866394,
      "rewards/margins": 3.752892255783081,
      "rewards/rejected": -4.078038692474365,
      "step": 1545
    },
    {
      "epoch": 0.4,
      "grad_norm": 26.29779815673828,
      "kl": 0.0,
      "learning_rate": 2.9769693797435225e-07,
      "logps/chosen": -258.9979248046875,
      "logps/rejected": -377.3835144042969,
      "loss": 0.2908,
      "rewards/chosen": 0.3104372024536133,
      "rewards/margins": 4.86877965927124,
      "rewards/rejected": -4.558342456817627,
      "step": 1546
    },
    {
      "epoch": 0.4,
      "grad_norm": 27.509925842285156,
      "kl": 0.0,
      "learning_rate": 2.9756608217744044e-07,
      "logps/chosen": -262.73876953125,
      "logps/rejected": -230.32789611816406,
      "loss": 0.2273,
      "rewards/chosen": -1.16804039478302,
      "rewards/margins": 2.2382097244262695,
      "rewards/rejected": -3.406250238418579,
      "step": 1547
    },
    {
      "epoch": 0.41,
      "grad_norm": 40.13040542602539,
      "kl": 0.0,
      "learning_rate": 2.9743522638052864e-07,
      "logps/chosen": -296.4833984375,
      "logps/rejected": -290.46551513671875,
      "loss": 0.2746,
      "rewards/chosen": 0.9714206457138062,
      "rewards/margins": 4.574386119842529,
      "rewards/rejected": -3.6029653549194336,
      "step": 1548
    },
    {
      "epoch": 0.41,
      "grad_norm": 38.02606201171875,
      "kl": 0.0,
      "learning_rate": 2.9730437058361684e-07,
      "logps/chosen": -194.1744384765625,
      "logps/rejected": -297.8863830566406,
      "loss": 0.2987,
      "rewards/chosen": 0.05778183043003082,
      "rewards/margins": 4.536423206329346,
      "rewards/rejected": -4.478641510009766,
      "step": 1549
    },
    {
      "epoch": 0.41,
      "grad_norm": 43.55545425415039,
      "kl": 0.0,
      "learning_rate": 2.9717351478670503e-07,
      "logps/chosen": -299.7167053222656,
      "logps/rejected": -213.52081298828125,
      "loss": 0.3064,
      "rewards/chosen": 0.4842572510242462,
      "rewards/margins": 3.815340042114258,
      "rewards/rejected": -3.331082820892334,
      "step": 1550
    },
    {
      "epoch": 0.41,
      "grad_norm": 32.664939880371094,
      "kl": 0.0,
      "learning_rate": 2.9704265898979323e-07,
      "logps/chosen": -279.3871765136719,
      "logps/rejected": -205.69642639160156,
      "loss": 0.2493,
      "rewards/chosen": 0.3699782192707062,
      "rewards/margins": 3.5103328227996826,
      "rewards/rejected": -3.140354633331299,
      "step": 1551
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.73197555541992,
      "kl": 0.0,
      "learning_rate": 2.969118031928814e-07,
      "logps/chosen": -270.1496276855469,
      "logps/rejected": -314.9805908203125,
      "loss": 0.3478,
      "rewards/chosen": 0.297321081161499,
      "rewards/margins": 4.266029357910156,
      "rewards/rejected": -3.9687085151672363,
      "step": 1552
    },
    {
      "epoch": 0.41,
      "grad_norm": 38.26441192626953,
      "kl": 0.0,
      "learning_rate": 2.967809473959697e-07,
      "logps/chosen": -146.8294219970703,
      "logps/rejected": -241.314697265625,
      "loss": 0.223,
      "rewards/chosen": 1.411135196685791,
      "rewards/margins": 5.363457679748535,
      "rewards/rejected": -3.9523227214813232,
      "step": 1553
    },
    {
      "epoch": 0.41,
      "grad_norm": 41.40995407104492,
      "kl": 0.0,
      "learning_rate": 2.9665009159905787e-07,
      "logps/chosen": -245.00399780273438,
      "logps/rejected": -275.0325012207031,
      "loss": 0.2849,
      "rewards/chosen": 0.1878739595413208,
      "rewards/margins": 3.4663233757019043,
      "rewards/rejected": -3.278449296951294,
      "step": 1554
    },
    {
      "epoch": 0.41,
      "grad_norm": 42.86524963378906,
      "kl": 0.0,
      "learning_rate": 2.96519235802146e-07,
      "logps/chosen": -219.91879272460938,
      "logps/rejected": -148.5173797607422,
      "loss": 0.359,
      "rewards/chosen": -0.5962321162223816,
      "rewards/margins": 1.0270888805389404,
      "rewards/rejected": -1.6233209371566772,
      "step": 1555
    },
    {
      "epoch": 0.41,
      "grad_norm": 39.4825553894043,
      "kl": 0.0,
      "learning_rate": 2.963883800052342e-07,
      "logps/chosen": -149.03903198242188,
      "logps/rejected": -281.20672607421875,
      "loss": 0.1473,
      "rewards/chosen": 2.119748830795288,
      "rewards/margins": 4.9517822265625,
      "rewards/rejected": -2.832033157348633,
      "step": 1556
    },
    {
      "epoch": 0.41,
      "grad_norm": 28.854963302612305,
      "kl": 0.0,
      "learning_rate": 2.962575242083224e-07,
      "logps/chosen": -158.43087768554688,
      "logps/rejected": -260.1464538574219,
      "loss": 0.2625,
      "rewards/chosen": -0.2116086781024933,
      "rewards/margins": 2.700449228286743,
      "rewards/rejected": -2.912057876586914,
      "step": 1557
    },
    {
      "epoch": 0.41,
      "grad_norm": 35.32528305053711,
      "kl": 0.0,
      "learning_rate": 2.961266684114106e-07,
      "logps/chosen": -278.7086486816406,
      "logps/rejected": -288.8902587890625,
      "loss": 0.2674,
      "rewards/chosen": 1.1448341608047485,
      "rewards/margins": 5.730721473693848,
      "rewards/rejected": -4.585887432098389,
      "step": 1558
    },
    {
      "epoch": 0.41,
      "grad_norm": 25.97184944152832,
      "kl": 0.0,
      "learning_rate": 2.959958126144988e-07,
      "logps/chosen": -300.5465087890625,
      "logps/rejected": -219.9793701171875,
      "loss": 0.3383,
      "rewards/chosen": 0.6771482229232788,
      "rewards/margins": 4.751331806182861,
      "rewards/rejected": -4.074183464050293,
      "step": 1559
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.6771240234375,
      "kl": 0.0,
      "learning_rate": 2.95864956817587e-07,
      "logps/chosen": -193.79806518554688,
      "logps/rejected": -223.5035400390625,
      "loss": 0.3794,
      "rewards/chosen": -0.3675355315208435,
      "rewards/margins": 1.791252613067627,
      "rewards/rejected": -2.1587882041931152,
      "step": 1560
    },
    {
      "epoch": 0.41,
      "grad_norm": 41.60268783569336,
      "kl": 0.0,
      "learning_rate": 2.957341010206752e-07,
      "logps/chosen": -245.3753662109375,
      "logps/rejected": -261.7906799316406,
      "loss": 0.3893,
      "rewards/chosen": -0.6306812763214111,
      "rewards/margins": 1.5200133323669434,
      "rewards/rejected": -2.1506946086883545,
      "step": 1561
    },
    {
      "epoch": 0.41,
      "grad_norm": 36.107933044433594,
      "kl": 0.0,
      "learning_rate": 2.956032452237634e-07,
      "logps/chosen": -274.00250244140625,
      "logps/rejected": -252.59808349609375,
      "loss": 0.4207,
      "rewards/chosen": -0.3567908704280853,
      "rewards/margins": 3.2441518306732178,
      "rewards/rejected": -3.600942611694336,
      "step": 1562
    },
    {
      "epoch": 0.41,
      "grad_norm": 30.820205688476562,
      "kl": 0.0,
      "learning_rate": 2.954723894268516e-07,
      "logps/chosen": -361.6520080566406,
      "logps/rejected": -216.23590087890625,
      "loss": 0.3088,
      "rewards/chosen": -2.397629976272583,
      "rewards/margins": 1.9007060527801514,
      "rewards/rejected": -4.298336029052734,
      "step": 1563
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.62519073486328,
      "kl": 0.0,
      "learning_rate": 2.953415336299398e-07,
      "logps/chosen": -258.6853332519531,
      "logps/rejected": -197.72561645507812,
      "loss": 0.3942,
      "rewards/chosen": -0.9516178965568542,
      "rewards/margins": 2.312492609024048,
      "rewards/rejected": -3.264110565185547,
      "step": 1564
    },
    {
      "epoch": 0.41,
      "grad_norm": 36.56438446044922,
      "kl": 0.0,
      "learning_rate": 2.95210677833028e-07,
      "logps/chosen": -248.93716430664062,
      "logps/rejected": -186.44081115722656,
      "loss": 0.2179,
      "rewards/chosen": 1.0057936906814575,
      "rewards/margins": 3.8947253227233887,
      "rewards/rejected": -2.8889315128326416,
      "step": 1565
    },
    {
      "epoch": 0.41,
      "grad_norm": 35.079612731933594,
      "kl": 0.0,
      "learning_rate": 2.950798220361162e-07,
      "logps/chosen": -300.0610046386719,
      "logps/rejected": -149.28273010253906,
      "loss": 0.2715,
      "rewards/chosen": -0.047494616359472275,
      "rewards/margins": 2.8899199962615967,
      "rewards/rejected": -2.9374146461486816,
      "step": 1566
    },
    {
      "epoch": 0.41,
      "grad_norm": 37.34353256225586,
      "kl": 0.0,
      "learning_rate": 2.949489662392044e-07,
      "logps/chosen": -259.10064697265625,
      "logps/rejected": -260.9805603027344,
      "loss": 0.2721,
      "rewards/chosen": 0.1995670050382614,
      "rewards/margins": 4.065165996551514,
      "rewards/rejected": -3.8655989170074463,
      "step": 1567
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.23500442504883,
      "kl": 0.0,
      "learning_rate": 2.948181104422926e-07,
      "logps/chosen": -254.81307983398438,
      "logps/rejected": -244.6279754638672,
      "loss": 0.3378,
      "rewards/chosen": -0.6847414970397949,
      "rewards/margins": 1.1335030794143677,
      "rewards/rejected": -1.8182445764541626,
      "step": 1568
    },
    {
      "epoch": 0.41,
      "grad_norm": 33.09608459472656,
      "kl": 0.0,
      "learning_rate": 2.946872546453808e-07,
      "logps/chosen": -192.080810546875,
      "logps/rejected": -347.98486328125,
      "loss": 0.2522,
      "rewards/chosen": 0.757779598236084,
      "rewards/margins": 4.43095588684082,
      "rewards/rejected": -3.6731760501861572,
      "step": 1569
    },
    {
      "epoch": 0.41,
      "grad_norm": 35.08234786987305,
      "kl": 0.0,
      "learning_rate": 2.94556398848469e-07,
      "logps/chosen": -202.5784912109375,
      "logps/rejected": -241.56146240234375,
      "loss": 0.2955,
      "rewards/chosen": 0.7428146600723267,
      "rewards/margins": 3.106171131134033,
      "rewards/rejected": -2.363356590270996,
      "step": 1570
    },
    {
      "epoch": 0.41,
      "grad_norm": 39.55061721801758,
      "kl": 0.0,
      "learning_rate": 2.9442554305155715e-07,
      "logps/chosen": -203.7303924560547,
      "logps/rejected": -218.20925903320312,
      "loss": 0.3215,
      "rewards/chosen": -0.3977167308330536,
      "rewards/margins": 3.4592111110687256,
      "rewards/rejected": -3.8569278717041016,
      "step": 1571
    },
    {
      "epoch": 0.41,
      "grad_norm": 32.61384963989258,
      "kl": 0.0,
      "learning_rate": 2.9429468725464535e-07,
      "logps/chosen": -154.0523223876953,
      "logps/rejected": -324.3094787597656,
      "loss": 0.2933,
      "rewards/chosen": -0.45588475465774536,
      "rewards/margins": 2.8898463249206543,
      "rewards/rejected": -3.345731019973755,
      "step": 1572
    },
    {
      "epoch": 0.41,
      "grad_norm": 33.56401824951172,
      "kl": 0.0,
      "learning_rate": 2.9416383145773354e-07,
      "logps/chosen": -252.0158233642578,
      "logps/rejected": -242.02838134765625,
      "loss": 0.2804,
      "rewards/chosen": 0.04096754267811775,
      "rewards/margins": 4.675113201141357,
      "rewards/rejected": -4.634145736694336,
      "step": 1573
    },
    {
      "epoch": 0.41,
      "grad_norm": 29.832853317260742,
      "kl": 0.0,
      "learning_rate": 2.9403297566082174e-07,
      "logps/chosen": -174.89089965820312,
      "logps/rejected": -227.717529296875,
      "loss": 0.2085,
      "rewards/chosen": 0.28123578429222107,
      "rewards/margins": 4.435916900634766,
      "rewards/rejected": -4.154681205749512,
      "step": 1574
    },
    {
      "epoch": 0.41,
      "grad_norm": 51.532833099365234,
      "kl": 0.0,
      "learning_rate": 2.9390211986390993e-07,
      "logps/chosen": -189.41305541992188,
      "logps/rejected": -229.99093627929688,
      "loss": 0.254,
      "rewards/chosen": 0.8586848974227905,
      "rewards/margins": 4.504140853881836,
      "rewards/rejected": -3.645456075668335,
      "step": 1575
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.82904052734375,
      "kl": 0.0,
      "learning_rate": 2.9377126406699813e-07,
      "logps/chosen": -235.12171936035156,
      "logps/rejected": -253.74807739257812,
      "loss": 0.2706,
      "rewards/chosen": 1.0586671829223633,
      "rewards/margins": 3.163533926010132,
      "rewards/rejected": -2.1048667430877686,
      "step": 1576
    },
    {
      "epoch": 0.41,
      "grad_norm": 39.55526351928711,
      "kl": 0.0,
      "learning_rate": 2.9364040827008633e-07,
      "logps/chosen": -207.1605224609375,
      "logps/rejected": -192.63937377929688,
      "loss": 0.3382,
      "rewards/chosen": 0.6457839012145996,
      "rewards/margins": 2.901211738586426,
      "rewards/rejected": -2.255427837371826,
      "step": 1577
    },
    {
      "epoch": 0.41,
      "grad_norm": 33.488189697265625,
      "kl": 0.0,
      "learning_rate": 2.935095524731745e-07,
      "logps/chosen": -185.49661254882812,
      "logps/rejected": -205.3595428466797,
      "loss": 0.2738,
      "rewards/chosen": 0.4008524715900421,
      "rewards/margins": 2.8835244178771973,
      "rewards/rejected": -2.4826719760894775,
      "step": 1578
    },
    {
      "epoch": 0.41,
      "grad_norm": 29.02250862121582,
      "kl": 0.0,
      "learning_rate": 2.9337869667626277e-07,
      "logps/chosen": -157.56988525390625,
      "logps/rejected": -229.55596923828125,
      "loss": 0.2364,
      "rewards/chosen": 0.9958530068397522,
      "rewards/margins": 5.4190287590026855,
      "rewards/rejected": -4.423175811767578,
      "step": 1579
    },
    {
      "epoch": 0.41,
      "grad_norm": 27.68988037109375,
      "kl": 0.0,
      "learning_rate": 2.9324784087935097e-07,
      "logps/chosen": -243.2689666748047,
      "logps/rejected": -219.78640747070312,
      "loss": 0.1673,
      "rewards/chosen": 1.2246966361999512,
      "rewards/margins": 4.660741806030273,
      "rewards/rejected": -3.436044931411743,
      "step": 1580
    },
    {
      "epoch": 0.41,
      "grad_norm": 34.465538024902344,
      "kl": 0.0,
      "learning_rate": 2.9311698508243916e-07,
      "logps/chosen": -306.2691345214844,
      "logps/rejected": -232.6453094482422,
      "loss": 0.2769,
      "rewards/chosen": 1.0992822647094727,
      "rewards/margins": 5.033297061920166,
      "rewards/rejected": -3.9340147972106934,
      "step": 1581
    },
    {
      "epoch": 0.41,
      "grad_norm": 37.50482940673828,
      "kl": 0.0,
      "learning_rate": 2.9298612928552736e-07,
      "logps/chosen": -142.67239379882812,
      "logps/rejected": -199.15847778320312,
      "loss": 0.3397,
      "rewards/chosen": -0.2452094852924347,
      "rewards/margins": 2.48954439163208,
      "rewards/rejected": -2.7347538471221924,
      "step": 1582
    },
    {
      "epoch": 0.41,
      "grad_norm": 32.987525939941406,
      "kl": 0.0,
      "learning_rate": 2.9285527348861556e-07,
      "logps/chosen": -178.70310974121094,
      "logps/rejected": -236.485107421875,
      "loss": 0.2765,
      "rewards/chosen": 0.8258715867996216,
      "rewards/margins": 4.397680759429932,
      "rewards/rejected": -3.5718092918395996,
      "step": 1583
    },
    {
      "epoch": 0.41,
      "grad_norm": 29.570829391479492,
      "kl": 0.0,
      "learning_rate": 2.9272441769170375e-07,
      "logps/chosen": -214.9452362060547,
      "logps/rejected": -255.57749938964844,
      "loss": 0.3128,
      "rewards/chosen": -0.2915063500404358,
      "rewards/margins": 2.860386848449707,
      "rewards/rejected": -3.151893138885498,
      "step": 1584
    },
    {
      "epoch": 0.41,
      "grad_norm": 31.92060089111328,
      "kl": 0.0,
      "learning_rate": 2.9259356189479195e-07,
      "logps/chosen": -207.015625,
      "logps/rejected": -247.09225463867188,
      "loss": 0.2024,
      "rewards/chosen": 1.528673529624939,
      "rewards/margins": 7.904787540435791,
      "rewards/rejected": -6.3761138916015625,
      "step": 1585
    },
    {
      "epoch": 0.42,
      "grad_norm": 31.320079803466797,
      "kl": 0.0,
      "learning_rate": 2.924627060978801e-07,
      "logps/chosen": -304.74462890625,
      "logps/rejected": -261.03466796875,
      "loss": 0.2907,
      "rewards/chosen": 1.158564567565918,
      "rewards/margins": 4.468474388122559,
      "rewards/rejected": -3.3099098205566406,
      "step": 1586
    },
    {
      "epoch": 0.42,
      "grad_norm": 32.51560592651367,
      "kl": 0.0,
      "learning_rate": 2.923318503009683e-07,
      "logps/chosen": -239.3182373046875,
      "logps/rejected": -222.99497985839844,
      "loss": 0.1696,
      "rewards/chosen": 0.8448255658149719,
      "rewards/margins": 4.6665825843811035,
      "rewards/rejected": -3.8217570781707764,
      "step": 1587
    },
    {
      "epoch": 0.42,
      "grad_norm": 30.755525588989258,
      "kl": 0.0,
      "learning_rate": 2.922009945040565e-07,
      "logps/chosen": -154.37701416015625,
      "logps/rejected": -225.18515014648438,
      "loss": 0.183,
      "rewards/chosen": -1.5603611469268799,
      "rewards/margins": 2.311732530593872,
      "rewards/rejected": -3.872093677520752,
      "step": 1588
    },
    {
      "epoch": 0.42,
      "grad_norm": 27.795978546142578,
      "kl": 0.0,
      "learning_rate": 2.920701387071447e-07,
      "logps/chosen": -164.93106079101562,
      "logps/rejected": -216.1005859375,
      "loss": 0.3562,
      "rewards/chosen": 0.017641305923461914,
      "rewards/margins": 3.949174165725708,
      "rewards/rejected": -3.931532859802246,
      "step": 1589
    },
    {
      "epoch": 0.42,
      "grad_norm": 32.155128479003906,
      "kl": 0.0,
      "learning_rate": 2.919392829102329e-07,
      "logps/chosen": -161.41615295410156,
      "logps/rejected": -250.1803436279297,
      "loss": 0.4051,
      "rewards/chosen": -0.46750572323799133,
      "rewards/margins": 2.207667350769043,
      "rewards/rejected": -2.675173044204712,
      "step": 1590
    },
    {
      "epoch": 0.42,
      "grad_norm": 32.22792053222656,
      "kl": 0.0,
      "learning_rate": 2.9180842711332107e-07,
      "logps/chosen": -145.3040008544922,
      "logps/rejected": -203.88059997558594,
      "loss": 0.2157,
      "rewards/chosen": 0.2076568603515625,
      "rewards/margins": 3.002925157546997,
      "rewards/rejected": -2.7952682971954346,
      "step": 1591
    },
    {
      "epoch": 0.42,
      "grad_norm": 32.03038024902344,
      "kl": 0.0,
      "learning_rate": 2.916775713164093e-07,
      "logps/chosen": -233.9254913330078,
      "logps/rejected": -181.0751953125,
      "loss": 0.254,
      "rewards/chosen": 0.3560665249824524,
      "rewards/margins": 3.7291817665100098,
      "rewards/rejected": -3.373115301132202,
      "step": 1592
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.68912124633789,
      "kl": 0.0,
      "learning_rate": 2.915467155194975e-07,
      "logps/chosen": -261.59320068359375,
      "logps/rejected": -266.7197570800781,
      "loss": 0.3849,
      "rewards/chosen": 1.6894758939743042,
      "rewards/margins": 4.872840881347656,
      "rewards/rejected": -3.1833648681640625,
      "step": 1593
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.84444046020508,
      "kl": 0.0,
      "learning_rate": 2.914158597225857e-07,
      "logps/chosen": -152.94725036621094,
      "logps/rejected": -311.8249816894531,
      "loss": 0.3181,
      "rewards/chosen": -0.022468430921435356,
      "rewards/margins": 2.707664728164673,
      "rewards/rejected": -2.730133056640625,
      "step": 1594
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.6119384765625,
      "kl": 0.0,
      "learning_rate": 2.912850039256739e-07,
      "logps/chosen": -184.41940307617188,
      "logps/rejected": -260.7890625,
      "loss": 0.3186,
      "rewards/chosen": 0.874193012714386,
      "rewards/margins": 4.659364223480225,
      "rewards/rejected": -3.7851712703704834,
      "step": 1595
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.459774017333984,
      "kl": 0.0,
      "learning_rate": 2.911541481287621e-07,
      "logps/chosen": -184.00550842285156,
      "logps/rejected": -214.40687561035156,
      "loss": 0.4171,
      "rewards/chosen": 0.19082167744636536,
      "rewards/margins": 2.454958438873291,
      "rewards/rejected": -2.264136791229248,
      "step": 1596
    },
    {
      "epoch": 0.42,
      "grad_norm": 38.99348449707031,
      "kl": 0.0,
      "learning_rate": 2.910232923318503e-07,
      "logps/chosen": -286.60687255859375,
      "logps/rejected": -240.58551025390625,
      "loss": 0.3764,
      "rewards/chosen": -0.34489428997039795,
      "rewards/margins": 3.8633813858032227,
      "rewards/rejected": -4.20827579498291,
      "step": 1597
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.473636627197266,
      "kl": 0.0,
      "learning_rate": 2.908924365349385e-07,
      "logps/chosen": -260.33843994140625,
      "logps/rejected": -172.78604125976562,
      "loss": 0.3135,
      "rewards/chosen": 1.2632750272750854,
      "rewards/margins": 3.5282578468322754,
      "rewards/rejected": -2.2649829387664795,
      "step": 1598
    },
    {
      "epoch": 0.42,
      "grad_norm": 31.902114868164062,
      "kl": 0.0,
      "learning_rate": 2.907615807380267e-07,
      "logps/chosen": -172.07113647460938,
      "logps/rejected": -185.29881286621094,
      "loss": 0.3438,
      "rewards/chosen": -0.7174520492553711,
      "rewards/margins": 1.5515670776367188,
      "rewards/rejected": -2.26901912689209,
      "step": 1599
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.15164566040039,
      "kl": 0.0,
      "learning_rate": 2.906307249411149e-07,
      "logps/chosen": -130.16259765625,
      "logps/rejected": -296.18731689453125,
      "loss": 0.3613,
      "rewards/chosen": 0.2204550802707672,
      "rewards/margins": 4.622489929199219,
      "rewards/rejected": -4.402034759521484,
      "step": 1600
    },
    {
      "epoch": 0.42,
      "grad_norm": 34.73413848876953,
      "kl": 0.0,
      "learning_rate": 2.904998691442031e-07,
      "logps/chosen": -184.0595703125,
      "logps/rejected": -321.77734375,
      "loss": 0.3076,
      "rewards/chosen": 1.8415918350219727,
      "rewards/margins": 6.024647235870361,
      "rewards/rejected": -4.183055400848389,
      "step": 1601
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.50428009033203,
      "kl": 0.0,
      "learning_rate": 2.9036901334729123e-07,
      "logps/chosen": -250.47787475585938,
      "logps/rejected": -304.5994873046875,
      "loss": 0.3039,
      "rewards/chosen": -0.5124570727348328,
      "rewards/margins": 2.7285592555999756,
      "rewards/rejected": -3.241016387939453,
      "step": 1602
    },
    {
      "epoch": 0.42,
      "grad_norm": 34.6931037902832,
      "kl": 0.0,
      "learning_rate": 2.902381575503794e-07,
      "logps/chosen": -223.81039428710938,
      "logps/rejected": -283.75616455078125,
      "loss": 0.3198,
      "rewards/chosen": 1.386765718460083,
      "rewards/margins": 3.858922243118286,
      "rewards/rejected": -2.472156524658203,
      "step": 1603
    },
    {
      "epoch": 0.42,
      "grad_norm": 32.15188217163086,
      "kl": 0.0,
      "learning_rate": 2.901073017534677e-07,
      "logps/chosen": -141.97601318359375,
      "logps/rejected": -281.28228759765625,
      "loss": 0.2395,
      "rewards/chosen": 0.25665849447250366,
      "rewards/margins": 3.629164695739746,
      "rewards/rejected": -3.3725061416625977,
      "step": 1604
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.40724182128906,
      "kl": 0.0,
      "learning_rate": 2.8997644595655587e-07,
      "logps/chosen": -256.0122375488281,
      "logps/rejected": -179.1112060546875,
      "loss": 0.2602,
      "rewards/chosen": 3.1538848876953125,
      "rewards/margins": 5.853321075439453,
      "rewards/rejected": -2.6994359493255615,
      "step": 1605
    },
    {
      "epoch": 0.42,
      "grad_norm": 29.446321487426758,
      "kl": 0.0,
      "learning_rate": 2.8984559015964407e-07,
      "logps/chosen": -253.1598663330078,
      "logps/rejected": -259.9168701171875,
      "loss": 0.2396,
      "rewards/chosen": 1.2982040643692017,
      "rewards/margins": 5.060348033905029,
      "rewards/rejected": -3.762143850326538,
      "step": 1606
    },
    {
      "epoch": 0.42,
      "grad_norm": 40.72247314453125,
      "kl": 0.0,
      "learning_rate": 2.8971473436273226e-07,
      "logps/chosen": -217.13844299316406,
      "logps/rejected": -189.36697387695312,
      "loss": 0.3974,
      "rewards/chosen": -0.5850549936294556,
      "rewards/margins": 1.4722565412521362,
      "rewards/rejected": -2.057311534881592,
      "step": 1607
    },
    {
      "epoch": 0.42,
      "grad_norm": 35.95814895629883,
      "kl": 0.0,
      "learning_rate": 2.8958387856582046e-07,
      "logps/chosen": -222.97215270996094,
      "logps/rejected": -227.67852783203125,
      "loss": 0.2516,
      "rewards/chosen": 1.4065536260604858,
      "rewards/margins": 3.951564311981201,
      "rewards/rejected": -2.545010566711426,
      "step": 1608
    },
    {
      "epoch": 0.42,
      "grad_norm": 34.521671295166016,
      "kl": 0.0,
      "learning_rate": 2.8945302276890866e-07,
      "logps/chosen": -234.58786010742188,
      "logps/rejected": -275.44482421875,
      "loss": 0.2856,
      "rewards/chosen": -0.958917498588562,
      "rewards/margins": 2.5073065757751465,
      "rewards/rejected": -3.466223955154419,
      "step": 1609
    },
    {
      "epoch": 0.42,
      "grad_norm": 30.042179107666016,
      "kl": 0.0,
      "learning_rate": 2.8932216697199685e-07,
      "logps/chosen": -130.65647888183594,
      "logps/rejected": -274.7435302734375,
      "loss": 0.1877,
      "rewards/chosen": 0.5573607683181763,
      "rewards/margins": 3.556046962738037,
      "rewards/rejected": -2.9986863136291504,
      "step": 1610
    },
    {
      "epoch": 0.42,
      "grad_norm": 34.71845245361328,
      "kl": 0.0,
      "learning_rate": 2.8919131117508505e-07,
      "logps/chosen": -289.10089111328125,
      "logps/rejected": -286.6903991699219,
      "loss": 0.2166,
      "rewards/chosen": -0.21719495952129364,
      "rewards/margins": 4.7296366691589355,
      "rewards/rejected": -4.946831703186035,
      "step": 1611
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.65607452392578,
      "kl": 0.0,
      "learning_rate": 2.8906045537817324e-07,
      "logps/chosen": -244.82997131347656,
      "logps/rejected": -242.37782287597656,
      "loss": 0.3627,
      "rewards/chosen": -0.37126827239990234,
      "rewards/margins": 3.5149543285369873,
      "rewards/rejected": -3.8862226009368896,
      "step": 1612
    },
    {
      "epoch": 0.42,
      "grad_norm": 38.01091384887695,
      "kl": 0.0,
      "learning_rate": 2.8892959958126144e-07,
      "logps/chosen": -264.7205810546875,
      "logps/rejected": -233.71192932128906,
      "loss": 0.361,
      "rewards/chosen": -0.027559369802474976,
      "rewards/margins": 3.0955469608306885,
      "rewards/rejected": -3.1231062412261963,
      "step": 1613
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.26441955566406,
      "kl": 0.0,
      "learning_rate": 2.8879874378434964e-07,
      "logps/chosen": -205.59405517578125,
      "logps/rejected": -183.545166015625,
      "loss": 0.271,
      "rewards/chosen": 0.7251332998275757,
      "rewards/margins": 4.0589494705200195,
      "rewards/rejected": -3.3338160514831543,
      "step": 1614
    },
    {
      "epoch": 0.42,
      "grad_norm": 39.93470764160156,
      "kl": 0.0,
      "learning_rate": 2.8866788798743783e-07,
      "logps/chosen": -238.8513641357422,
      "logps/rejected": -219.5674285888672,
      "loss": 0.2655,
      "rewards/chosen": 0.4453073740005493,
      "rewards/margins": 2.642704963684082,
      "rewards/rejected": -2.197397470474243,
      "step": 1615
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.35372543334961,
      "kl": 0.0,
      "learning_rate": 2.8853703219052603e-07,
      "logps/chosen": -186.39279174804688,
      "logps/rejected": -284.56536865234375,
      "loss": 0.2743,
      "rewards/chosen": 0.2608222961425781,
      "rewards/margins": 3.18856143951416,
      "rewards/rejected": -2.927739143371582,
      "step": 1616
    },
    {
      "epoch": 0.42,
      "grad_norm": 30.4285945892334,
      "kl": 0.0,
      "learning_rate": 2.884061763936142e-07,
      "logps/chosen": -184.7376251220703,
      "logps/rejected": -223.98312377929688,
      "loss": 0.2033,
      "rewards/chosen": 1.5121128559112549,
      "rewards/margins": 5.503354549407959,
      "rewards/rejected": -3.991241693496704,
      "step": 1617
    },
    {
      "epoch": 0.42,
      "grad_norm": 41.996219635009766,
      "kl": 0.0,
      "learning_rate": 2.882753205967024e-07,
      "logps/chosen": -256.65380859375,
      "logps/rejected": -216.48294067382812,
      "loss": 0.3603,
      "rewards/chosen": 0.522615909576416,
      "rewards/margins": 2.6449623107910156,
      "rewards/rejected": -2.1223464012145996,
      "step": 1618
    },
    {
      "epoch": 0.42,
      "grad_norm": 37.05546569824219,
      "kl": 0.0,
      "learning_rate": 2.881444647997906e-07,
      "logps/chosen": -246.1167449951172,
      "logps/rejected": -245.74496459960938,
      "loss": 0.199,
      "rewards/chosen": 0.8783416748046875,
      "rewards/margins": 4.541228294372559,
      "rewards/rejected": -3.66288685798645,
      "step": 1619
    },
    {
      "epoch": 0.42,
      "grad_norm": 39.39576721191406,
      "kl": 0.0,
      "learning_rate": 2.880136090028788e-07,
      "logps/chosen": -233.797119140625,
      "logps/rejected": -301.4216003417969,
      "loss": 0.3432,
      "rewards/chosen": -0.2727748453617096,
      "rewards/margins": 2.818376064300537,
      "rewards/rejected": -3.091150999069214,
      "step": 1620
    },
    {
      "epoch": 0.42,
      "grad_norm": 44.32394027709961,
      "kl": 0.0,
      "learning_rate": 2.87882753205967e-07,
      "logps/chosen": -172.6717529296875,
      "logps/rejected": -227.44888305664062,
      "loss": 0.3056,
      "rewards/chosen": 2.0932228565216064,
      "rewards/margins": 4.66046667098999,
      "rewards/rejected": -2.567243814468384,
      "step": 1621
    },
    {
      "epoch": 0.42,
      "grad_norm": 30.677011489868164,
      "kl": 0.0,
      "learning_rate": 2.877518974090552e-07,
      "logps/chosen": -221.4599151611328,
      "logps/rejected": -274.02984619140625,
      "loss": 0.2468,
      "rewards/chosen": 1.1671116352081299,
      "rewards/margins": 4.75871467590332,
      "rewards/rejected": -3.5916030406951904,
      "step": 1622
    },
    {
      "epoch": 0.42,
      "grad_norm": 33.010372161865234,
      "kl": 0.0,
      "learning_rate": 2.876210416121434e-07,
      "logps/chosen": -184.25941467285156,
      "logps/rejected": -311.1826477050781,
      "loss": 0.3287,
      "rewards/chosen": 0.30492183566093445,
      "rewards/margins": 3.5320022106170654,
      "rewards/rejected": -3.2270803451538086,
      "step": 1623
    },
    {
      "epoch": 0.43,
      "grad_norm": 28.4550838470459,
      "kl": 0.0,
      "learning_rate": 2.874901858152316e-07,
      "logps/chosen": -156.66748046875,
      "logps/rejected": -173.7084197998047,
      "loss": 0.2516,
      "rewards/chosen": 0.15340088307857513,
      "rewards/margins": 3.6066744327545166,
      "rewards/rejected": -3.4532735347747803,
      "step": 1624
    },
    {
      "epoch": 0.43,
      "grad_norm": 23.95566749572754,
      "kl": 0.0,
      "learning_rate": 2.873593300183198e-07,
      "logps/chosen": -226.8877716064453,
      "logps/rejected": -283.3754577636719,
      "loss": 0.2624,
      "rewards/chosen": -0.21693488955497742,
      "rewards/margins": 3.2981135845184326,
      "rewards/rejected": -3.5150485038757324,
      "step": 1625
    },
    {
      "epoch": 0.43,
      "grad_norm": 30.599740982055664,
      "kl": 0.0,
      "learning_rate": 2.87228474221408e-07,
      "logps/chosen": -221.4315643310547,
      "logps/rejected": -248.6028594970703,
      "loss": 0.2275,
      "rewards/chosen": 0.7559409141540527,
      "rewards/margins": 4.82999324798584,
      "rewards/rejected": -4.074052333831787,
      "step": 1626
    },
    {
      "epoch": 0.43,
      "grad_norm": 39.68366241455078,
      "kl": 0.0,
      "learning_rate": 2.870976184244962e-07,
      "logps/chosen": -265.5011291503906,
      "logps/rejected": -273.00732421875,
      "loss": 0.2918,
      "rewards/chosen": -0.13745827972888947,
      "rewards/margins": 3.9944634437561035,
      "rewards/rejected": -4.131921768188477,
      "step": 1627
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.131900787353516,
      "kl": 0.0,
      "learning_rate": 2.869667626275844e-07,
      "logps/chosen": -255.82313537597656,
      "logps/rejected": -212.59637451171875,
      "loss": 0.3587,
      "rewards/chosen": -0.8583320379257202,
      "rewards/margins": 1.7786017656326294,
      "rewards/rejected": -2.6369338035583496,
      "step": 1628
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.5303840637207,
      "kl": 0.0,
      "learning_rate": 2.868359068306726e-07,
      "logps/chosen": -262.96636962890625,
      "logps/rejected": -253.9119873046875,
      "loss": 0.2916,
      "rewards/chosen": -0.17899669706821442,
      "rewards/margins": 3.351792097091675,
      "rewards/rejected": -3.5307888984680176,
      "step": 1629
    },
    {
      "epoch": 0.43,
      "grad_norm": 34.768192291259766,
      "kl": 0.0,
      "learning_rate": 2.867050510337608e-07,
      "logps/chosen": -226.05429077148438,
      "logps/rejected": -276.16412353515625,
      "loss": 0.2543,
      "rewards/chosen": 0.31507861614227295,
      "rewards/margins": 3.9771056175231934,
      "rewards/rejected": -3.662026882171631,
      "step": 1630
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.05791473388672,
      "kl": 0.0,
      "learning_rate": 2.86574195236849e-07,
      "logps/chosen": -180.6664581298828,
      "logps/rejected": -284.9426574707031,
      "loss": 0.2811,
      "rewards/chosen": 0.8875211477279663,
      "rewards/margins": 5.335653305053711,
      "rewards/rejected": -4.448132038116455,
      "step": 1631
    },
    {
      "epoch": 0.43,
      "grad_norm": 23.5435733795166,
      "kl": 0.0,
      "learning_rate": 2.864433394399372e-07,
      "logps/chosen": -223.41973876953125,
      "logps/rejected": -323.1280517578125,
      "loss": 0.1187,
      "rewards/chosen": 2.1701741218566895,
      "rewards/margins": 6.089311599731445,
      "rewards/rejected": -3.919137716293335,
      "step": 1632
    },
    {
      "epoch": 0.43,
      "grad_norm": 32.40892028808594,
      "kl": 0.0,
      "learning_rate": 2.8631248364302536e-07,
      "logps/chosen": -264.6060485839844,
      "logps/rejected": -249.17063903808594,
      "loss": 0.3056,
      "rewards/chosen": -0.5933135747909546,
      "rewards/margins": 3.272615909576416,
      "rewards/rejected": -3.86592960357666,
      "step": 1633
    },
    {
      "epoch": 0.43,
      "grad_norm": 32.43721008300781,
      "kl": 0.0,
      "learning_rate": 2.8618162784611356e-07,
      "logps/chosen": -224.7136993408203,
      "logps/rejected": -233.19293212890625,
      "loss": 0.2528,
      "rewards/chosen": 0.22769694030284882,
      "rewards/margins": 3.5428829193115234,
      "rewards/rejected": -3.315186023712158,
      "step": 1634
    },
    {
      "epoch": 0.43,
      "grad_norm": 31.453256607055664,
      "kl": 0.0,
      "learning_rate": 2.8605077204920175e-07,
      "logps/chosen": -193.77178955078125,
      "logps/rejected": -260.65252685546875,
      "loss": 0.3186,
      "rewards/chosen": -0.4263240098953247,
      "rewards/margins": 4.058472156524658,
      "rewards/rejected": -4.484796047210693,
      "step": 1635
    },
    {
      "epoch": 0.43,
      "grad_norm": 36.352272033691406,
      "kl": 0.0,
      "learning_rate": 2.8591991625228995e-07,
      "logps/chosen": -212.51431274414062,
      "logps/rejected": -172.3106231689453,
      "loss": 0.2725,
      "rewards/chosen": 0.6394997835159302,
      "rewards/margins": 3.36637020111084,
      "rewards/rejected": -2.72687029838562,
      "step": 1636
    },
    {
      "epoch": 0.43,
      "grad_norm": 30.131851196289062,
      "kl": 0.0,
      "learning_rate": 2.8578906045537815e-07,
      "logps/chosen": -253.44989013671875,
      "logps/rejected": -225.4195098876953,
      "loss": 0.2125,
      "rewards/chosen": 2.016209125518799,
      "rewards/margins": 6.881429195404053,
      "rewards/rejected": -4.865220069885254,
      "step": 1637
    },
    {
      "epoch": 0.43,
      "grad_norm": 29.6743106842041,
      "kl": 0.0,
      "learning_rate": 2.8565820465846634e-07,
      "logps/chosen": -205.046630859375,
      "logps/rejected": -268.38018798828125,
      "loss": 0.3314,
      "rewards/chosen": 0.4701835513114929,
      "rewards/margins": 4.412789821624756,
      "rewards/rejected": -3.9426064491271973,
      "step": 1638
    },
    {
      "epoch": 0.43,
      "grad_norm": 26.07879638671875,
      "kl": 0.0,
      "learning_rate": 2.8552734886155454e-07,
      "logps/chosen": -194.8833465576172,
      "logps/rejected": -234.7974395751953,
      "loss": 0.2276,
      "rewards/chosen": 0.6492434144020081,
      "rewards/margins": 5.1089558601379395,
      "rewards/rejected": -4.459712505340576,
      "step": 1639
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.90639114379883,
      "kl": 0.0,
      "learning_rate": 2.8539649306464273e-07,
      "logps/chosen": -244.555908203125,
      "logps/rejected": -222.04403686523438,
      "loss": 0.2405,
      "rewards/chosen": 1.0766985416412354,
      "rewards/margins": 5.581542015075684,
      "rewards/rejected": -4.504843711853027,
      "step": 1640
    },
    {
      "epoch": 0.43,
      "grad_norm": 42.474082946777344,
      "kl": 0.0,
      "learning_rate": 2.8526563726773093e-07,
      "logps/chosen": -248.51821899414062,
      "logps/rejected": -244.5760040283203,
      "loss": 0.3104,
      "rewards/chosen": -0.07312363386154175,
      "rewards/margins": 3.078744888305664,
      "rewards/rejected": -3.1518685817718506,
      "step": 1641
    },
    {
      "epoch": 0.43,
      "grad_norm": 30.39196014404297,
      "kl": 0.0,
      "learning_rate": 2.851347814708191e-07,
      "logps/chosen": -208.0784912109375,
      "logps/rejected": -297.77337646484375,
      "loss": 0.2939,
      "rewards/chosen": -1.3725045919418335,
      "rewards/margins": 2.603248119354248,
      "rewards/rejected": -3.975752592086792,
      "step": 1642
    },
    {
      "epoch": 0.43,
      "grad_norm": 47.98642349243164,
      "kl": 0.0,
      "learning_rate": 2.850039256739074e-07,
      "logps/chosen": -230.6311798095703,
      "logps/rejected": -256.6865539550781,
      "loss": 0.2826,
      "rewards/chosen": 2.421473979949951,
      "rewards/margins": 3.9391822814941406,
      "rewards/rejected": -1.5177083015441895,
      "step": 1643
    },
    {
      "epoch": 0.43,
      "grad_norm": 41.3521614074707,
      "kl": 0.0,
      "learning_rate": 2.8487306987699557e-07,
      "logps/chosen": -248.044677734375,
      "logps/rejected": -151.1188201904297,
      "loss": 0.3185,
      "rewards/chosen": 0.3247153162956238,
      "rewards/margins": 1.6584868431091309,
      "rewards/rejected": -1.3337714672088623,
      "step": 1644
    },
    {
      "epoch": 0.43,
      "grad_norm": 36.328460693359375,
      "kl": 0.0,
      "learning_rate": 2.8474221408008377e-07,
      "logps/chosen": -200.56027221679688,
      "logps/rejected": -236.47607421875,
      "loss": 0.3498,
      "rewards/chosen": 0.5588264465332031,
      "rewards/margins": 3.169299602508545,
      "rewards/rejected": -2.610473155975342,
      "step": 1645
    },
    {
      "epoch": 0.43,
      "grad_norm": 28.64386749267578,
      "kl": 0.0,
      "learning_rate": 2.8461135828317196e-07,
      "logps/chosen": -160.4168701171875,
      "logps/rejected": -263.1020202636719,
      "loss": 0.1606,
      "rewards/chosen": 2.1916143894195557,
      "rewards/margins": 6.299075126647949,
      "rewards/rejected": -4.1074604988098145,
      "step": 1646
    },
    {
      "epoch": 0.43,
      "grad_norm": 38.80425262451172,
      "kl": 0.0,
      "learning_rate": 2.8448050248626016e-07,
      "logps/chosen": -367.1888427734375,
      "logps/rejected": -211.77659606933594,
      "loss": 0.2512,
      "rewards/chosen": 1.4123479127883911,
      "rewards/margins": 6.332348346710205,
      "rewards/rejected": -4.9200005531311035,
      "step": 1647
    },
    {
      "epoch": 0.43,
      "grad_norm": 39.87413787841797,
      "kl": 0.0,
      "learning_rate": 2.843496466893483e-07,
      "logps/chosen": -147.86988830566406,
      "logps/rejected": -287.0220031738281,
      "loss": 0.294,
      "rewards/chosen": 0.49987924098968506,
      "rewards/margins": 4.298526763916016,
      "rewards/rejected": -3.798647403717041,
      "step": 1648
    },
    {
      "epoch": 0.43,
      "grad_norm": 32.95948791503906,
      "kl": 0.0,
      "learning_rate": 2.842187908924365e-07,
      "logps/chosen": -231.50198364257812,
      "logps/rejected": -171.44778442382812,
      "loss": 0.3698,
      "rewards/chosen": 0.3435051143169403,
      "rewards/margins": 2.4760706424713135,
      "rewards/rejected": -2.132565498352051,
      "step": 1649
    },
    {
      "epoch": 0.43,
      "grad_norm": 30.405078887939453,
      "kl": 0.0,
      "learning_rate": 2.840879350955247e-07,
      "logps/chosen": -105.76648712158203,
      "logps/rejected": -305.2538757324219,
      "loss": 0.2833,
      "rewards/chosen": 0.7598097920417786,
      "rewards/margins": 4.988456726074219,
      "rewards/rejected": -4.228646755218506,
      "step": 1650
    },
    {
      "epoch": 0.43,
      "grad_norm": 31.918275833129883,
      "kl": 0.0,
      "learning_rate": 2.839570792986129e-07,
      "logps/chosen": -237.28346252441406,
      "logps/rejected": -207.51666259765625,
      "loss": 0.3442,
      "rewards/chosen": 0.7215090990066528,
      "rewards/margins": 3.5226168632507324,
      "rewards/rejected": -2.80110764503479,
      "step": 1651
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.554229736328125,
      "kl": 0.0,
      "learning_rate": 2.838262235017011e-07,
      "logps/chosen": -167.70065307617188,
      "logps/rejected": -213.15670776367188,
      "loss": 0.2661,
      "rewards/chosen": 1.394936442375183,
      "rewards/margins": 4.518024921417236,
      "rewards/rejected": -3.1230885982513428,
      "step": 1652
    },
    {
      "epoch": 0.43,
      "grad_norm": 32.72843551635742,
      "kl": 0.0,
      "learning_rate": 2.836953677047893e-07,
      "logps/chosen": -136.93429565429688,
      "logps/rejected": -387.04998779296875,
      "loss": 0.3315,
      "rewards/chosen": 0.10951113700866699,
      "rewards/margins": 6.089513778686523,
      "rewards/rejected": -5.980002403259277,
      "step": 1653
    },
    {
      "epoch": 0.43,
      "grad_norm": 31.057981491088867,
      "kl": 0.0,
      "learning_rate": 2.835645119078775e-07,
      "logps/chosen": -210.75360107421875,
      "logps/rejected": -283.0705871582031,
      "loss": 0.2595,
      "rewards/chosen": 1.8047876358032227,
      "rewards/margins": 4.513606071472168,
      "rewards/rejected": -2.708818197250366,
      "step": 1654
    },
    {
      "epoch": 0.43,
      "grad_norm": 32.37018966674805,
      "kl": 0.0,
      "learning_rate": 2.8343365611096573e-07,
      "logps/chosen": -169.20822143554688,
      "logps/rejected": -194.51992797851562,
      "loss": 0.2947,
      "rewards/chosen": 0.5286230444908142,
      "rewards/margins": 3.582944631576538,
      "rewards/rejected": -3.054321527481079,
      "step": 1655
    },
    {
      "epoch": 0.43,
      "grad_norm": 26.04457664489746,
      "kl": 0.0,
      "learning_rate": 2.833028003140539e-07,
      "logps/chosen": -249.79750061035156,
      "logps/rejected": -288.1142272949219,
      "loss": 0.2435,
      "rewards/chosen": 0.1828829050064087,
      "rewards/margins": 4.325873851776123,
      "rewards/rejected": -4.142991065979004,
      "step": 1656
    },
    {
      "epoch": 0.43,
      "grad_norm": 39.51694869995117,
      "kl": 0.0,
      "learning_rate": 2.831719445171421e-07,
      "logps/chosen": -181.7620849609375,
      "logps/rejected": -284.8299865722656,
      "loss": 0.224,
      "rewards/chosen": 1.3785738945007324,
      "rewards/margins": 6.513654708862305,
      "rewards/rejected": -5.135080814361572,
      "step": 1657
    },
    {
      "epoch": 0.43,
      "grad_norm": 24.030521392822266,
      "kl": 0.0,
      "learning_rate": 2.830410887202303e-07,
      "logps/chosen": -266.92938232421875,
      "logps/rejected": -246.81884765625,
      "loss": 0.2942,
      "rewards/chosen": 0.030593067407608032,
      "rewards/margins": 4.214050769805908,
      "rewards/rejected": -4.183457851409912,
      "step": 1658
    },
    {
      "epoch": 0.43,
      "grad_norm": 39.13831329345703,
      "kl": 0.0,
      "learning_rate": 2.829102329233185e-07,
      "logps/chosen": -185.76181030273438,
      "logps/rejected": -130.00856018066406,
      "loss": 0.2973,
      "rewards/chosen": 0.9576006531715393,
      "rewards/margins": 3.937793254852295,
      "rewards/rejected": -2.9801926612854004,
      "step": 1659
    },
    {
      "epoch": 0.43,
      "grad_norm": 33.2174186706543,
      "kl": 0.0,
      "learning_rate": 2.827793771264067e-07,
      "logps/chosen": -254.79701232910156,
      "logps/rejected": -287.5552062988281,
      "loss": 0.2675,
      "rewards/chosen": 1.2208576202392578,
      "rewards/margins": 5.56520938873291,
      "rewards/rejected": -4.344351768493652,
      "step": 1660
    },
    {
      "epoch": 0.43,
      "grad_norm": 35.70877456665039,
      "kl": 0.0,
      "learning_rate": 2.826485213294949e-07,
      "logps/chosen": -235.49278259277344,
      "logps/rejected": -189.52891540527344,
      "loss": 0.2972,
      "rewards/chosen": 0.6834301352500916,
      "rewards/margins": 3.502739906311035,
      "rewards/rejected": -2.819309711456299,
      "step": 1661
    },
    {
      "epoch": 0.43,
      "grad_norm": 38.852294921875,
      "kl": 0.0,
      "learning_rate": 2.825176655325831e-07,
      "logps/chosen": -183.91111755371094,
      "logps/rejected": -270.0567932128906,
      "loss": 0.4571,
      "rewards/chosen": -0.81363844871521,
      "rewards/margins": 0.5745820999145508,
      "rewards/rejected": -1.3882205486297607,
      "step": 1662
    },
    {
      "epoch": 0.44,
      "grad_norm": 34.467350006103516,
      "kl": 0.0,
      "learning_rate": 2.8238680973567124e-07,
      "logps/chosen": -158.2981719970703,
      "logps/rejected": -207.15960693359375,
      "loss": 0.4017,
      "rewards/chosen": 0.1074838638305664,
      "rewards/margins": 2.8124186992645264,
      "rewards/rejected": -2.70493483543396,
      "step": 1663
    },
    {
      "epoch": 0.44,
      "grad_norm": 30.37034034729004,
      "kl": 0.0,
      "learning_rate": 2.8225595393875944e-07,
      "logps/chosen": -167.8849639892578,
      "logps/rejected": -258.62823486328125,
      "loss": 0.2809,
      "rewards/chosen": 0.7363007664680481,
      "rewards/margins": 3.5093798637390137,
      "rewards/rejected": -2.7730791568756104,
      "step": 1664
    },
    {
      "epoch": 0.44,
      "grad_norm": 31.080869674682617,
      "kl": 0.0,
      "learning_rate": 2.8212509814184764e-07,
      "logps/chosen": -172.04812622070312,
      "logps/rejected": -267.6941223144531,
      "loss": 0.2494,
      "rewards/chosen": -0.2119612991809845,
      "rewards/margins": 3.346021890640259,
      "rewards/rejected": -3.557983160018921,
      "step": 1665
    },
    {
      "epoch": 0.44,
      "grad_norm": 33.17937088012695,
      "kl": 0.0,
      "learning_rate": 2.8199424234493583e-07,
      "logps/chosen": -184.60275268554688,
      "logps/rejected": -273.01202392578125,
      "loss": 0.2754,
      "rewards/chosen": 0.02937476895749569,
      "rewards/margins": 2.6486494541168213,
      "rewards/rejected": -2.619274616241455,
      "step": 1666
    },
    {
      "epoch": 0.44,
      "grad_norm": 37.046207427978516,
      "kl": 0.0,
      "learning_rate": 2.8186338654802403e-07,
      "logps/chosen": -208.9612579345703,
      "logps/rejected": -206.72579956054688,
      "loss": 0.1766,
      "rewards/chosen": 1.7866507768630981,
      "rewards/margins": 5.889621257781982,
      "rewards/rejected": -4.102970600128174,
      "step": 1667
    },
    {
      "epoch": 0.44,
      "grad_norm": 33.24836730957031,
      "kl": 0.0,
      "learning_rate": 2.817325307511123e-07,
      "logps/chosen": -246.86883544921875,
      "logps/rejected": -160.70315551757812,
      "loss": 0.3167,
      "rewards/chosen": 0.5075183510780334,
      "rewards/margins": 3.0673842430114746,
      "rewards/rejected": -2.559865951538086,
      "step": 1668
    },
    {
      "epoch": 0.44,
      "grad_norm": 32.354652404785156,
      "kl": 0.0,
      "learning_rate": 2.8160167495420047e-07,
      "logps/chosen": -188.59686279296875,
      "logps/rejected": -239.78671264648438,
      "loss": 0.306,
      "rewards/chosen": -0.3811313211917877,
      "rewards/margins": 3.6232974529266357,
      "rewards/rejected": -4.004428863525391,
      "step": 1669
    },
    {
      "epoch": 0.44,
      "grad_norm": 42.71672058105469,
      "kl": 0.0,
      "learning_rate": 2.8147081915728867e-07,
      "logps/chosen": -184.1849822998047,
      "logps/rejected": -235.50355529785156,
      "loss": 0.2995,
      "rewards/chosen": 0.20322510600090027,
      "rewards/margins": 2.809774875640869,
      "rewards/rejected": -2.6065497398376465,
      "step": 1670
    },
    {
      "epoch": 0.44,
      "grad_norm": 36.16896438598633,
      "kl": 0.0,
      "learning_rate": 2.8133996336037687e-07,
      "logps/chosen": -271.17999267578125,
      "logps/rejected": -224.8990478515625,
      "loss": 0.1511,
      "rewards/chosen": 1.3531864881515503,
      "rewards/margins": 4.555708885192871,
      "rewards/rejected": -3.2025222778320312,
      "step": 1671
    },
    {
      "epoch": 0.44,
      "grad_norm": 34.2087516784668,
      "kl": 0.0,
      "learning_rate": 2.8120910756346506e-07,
      "logps/chosen": -136.76390075683594,
      "logps/rejected": -273.3305358886719,
      "loss": 0.1379,
      "rewards/chosen": 1.9782171249389648,
      "rewards/margins": 5.933925628662109,
      "rewards/rejected": -3.9557087421417236,
      "step": 1672
    },
    {
      "epoch": 0.44,
      "grad_norm": 60.45607376098633,
      "kl": 0.0,
      "learning_rate": 2.8107825176655326e-07,
      "logps/chosen": -197.25955200195312,
      "logps/rejected": -257.97955322265625,
      "loss": 0.2683,
      "rewards/chosen": 1.5958689451217651,
      "rewards/margins": 6.442742824554443,
      "rewards/rejected": -4.846873760223389,
      "step": 1673
    },
    {
      "epoch": 0.44,
      "grad_norm": 65.07658386230469,
      "kl": 0.0,
      "learning_rate": 2.8094739596964145e-07,
      "logps/chosen": -146.33827209472656,
      "logps/rejected": -255.15676879882812,
      "loss": 0.3031,
      "rewards/chosen": -0.05155050754547119,
      "rewards/margins": 3.9145121574401855,
      "rewards/rejected": -3.966062545776367,
      "step": 1674
    },
    {
      "epoch": 0.44,
      "grad_norm": 32.69587707519531,
      "kl": 0.0,
      "learning_rate": 2.8081654017272965e-07,
      "logps/chosen": -273.9563903808594,
      "logps/rejected": -226.44253540039062,
      "loss": 0.2102,
      "rewards/chosen": 0.08316595107316971,
      "rewards/margins": 5.162535667419434,
      "rewards/rejected": -5.07936954498291,
      "step": 1675
    },
    {
      "epoch": 0.44,
      "grad_norm": 45.86758804321289,
      "kl": 0.0,
      "learning_rate": 2.8068568437581785e-07,
      "logps/chosen": -202.95440673828125,
      "logps/rejected": -185.07928466796875,
      "loss": 0.2519,
      "rewards/chosen": 1.0496774911880493,
      "rewards/margins": 3.4434638023376465,
      "rewards/rejected": -2.3937861919403076,
      "step": 1676
    },
    {
      "epoch": 0.44,
      "grad_norm": 47.8294563293457,
      "kl": 0.0,
      "learning_rate": 2.8055482857890604e-07,
      "logps/chosen": -176.25680541992188,
      "logps/rejected": -215.69883728027344,
      "loss": 0.2564,
      "rewards/chosen": -0.19729217886924744,
      "rewards/margins": 3.721970319747925,
      "rewards/rejected": -3.919262409210205,
      "step": 1677
    },
    {
      "epoch": 0.44,
      "grad_norm": 38.45645523071289,
      "kl": 0.0,
      "learning_rate": 2.8042397278199424e-07,
      "logps/chosen": -204.14828491210938,
      "logps/rejected": -286.4856872558594,
      "loss": 0.2808,
      "rewards/chosen": 0.12513354420661926,
      "rewards/margins": 1.9805330038070679,
      "rewards/rejected": -1.855399489402771,
      "step": 1678
    },
    {
      "epoch": 0.44,
      "grad_norm": 45.83350372314453,
      "kl": 0.0,
      "learning_rate": 2.802931169850824e-07,
      "logps/chosen": -217.64407348632812,
      "logps/rejected": -231.1699676513672,
      "loss": 0.3138,
      "rewards/chosen": 1.533355712890625,
      "rewards/margins": 3.561047077178955,
      "rewards/rejected": -2.02769136428833,
      "step": 1679
    },
    {
      "epoch": 0.44,
      "grad_norm": 55.16447067260742,
      "kl": 0.0,
      "learning_rate": 2.801622611881706e-07,
      "logps/chosen": -186.09637451171875,
      "logps/rejected": -243.02772521972656,
      "loss": 0.2658,
      "rewards/chosen": 1.1252343654632568,
      "rewards/margins": 4.571606636047363,
      "rewards/rejected": -3.4463725090026855,
      "step": 1680
    },
    {
      "epoch": 0.44,
      "grad_norm": 34.269187927246094,
      "kl": 0.0,
      "learning_rate": 2.8003140539125883e-07,
      "logps/chosen": -175.88824462890625,
      "logps/rejected": -248.73648071289062,
      "loss": 0.3089,
      "rewards/chosen": 0.7217296957969666,
      "rewards/margins": 4.206638336181641,
      "rewards/rejected": -3.4849085807800293,
      "step": 1681
    },
    {
      "epoch": 0.44,
      "grad_norm": 28.83681297302246,
      "kl": 0.0,
      "learning_rate": 2.79900549594347e-07,
      "logps/chosen": -217.58006286621094,
      "logps/rejected": -239.70579528808594,
      "loss": 0.1725,
      "rewards/chosen": 0.40505197644233704,
      "rewards/margins": 4.14391565322876,
      "rewards/rejected": -3.738863706588745,
      "step": 1682
    },
    {
      "epoch": 0.44,
      "grad_norm": 48.4196891784668,
      "kl": 0.0,
      "learning_rate": 2.797696937974352e-07,
      "logps/chosen": -257.90679931640625,
      "logps/rejected": -296.9246826171875,
      "loss": 0.2987,
      "rewards/chosen": 2.1349244117736816,
      "rewards/margins": 6.551563739776611,
      "rewards/rejected": -4.41663932800293,
      "step": 1683
    },
    {
      "epoch": 0.44,
      "grad_norm": 44.74823760986328,
      "kl": 0.0,
      "learning_rate": 2.796388380005234e-07,
      "logps/chosen": -266.4541931152344,
      "logps/rejected": -273.9454345703125,
      "loss": 0.1819,
      "rewards/chosen": 1.0883949995040894,
      "rewards/margins": 5.361190319061279,
      "rewards/rejected": -4.2727952003479,
      "step": 1684
    },
    {
      "epoch": 0.44,
      "grad_norm": 37.19141387939453,
      "kl": 0.0,
      "learning_rate": 2.795079822036116e-07,
      "logps/chosen": -178.24195861816406,
      "logps/rejected": -273.5054626464844,
      "loss": 0.3304,
      "rewards/chosen": 0.9247137308120728,
      "rewards/margins": 5.101280212402344,
      "rewards/rejected": -4.1765666007995605,
      "step": 1685
    },
    {
      "epoch": 0.44,
      "grad_norm": 35.41918182373047,
      "kl": 0.0,
      "learning_rate": 2.793771264066998e-07,
      "logps/chosen": -194.33016967773438,
      "logps/rejected": -309.4744567871094,
      "loss": 0.2389,
      "rewards/chosen": 0.6592968702316284,
      "rewards/margins": 6.351325988769531,
      "rewards/rejected": -5.692028999328613,
      "step": 1686
    },
    {
      "epoch": 0.44,
      "grad_norm": 39.674442291259766,
      "kl": 0.0,
      "learning_rate": 2.79246270609788e-07,
      "logps/chosen": -243.66392517089844,
      "logps/rejected": -168.36773681640625,
      "loss": 0.3489,
      "rewards/chosen": 0.40796035528182983,
      "rewards/margins": 2.883101224899292,
      "rewards/rejected": -2.4751408100128174,
      "step": 1687
    },
    {
      "epoch": 0.44,
      "grad_norm": 30.215177536010742,
      "kl": 0.0,
      "learning_rate": 2.791154148128762e-07,
      "logps/chosen": -166.00108337402344,
      "logps/rejected": -360.5684509277344,
      "loss": 0.3205,
      "rewards/chosen": 0.15947306156158447,
      "rewards/margins": 4.606205463409424,
      "rewards/rejected": -4.446732521057129,
      "step": 1688
    },
    {
      "epoch": 0.44,
      "grad_norm": 39.63730239868164,
      "kl": 0.0,
      "learning_rate": 2.789845590159644e-07,
      "logps/chosen": -229.3401336669922,
      "logps/rejected": -291.5890808105469,
      "loss": 0.2245,
      "rewards/chosen": 0.7700627446174622,
      "rewards/margins": 3.624380111694336,
      "rewards/rejected": -2.8543174266815186,
      "step": 1689
    },
    {
      "epoch": 0.44,
      "grad_norm": 35.81988525390625,
      "kl": 0.0,
      "learning_rate": 2.788537032190526e-07,
      "logps/chosen": -183.9010467529297,
      "logps/rejected": -276.3099365234375,
      "loss": 0.2818,
      "rewards/chosen": -0.11290199309587479,
      "rewards/margins": 4.365841865539551,
      "rewards/rejected": -4.478744029998779,
      "step": 1690
    },
    {
      "epoch": 0.44,
      "grad_norm": 40.292118072509766,
      "kl": 0.0,
      "learning_rate": 2.787228474221408e-07,
      "logps/chosen": -292.67730712890625,
      "logps/rejected": -255.4952392578125,
      "loss": 0.2497,
      "rewards/chosen": 0.150015190243721,
      "rewards/margins": 3.031562089920044,
      "rewards/rejected": -2.881546974182129,
      "step": 1691
    },
    {
      "epoch": 0.44,
      "grad_norm": 35.17213821411133,
      "kl": 0.0,
      "learning_rate": 2.78591991625229e-07,
      "logps/chosen": -194.00588989257812,
      "logps/rejected": -233.5550994873047,
      "loss": 0.2567,
      "rewards/chosen": 0.7114308476448059,
      "rewards/margins": 5.897915363311768,
      "rewards/rejected": -5.186484336853027,
      "step": 1692
    },
    {
      "epoch": 0.44,
      "grad_norm": 35.00627517700195,
      "kl": 0.0,
      "learning_rate": 2.7846113582831723e-07,
      "logps/chosen": -139.06361389160156,
      "logps/rejected": -284.79034423828125,
      "loss": 0.2337,
      "rewards/chosen": 1.307264804840088,
      "rewards/margins": 4.419478416442871,
      "rewards/rejected": -3.112213611602783,
      "step": 1693
    },
    {
      "epoch": 0.44,
      "grad_norm": 37.7496337890625,
      "kl": 0.0,
      "learning_rate": 2.783302800314054e-07,
      "logps/chosen": -239.04811096191406,
      "logps/rejected": -277.545166015625,
      "loss": 0.3579,
      "rewards/chosen": -0.09821644425392151,
      "rewards/margins": 3.1371920108795166,
      "rewards/rejected": -3.2354085445404053,
      "step": 1694
    },
    {
      "epoch": 0.44,
      "grad_norm": 42.01094436645508,
      "kl": 0.0,
      "learning_rate": 2.7819942423449357e-07,
      "logps/chosen": -301.322509765625,
      "logps/rejected": -200.0469512939453,
      "loss": 0.3572,
      "rewards/chosen": 1.5365756750106812,
      "rewards/margins": 4.429783344268799,
      "rewards/rejected": -2.893207550048828,
      "step": 1695
    },
    {
      "epoch": 0.44,
      "grad_norm": 33.279029846191406,
      "kl": 0.0,
      "learning_rate": 2.7806856843758177e-07,
      "logps/chosen": -258.8411865234375,
      "logps/rejected": -286.9688720703125,
      "loss": 0.1982,
      "rewards/chosen": 2.530242443084717,
      "rewards/margins": 5.06218147277832,
      "rewards/rejected": -2.5319392681121826,
      "step": 1696
    },
    {
      "epoch": 0.44,
      "grad_norm": 40.62521743774414,
      "kl": 0.0,
      "learning_rate": 2.7793771264066996e-07,
      "logps/chosen": -223.89183044433594,
      "logps/rejected": -249.8245391845703,
      "loss": 0.1846,
      "rewards/chosen": 1.793093204498291,
      "rewards/margins": 4.362424850463867,
      "rewards/rejected": -2.5693318843841553,
      "step": 1697
    },
    {
      "epoch": 0.44,
      "grad_norm": 27.853527069091797,
      "kl": 0.0,
      "learning_rate": 2.7780685684375816e-07,
      "logps/chosen": -175.04196166992188,
      "logps/rejected": -233.70567321777344,
      "loss": 0.2352,
      "rewards/chosen": 0.5203903317451477,
      "rewards/margins": 4.373768329620361,
      "rewards/rejected": -3.8533778190612793,
      "step": 1698
    },
    {
      "epoch": 0.44,
      "grad_norm": 36.20708084106445,
      "kl": 0.0,
      "learning_rate": 2.7767600104684636e-07,
      "logps/chosen": -243.81565856933594,
      "logps/rejected": -267.9309387207031,
      "loss": 0.3228,
      "rewards/chosen": 0.4174182415008545,
      "rewards/margins": 3.5275862216949463,
      "rewards/rejected": -3.110167980194092,
      "step": 1699
    },
    {
      "epoch": 0.44,
      "grad_norm": 31.369003295898438,
      "kl": 0.0,
      "learning_rate": 2.7754514524993455e-07,
      "logps/chosen": -150.89146423339844,
      "logps/rejected": -216.03077697753906,
      "loss": 0.19,
      "rewards/chosen": 0.7634617686271667,
      "rewards/margins": 3.220472574234009,
      "rewards/rejected": -2.4570107460021973,
      "step": 1700
    },
    {
      "epoch": 0.45,
      "grad_norm": 37.170772552490234,
      "kl": 0.0,
      "learning_rate": 2.7741428945302275e-07,
      "logps/chosen": -246.51806640625,
      "logps/rejected": -232.55331420898438,
      "loss": 0.2736,
      "rewards/chosen": 2.2666428089141846,
      "rewards/margins": 4.5983099937438965,
      "rewards/rejected": -2.331667184829712,
      "step": 1701
    },
    {
      "epoch": 0.45,
      "grad_norm": 26.145320892333984,
      "kl": 0.0,
      "learning_rate": 2.7728343365611094e-07,
      "logps/chosen": -138.7021484375,
      "logps/rejected": -237.99859619140625,
      "loss": 0.1992,
      "rewards/chosen": 1.8896682262420654,
      "rewards/margins": 4.866306304931641,
      "rewards/rejected": -2.976638078689575,
      "step": 1702
    },
    {
      "epoch": 0.45,
      "grad_norm": 31.039020538330078,
      "kl": 0.0,
      "learning_rate": 2.7715257785919914e-07,
      "logps/chosen": -202.62603759765625,
      "logps/rejected": -258.099609375,
      "loss": 0.1471,
      "rewards/chosen": 0.5484234690666199,
      "rewards/margins": 4.4137139320373535,
      "rewards/rejected": -3.865290403366089,
      "step": 1703
    },
    {
      "epoch": 0.45,
      "grad_norm": 45.64496994018555,
      "kl": 0.0,
      "learning_rate": 2.7702172206228734e-07,
      "logps/chosen": -198.77719116210938,
      "logps/rejected": -246.3597412109375,
      "loss": 0.2946,
      "rewards/chosen": 0.9636480808258057,
      "rewards/margins": 4.638223648071289,
      "rewards/rejected": -3.6745755672454834,
      "step": 1704
    },
    {
      "epoch": 0.45,
      "grad_norm": 34.864662170410156,
      "kl": 0.0,
      "learning_rate": 2.7689086626537553e-07,
      "logps/chosen": -227.4167938232422,
      "logps/rejected": -191.95358276367188,
      "loss": 0.3061,
      "rewards/chosen": 0.04578828811645508,
      "rewards/margins": 3.3845396041870117,
      "rewards/rejected": -3.3387513160705566,
      "step": 1705
    },
    {
      "epoch": 0.45,
      "grad_norm": 28.03066062927246,
      "kl": 0.0,
      "learning_rate": 2.767600104684638e-07,
      "logps/chosen": -272.80523681640625,
      "logps/rejected": -262.7962341308594,
      "loss": 0.3551,
      "rewards/chosen": -1.5040392875671387,
      "rewards/margins": 1.8724522590637207,
      "rewards/rejected": -3.3764915466308594,
      "step": 1706
    },
    {
      "epoch": 0.45,
      "grad_norm": 32.483184814453125,
      "kl": 0.0,
      "learning_rate": 2.76629154671552e-07,
      "logps/chosen": -178.73095703125,
      "logps/rejected": -269.7213134765625,
      "loss": 0.2432,
      "rewards/chosen": 0.8758571743965149,
      "rewards/margins": 3.153795003890991,
      "rewards/rejected": -2.277937889099121,
      "step": 1707
    },
    {
      "epoch": 0.45,
      "grad_norm": 32.03137969970703,
      "kl": 0.0,
      "learning_rate": 2.764982988746402e-07,
      "logps/chosen": -249.5450897216797,
      "logps/rejected": -209.1124725341797,
      "loss": 0.325,
      "rewards/chosen": -0.8352705836296082,
      "rewards/margins": 3.4512455463409424,
      "rewards/rejected": -4.286516189575195,
      "step": 1708
    },
    {
      "epoch": 0.45,
      "grad_norm": 35.8421516418457,
      "kl": 0.0,
      "learning_rate": 2.7636744307772837e-07,
      "logps/chosen": -259.5403747558594,
      "logps/rejected": -254.30734252929688,
      "loss": 0.3344,
      "rewards/chosen": -1.6663615703582764,
      "rewards/margins": 2.7338154315948486,
      "rewards/rejected": -4.400177001953125,
      "step": 1709
    },
    {
      "epoch": 0.45,
      "grad_norm": 30.6213321685791,
      "kl": 0.0,
      "learning_rate": 2.762365872808165e-07,
      "logps/chosen": -256.17169189453125,
      "logps/rejected": -184.3090057373047,
      "loss": 0.3511,
      "rewards/chosen": 0.3080146908760071,
      "rewards/margins": 2.139042615890503,
      "rewards/rejected": -1.8310279846191406,
      "step": 1710
    },
    {
      "epoch": 0.45,
      "grad_norm": 32.349365234375,
      "kl": 0.0,
      "learning_rate": 2.761057314839047e-07,
      "logps/chosen": -240.81570434570312,
      "logps/rejected": -275.6083068847656,
      "loss": 0.2586,
      "rewards/chosen": 2.547614574432373,
      "rewards/margins": 6.020605087280273,
      "rewards/rejected": -3.4729907512664795,
      "step": 1711
    },
    {
      "epoch": 0.45,
      "grad_norm": 27.55760383605957,
      "kl": 0.0,
      "learning_rate": 2.759748756869929e-07,
      "logps/chosen": -106.76872253417969,
      "logps/rejected": -210.98382568359375,
      "loss": 0.3021,
      "rewards/chosen": 1.2487657070159912,
      "rewards/margins": 4.193385124206543,
      "rewards/rejected": -2.944619655609131,
      "step": 1712
    },
    {
      "epoch": 0.45,
      "grad_norm": 36.03867721557617,
      "kl": 0.0,
      "learning_rate": 2.758440198900811e-07,
      "logps/chosen": -115.28714752197266,
      "logps/rejected": -229.01974487304688,
      "loss": 0.2283,
      "rewards/chosen": 1.1162008047103882,
      "rewards/margins": 3.21551513671875,
      "rewards/rejected": -2.0993142127990723,
      "step": 1713
    },
    {
      "epoch": 0.45,
      "grad_norm": 29.898643493652344,
      "kl": 0.0,
      "learning_rate": 2.757131640931693e-07,
      "logps/chosen": -173.66696166992188,
      "logps/rejected": -237.84140014648438,
      "loss": 0.2725,
      "rewards/chosen": 0.8104872107505798,
      "rewards/margins": 4.533384799957275,
      "rewards/rejected": -3.72289776802063,
      "step": 1714
    },
    {
      "epoch": 0.45,
      "grad_norm": 32.10867691040039,
      "kl": 0.0,
      "learning_rate": 2.755823082962575e-07,
      "logps/chosen": -227.56259155273438,
      "logps/rejected": -196.11566162109375,
      "loss": 0.2026,
      "rewards/chosen": 0.22661995887756348,
      "rewards/margins": 2.7698512077331543,
      "rewards/rejected": -2.543231248855591,
      "step": 1715
    },
    {
      "epoch": 0.45,
      "grad_norm": 53.901634216308594,
      "kl": 0.0,
      "learning_rate": 2.754514524993457e-07,
      "logps/chosen": -241.9610137939453,
      "logps/rejected": -256.36456298828125,
      "loss": 0.3786,
      "rewards/chosen": -1.0330169200897217,
      "rewards/margins": 2.7314131259918213,
      "rewards/rejected": -3.764430046081543,
      "step": 1716
    },
    {
      "epoch": 0.45,
      "grad_norm": 35.57015609741211,
      "kl": 0.0,
      "learning_rate": 2.753205967024339e-07,
      "logps/chosen": -222.4459686279297,
      "logps/rejected": -249.87548828125,
      "loss": 0.2803,
      "rewards/chosen": -0.750407874584198,
      "rewards/margins": 2.9313929080963135,
      "rewards/rejected": -3.6818008422851562,
      "step": 1717
    },
    {
      "epoch": 0.45,
      "grad_norm": 32.74998092651367,
      "kl": 0.0,
      "learning_rate": 2.751897409055221e-07,
      "logps/chosen": -225.82725524902344,
      "logps/rejected": -207.35427856445312,
      "loss": 0.2025,
      "rewards/chosen": 0.190630704164505,
      "rewards/margins": 4.504660129547119,
      "rewards/rejected": -4.314029216766357,
      "step": 1718
    },
    {
      "epoch": 0.45,
      "grad_norm": 30.85138702392578,
      "kl": 0.0,
      "learning_rate": 2.7505888510861033e-07,
      "logps/chosen": -276.42974853515625,
      "logps/rejected": -217.53627014160156,
      "loss": 0.2485,
      "rewards/chosen": 3.411306381225586,
      "rewards/margins": 6.3637566566467285,
      "rewards/rejected": -2.9524502754211426,
      "step": 1719
    },
    {
      "epoch": 0.45,
      "grad_norm": 24.205673217773438,
      "kl": 0.0,
      "learning_rate": 2.7492802931169853e-07,
      "logps/chosen": -173.72286987304688,
      "logps/rejected": -234.66258239746094,
      "loss": 0.4371,
      "rewards/chosen": -0.9166773557662964,
      "rewards/margins": 2.8205437660217285,
      "rewards/rejected": -3.7372212409973145,
      "step": 1720
    },
    {
      "epoch": 0.45,
      "grad_norm": 29.204938888549805,
      "kl": 0.0,
      "learning_rate": 2.747971735147867e-07,
      "logps/chosen": -190.07568359375,
      "logps/rejected": -196.97793579101562,
      "loss": 0.2964,
      "rewards/chosen": 0.2621425688266754,
      "rewards/margins": 2.3583216667175293,
      "rewards/rejected": -2.0961790084838867,
      "step": 1721
    },
    {
      "epoch": 0.45,
      "grad_norm": 53.4695930480957,
      "kl": 0.0,
      "learning_rate": 2.746663177178749e-07,
      "logps/chosen": -144.24964904785156,
      "logps/rejected": -246.5261688232422,
      "loss": 0.3908,
      "rewards/chosen": -1.2694023847579956,
      "rewards/margins": 1.4854682683944702,
      "rewards/rejected": -2.754870653152466,
      "step": 1722
    },
    {
      "epoch": 0.45,
      "grad_norm": 41.367244720458984,
      "kl": 0.0,
      "learning_rate": 2.745354619209631e-07,
      "logps/chosen": -209.56777954101562,
      "logps/rejected": -255.2227020263672,
      "loss": 0.3476,
      "rewards/chosen": 1.0162454843521118,
      "rewards/margins": 3.2657556533813477,
      "rewards/rejected": -2.2495102882385254,
      "step": 1723
    },
    {
      "epoch": 0.45,
      "grad_norm": 34.461116790771484,
      "kl": 0.0,
      "learning_rate": 2.744046061240513e-07,
      "logps/chosen": -206.70022583007812,
      "logps/rejected": -241.9964141845703,
      "loss": 0.2432,
      "rewards/chosen": 0.9106854200363159,
      "rewards/margins": 5.031620502471924,
      "rewards/rejected": -4.120934963226318,
      "step": 1724
    },
    {
      "epoch": 0.45,
      "grad_norm": 30.99492835998535,
      "kl": 0.0,
      "learning_rate": 2.7427375032713945e-07,
      "logps/chosen": -164.06423950195312,
      "logps/rejected": -181.23422241210938,
      "loss": 0.2385,
      "rewards/chosen": 0.9010778069496155,
      "rewards/margins": 3.430784225463867,
      "rewards/rejected": -2.5297064781188965,
      "step": 1725
    },
    {
      "epoch": 0.45,
      "grad_norm": 33.725914001464844,
      "kl": 0.0,
      "learning_rate": 2.7414289453022765e-07,
      "logps/chosen": -194.82861328125,
      "logps/rejected": -357.1357116699219,
      "loss": 0.3359,
      "rewards/chosen": -0.22986668348312378,
      "rewards/margins": 2.912917137145996,
      "rewards/rejected": -3.1427838802337646,
      "step": 1726
    },
    {
      "epoch": 0.45,
      "grad_norm": 29.54648208618164,
      "kl": 0.0,
      "learning_rate": 2.7401203873331585e-07,
      "logps/chosen": -227.89065551757812,
      "logps/rejected": -245.01356506347656,
      "loss": 0.2869,
      "rewards/chosen": 2.4143357276916504,
      "rewards/margins": 6.6730875968933105,
      "rewards/rejected": -4.25875186920166,
      "step": 1727
    },
    {
      "epoch": 0.45,
      "grad_norm": 33.45779037475586,
      "kl": 0.0,
      "learning_rate": 2.7388118293640404e-07,
      "logps/chosen": -226.79429626464844,
      "logps/rejected": -247.69236755371094,
      "loss": 0.2158,
      "rewards/chosen": 0.29212290048599243,
      "rewards/margins": 3.065626859664917,
      "rewards/rejected": -2.7735040187835693,
      "step": 1728
    },
    {
      "epoch": 0.45,
      "grad_norm": 30.237781524658203,
      "kl": 0.0,
      "learning_rate": 2.7375032713949224e-07,
      "logps/chosen": -179.5382080078125,
      "logps/rejected": -395.1669006347656,
      "loss": 0.2689,
      "rewards/chosen": -0.9008402228355408,
      "rewards/margins": 4.793771266937256,
      "rewards/rejected": -5.694611549377441,
      "step": 1729
    },
    {
      "epoch": 0.45,
      "grad_norm": 46.390830993652344,
      "kl": 0.0,
      "learning_rate": 2.7361947134258044e-07,
      "logps/chosen": -272.663330078125,
      "logps/rejected": -317.2637634277344,
      "loss": 0.2308,
      "rewards/chosen": 1.130327582359314,
      "rewards/margins": 5.195208549499512,
      "rewards/rejected": -4.064880847930908,
      "step": 1730
    },
    {
      "epoch": 0.45,
      "grad_norm": 29.513668060302734,
      "kl": 0.0,
      "learning_rate": 2.7348861554566863e-07,
      "logps/chosen": -218.89425659179688,
      "logps/rejected": -167.77731323242188,
      "loss": 0.335,
      "rewards/chosen": 0.20457467436790466,
      "rewards/margins": 2.1242728233337402,
      "rewards/rejected": -1.9196981191635132,
      "step": 1731
    },
    {
      "epoch": 0.45,
      "grad_norm": 36.8236083984375,
      "kl": 0.0,
      "learning_rate": 2.733577597487569e-07,
      "logps/chosen": -163.4239501953125,
      "logps/rejected": -268.9611511230469,
      "loss": 0.2571,
      "rewards/chosen": 1.3878527879714966,
      "rewards/margins": 5.625244617462158,
      "rewards/rejected": -4.237391948699951,
      "step": 1732
    },
    {
      "epoch": 0.45,
      "grad_norm": 36.46755599975586,
      "kl": 0.0,
      "learning_rate": 2.732269039518451e-07,
      "logps/chosen": -219.85479736328125,
      "logps/rejected": -258.7679443359375,
      "loss": 0.286,
      "rewards/chosen": 0.35300034284591675,
      "rewards/margins": 3.6059610843658447,
      "rewards/rejected": -3.252960681915283,
      "step": 1733
    },
    {
      "epoch": 0.45,
      "grad_norm": 37.581642150878906,
      "kl": 0.0,
      "learning_rate": 2.7309604815493327e-07,
      "logps/chosen": -187.14065551757812,
      "logps/rejected": -280.6612548828125,
      "loss": 0.2148,
      "rewards/chosen": 0.6708101630210876,
      "rewards/margins": 4.311281204223633,
      "rewards/rejected": -3.6404712200164795,
      "step": 1734
    },
    {
      "epoch": 0.45,
      "grad_norm": 43.155517578125,
      "kl": 0.0,
      "learning_rate": 2.7296519235802147e-07,
      "logps/chosen": -223.60923767089844,
      "logps/rejected": -187.82534790039062,
      "loss": 0.2841,
      "rewards/chosen": 1.4644616842269897,
      "rewards/margins": 4.722436428070068,
      "rewards/rejected": -3.257974863052368,
      "step": 1735
    },
    {
      "epoch": 0.45,
      "grad_norm": 35.82574462890625,
      "kl": 0.0,
      "learning_rate": 2.7283433656110966e-07,
      "logps/chosen": -220.65611267089844,
      "logps/rejected": -231.52175903320312,
      "loss": 0.156,
      "rewards/chosen": 1.0742979049682617,
      "rewards/margins": 4.472984790802002,
      "rewards/rejected": -3.3986868858337402,
      "step": 1736
    },
    {
      "epoch": 0.45,
      "grad_norm": 36.58572769165039,
      "kl": 0.0,
      "learning_rate": 2.7270348076419786e-07,
      "logps/chosen": -241.86561584472656,
      "logps/rejected": -254.5074005126953,
      "loss": 0.2847,
      "rewards/chosen": -0.3873230814933777,
      "rewards/margins": 4.528369426727295,
      "rewards/rejected": -4.915692329406738,
      "step": 1737
    },
    {
      "epoch": 0.45,
      "grad_norm": 38.22736740112305,
      "kl": 0.0,
      "learning_rate": 2.7257262496728606e-07,
      "logps/chosen": -266.9388427734375,
      "logps/rejected": -192.8319091796875,
      "loss": 0.3288,
      "rewards/chosen": 0.3637565076351166,
      "rewards/margins": 3.0919291973114014,
      "rewards/rejected": -2.728172779083252,
      "step": 1738
    },
    {
      "epoch": 0.46,
      "grad_norm": 27.466691970825195,
      "kl": 0.0,
      "learning_rate": 2.7244176917037425e-07,
      "logps/chosen": -129.4464569091797,
      "logps/rejected": -252.09034729003906,
      "loss": 0.2556,
      "rewards/chosen": 1.1280877590179443,
      "rewards/margins": 5.6138916015625,
      "rewards/rejected": -4.485803604125977,
      "step": 1739
    },
    {
      "epoch": 0.46,
      "grad_norm": 31.164216995239258,
      "kl": 0.0,
      "learning_rate": 2.7231091337346245e-07,
      "logps/chosen": -205.83444213867188,
      "logps/rejected": -282.27777099609375,
      "loss": 0.3561,
      "rewards/chosen": 0.007905125617980957,
      "rewards/margins": 3.2738075256347656,
      "rewards/rejected": -3.265902519226074,
      "step": 1740
    },
    {
      "epoch": 0.46,
      "grad_norm": 39.175819396972656,
      "kl": 0.0,
      "learning_rate": 2.721800575765506e-07,
      "logps/chosen": -189.10853576660156,
      "logps/rejected": -349.58447265625,
      "loss": 0.1715,
      "rewards/chosen": 0.36496955156326294,
      "rewards/margins": 5.270139694213867,
      "rewards/rejected": -4.90516996383667,
      "step": 1741
    },
    {
      "epoch": 0.46,
      "grad_norm": 29.220705032348633,
      "kl": 0.0,
      "learning_rate": 2.720492017796388e-07,
      "logps/chosen": -239.35317993164062,
      "logps/rejected": -262.12664794921875,
      "loss": 0.3557,
      "rewards/chosen": -0.8873398303985596,
      "rewards/margins": 3.00297474861145,
      "rewards/rejected": -3.8903145790100098,
      "step": 1742
    },
    {
      "epoch": 0.46,
      "grad_norm": 30.227474212646484,
      "kl": 0.0,
      "learning_rate": 2.71918345982727e-07,
      "logps/chosen": -168.98941040039062,
      "logps/rejected": -206.9694366455078,
      "loss": 0.1996,
      "rewards/chosen": 0.669614851474762,
      "rewards/margins": 5.056918144226074,
      "rewards/rejected": -4.387303352355957,
      "step": 1743
    },
    {
      "epoch": 0.46,
      "grad_norm": 33.103397369384766,
      "kl": 0.0,
      "learning_rate": 2.717874901858152e-07,
      "logps/chosen": -265.4107971191406,
      "logps/rejected": -283.2393798828125,
      "loss": 0.3026,
      "rewards/chosen": -0.4820014238357544,
      "rewards/margins": 4.627739429473877,
      "rewards/rejected": -5.109740734100342,
      "step": 1744
    },
    {
      "epoch": 0.46,
      "grad_norm": 37.08218765258789,
      "kl": 0.0,
      "learning_rate": 2.7165663438890343e-07,
      "logps/chosen": -183.61221313476562,
      "logps/rejected": -359.53314208984375,
      "loss": 0.3758,
      "rewards/chosen": 0.01924392580986023,
      "rewards/margins": 2.7295875549316406,
      "rewards/rejected": -2.710343599319458,
      "step": 1745
    },
    {
      "epoch": 0.46,
      "grad_norm": 44.7434196472168,
      "kl": 0.0,
      "learning_rate": 2.715257785919916e-07,
      "logps/chosen": -260.5285339355469,
      "logps/rejected": -237.41989135742188,
      "loss": 0.3212,
      "rewards/chosen": 0.23573371767997742,
      "rewards/margins": 3.459974765777588,
      "rewards/rejected": -3.224241018295288,
      "step": 1746
    },
    {
      "epoch": 0.46,
      "grad_norm": 44.934471130371094,
      "kl": 0.0,
      "learning_rate": 2.713949227950798e-07,
      "logps/chosen": -170.7688751220703,
      "logps/rejected": -228.92611694335938,
      "loss": 0.3384,
      "rewards/chosen": 0.4019045829772949,
      "rewards/margins": 3.453260898590088,
      "rewards/rejected": -3.051356315612793,
      "step": 1747
    },
    {
      "epoch": 0.46,
      "grad_norm": 43.90964126586914,
      "kl": 0.0,
      "learning_rate": 2.71264066998168e-07,
      "logps/chosen": -199.80911254882812,
      "logps/rejected": -300.0240783691406,
      "loss": 0.3233,
      "rewards/chosen": -0.9098714590072632,
      "rewards/margins": 2.1194257736206055,
      "rewards/rejected": -3.029297351837158,
      "step": 1748
    },
    {
      "epoch": 0.46,
      "grad_norm": 35.426116943359375,
      "kl": 0.0,
      "learning_rate": 2.711332112012562e-07,
      "logps/chosen": -218.34246826171875,
      "logps/rejected": -288.5013732910156,
      "loss": 0.2749,
      "rewards/chosen": 0.5279122591018677,
      "rewards/margins": 4.1315693855285645,
      "rewards/rejected": -3.6036572456359863,
      "step": 1749
    },
    {
      "epoch": 0.46,
      "grad_norm": 40.60729217529297,
      "kl": 0.0,
      "learning_rate": 2.710023554043444e-07,
      "logps/chosen": -169.89242553710938,
      "logps/rejected": -271.1588134765625,
      "loss": 0.3116,
      "rewards/chosen": 0.8719571828842163,
      "rewards/margins": 4.5694427490234375,
      "rewards/rejected": -3.6974854469299316,
      "step": 1750
    },
    {
      "epoch": 0.46,
      "grad_norm": 35.51378631591797,
      "kl": 0.0,
      "learning_rate": 2.708714996074326e-07,
      "logps/chosen": -209.006103515625,
      "logps/rejected": -205.5970916748047,
      "loss": 0.2389,
      "rewards/chosen": -0.20795372128486633,
      "rewards/margins": 2.6677086353302,
      "rewards/rejected": -2.875662326812744,
      "step": 1751
    },
    {
      "epoch": 0.46,
      "grad_norm": 27.783546447753906,
      "kl": 0.0,
      "learning_rate": 2.707406438105208e-07,
      "logps/chosen": -197.99685668945312,
      "logps/rejected": -267.1977844238281,
      "loss": 0.1513,
      "rewards/chosen": 1.9851152896881104,
      "rewards/margins": 5.490628719329834,
      "rewards/rejected": -3.5055134296417236,
      "step": 1752
    },
    {
      "epoch": 0.46,
      "grad_norm": 34.98855209350586,
      "kl": 0.0,
      "learning_rate": 2.70609788013609e-07,
      "logps/chosen": -233.90687561035156,
      "logps/rejected": -234.8693389892578,
      "loss": 0.1836,
      "rewards/chosen": 0.2806207835674286,
      "rewards/margins": 4.119460582733154,
      "rewards/rejected": -3.8388397693634033,
      "step": 1753
    },
    {
      "epoch": 0.46,
      "grad_norm": 33.62569046020508,
      "kl": 0.0,
      "learning_rate": 2.704789322166972e-07,
      "logps/chosen": -250.41143798828125,
      "logps/rejected": -276.34075927734375,
      "loss": 0.2799,
      "rewards/chosen": 1.218125581741333,
      "rewards/margins": 6.132073402404785,
      "rewards/rejected": -4.913948059082031,
      "step": 1754
    },
    {
      "epoch": 0.46,
      "grad_norm": 28.395240783691406,
      "kl": 0.0,
      "learning_rate": 2.703480764197854e-07,
      "logps/chosen": -160.5257110595703,
      "logps/rejected": -197.5235137939453,
      "loss": 0.2393,
      "rewards/chosen": 0.7340290546417236,
      "rewards/margins": 5.709027290344238,
      "rewards/rejected": -4.974998474121094,
      "step": 1755
    },
    {
      "epoch": 0.46,
      "grad_norm": 31.5032901763916,
      "kl": 0.0,
      "learning_rate": 2.7021722062287353e-07,
      "logps/chosen": -313.3423767089844,
      "logps/rejected": -189.52862548828125,
      "loss": 0.4041,
      "rewards/chosen": 0.41564011573791504,
      "rewards/margins": 3.1868736743927,
      "rewards/rejected": -2.771233558654785,
      "step": 1756
    },
    {
      "epoch": 0.46,
      "grad_norm": 39.107112884521484,
      "kl": 0.0,
      "learning_rate": 2.7008636482596173e-07,
      "logps/chosen": -245.70704650878906,
      "logps/rejected": -172.10501098632812,
      "loss": 0.3122,
      "rewards/chosen": 0.5999122858047485,
      "rewards/margins": 3.459867000579834,
      "rewards/rejected": -2.859954833984375,
      "step": 1757
    },
    {
      "epoch": 0.46,
      "grad_norm": 32.38665771484375,
      "kl": 0.0,
      "learning_rate": 2.6995550902905e-07,
      "logps/chosen": -158.6022186279297,
      "logps/rejected": -247.68646240234375,
      "loss": 0.2896,
      "rewards/chosen": -1.248726725578308,
      "rewards/margins": 3.0644445419311523,
      "rewards/rejected": -4.31317138671875,
      "step": 1758
    },
    {
      "epoch": 0.46,
      "grad_norm": 39.92535400390625,
      "kl": 0.0,
      "learning_rate": 2.698246532321382e-07,
      "logps/chosen": -182.69566345214844,
      "logps/rejected": -335.43505859375,
      "loss": 0.1626,
      "rewards/chosen": 1.3375664949417114,
      "rewards/margins": 6.369048595428467,
      "rewards/rejected": -5.031482219696045,
      "step": 1759
    },
    {
      "epoch": 0.46,
      "grad_norm": 28.15011215209961,
      "kl": 0.0,
      "learning_rate": 2.6969379743522637e-07,
      "logps/chosen": -182.77301025390625,
      "logps/rejected": -227.71070861816406,
      "loss": 0.2726,
      "rewards/chosen": 0.5093837380409241,
      "rewards/margins": 4.273017883300781,
      "rewards/rejected": -3.763633966445923,
      "step": 1760
    },
    {
      "epoch": 0.46,
      "grad_norm": 45.147037506103516,
      "kl": 0.0,
      "learning_rate": 2.6956294163831457e-07,
      "logps/chosen": -178.78170776367188,
      "logps/rejected": -242.56234741210938,
      "loss": 0.3427,
      "rewards/chosen": 0.8544749021530151,
      "rewards/margins": 4.4317498207092285,
      "rewards/rejected": -3.577275037765503,
      "step": 1761
    },
    {
      "epoch": 0.46,
      "grad_norm": 42.30143356323242,
      "kl": 0.0,
      "learning_rate": 2.6943208584140276e-07,
      "logps/chosen": -325.6743469238281,
      "logps/rejected": -199.9776611328125,
      "loss": 0.2854,
      "rewards/chosen": 0.4093678891658783,
      "rewards/margins": 2.5241336822509766,
      "rewards/rejected": -2.1147658824920654,
      "step": 1762
    },
    {
      "epoch": 0.46,
      "grad_norm": 31.528766632080078,
      "kl": 0.0,
      "learning_rate": 2.6930123004449096e-07,
      "logps/chosen": -166.7415771484375,
      "logps/rejected": -233.80612182617188,
      "loss": 0.2507,
      "rewards/chosen": 0.9934232831001282,
      "rewards/margins": 5.178330898284912,
      "rewards/rejected": -4.18490743637085,
      "step": 1763
    },
    {
      "epoch": 0.46,
      "grad_norm": 28.84537696838379,
      "kl": 0.0,
      "learning_rate": 2.6917037424757916e-07,
      "logps/chosen": -225.96017456054688,
      "logps/rejected": -223.4440155029297,
      "loss": 0.1548,
      "rewards/chosen": 2.732337236404419,
      "rewards/margins": 7.202681541442871,
      "rewards/rejected": -4.470344543457031,
      "step": 1764
    },
    {
      "epoch": 0.46,
      "grad_norm": 42.9340705871582,
      "kl": 0.0,
      "learning_rate": 2.6903951845066735e-07,
      "logps/chosen": -250.65216064453125,
      "logps/rejected": -240.86422729492188,
      "loss": 0.3431,
      "rewards/chosen": 0.7281718254089355,
      "rewards/margins": 2.66864013671875,
      "rewards/rejected": -1.9404683113098145,
      "step": 1765
    },
    {
      "epoch": 0.46,
      "grad_norm": 23.091651916503906,
      "kl": 0.0,
      "learning_rate": 2.6890866265375555e-07,
      "logps/chosen": -188.62490844726562,
      "logps/rejected": -295.15179443359375,
      "loss": 0.2365,
      "rewards/chosen": 1.522935152053833,
      "rewards/margins": 4.846446990966797,
      "rewards/rejected": -3.323512077331543,
      "step": 1766
    },
    {
      "epoch": 0.46,
      "grad_norm": 34.22391128540039,
      "kl": 0.0,
      "learning_rate": 2.6877780685684374e-07,
      "logps/chosen": -237.4374237060547,
      "logps/rejected": -255.03697204589844,
      "loss": 0.259,
      "rewards/chosen": 1.1581065654754639,
      "rewards/margins": 5.033398628234863,
      "rewards/rejected": -3.8752918243408203,
      "step": 1767
    },
    {
      "epoch": 0.46,
      "grad_norm": 32.2965202331543,
      "kl": 0.0,
      "learning_rate": 2.6864695105993194e-07,
      "logps/chosen": -227.5531005859375,
      "logps/rejected": -224.8191680908203,
      "loss": 0.3165,
      "rewards/chosen": 0.9469077587127686,
      "rewards/margins": 4.652983665466309,
      "rewards/rejected": -3.706075668334961,
      "step": 1768
    },
    {
      "epoch": 0.46,
      "grad_norm": 32.94017791748047,
      "kl": 0.0,
      "learning_rate": 2.6851609526302014e-07,
      "logps/chosen": -219.24993896484375,
      "logps/rejected": -199.1254119873047,
      "loss": 0.2666,
      "rewards/chosen": 0.9429276585578918,
      "rewards/margins": 5.1648478507995605,
      "rewards/rejected": -4.221920013427734,
      "step": 1769
    },
    {
      "epoch": 0.46,
      "grad_norm": 22.41168975830078,
      "kl": 0.0,
      "learning_rate": 2.683852394661084e-07,
      "logps/chosen": -125.49591064453125,
      "logps/rejected": -276.9358215332031,
      "loss": 0.2313,
      "rewards/chosen": 0.19119969010353088,
      "rewards/margins": 4.500891208648682,
      "rewards/rejected": -4.309691429138184,
      "step": 1770
    },
    {
      "epoch": 0.46,
      "grad_norm": 42.297550201416016,
      "kl": 0.0,
      "learning_rate": 2.682543836691966e-07,
      "logps/chosen": -145.0494384765625,
      "logps/rejected": -273.5556640625,
      "loss": 0.2749,
      "rewards/chosen": 0.5761336088180542,
      "rewards/margins": 4.615227699279785,
      "rewards/rejected": -4.039093971252441,
      "step": 1771
    },
    {
      "epoch": 0.46,
      "grad_norm": 34.109439849853516,
      "kl": 0.0,
      "learning_rate": 2.681235278722847e-07,
      "logps/chosen": -151.40411376953125,
      "logps/rejected": -197.5521240234375,
      "loss": 0.2645,
      "rewards/chosen": -0.12928418815135956,
      "rewards/margins": 2.1960153579711914,
      "rewards/rejected": -2.3252995014190674,
      "step": 1772
    },
    {
      "epoch": 0.46,
      "grad_norm": 32.29287338256836,
      "kl": 0.0,
      "learning_rate": 2.679926720753729e-07,
      "logps/chosen": -236.3354949951172,
      "logps/rejected": -219.23182678222656,
      "loss": 0.2023,
      "rewards/chosen": 1.2813626527786255,
      "rewards/margins": 3.9799628257751465,
      "rewards/rejected": -2.6986002922058105,
      "step": 1773
    },
    {
      "epoch": 0.46,
      "grad_norm": 31.360984802246094,
      "kl": 0.0,
      "learning_rate": 2.678618162784611e-07,
      "logps/chosen": -206.96392822265625,
      "logps/rejected": -266.0826416015625,
      "loss": 0.2598,
      "rewards/chosen": 0.18017417192459106,
      "rewards/margins": 3.8584301471710205,
      "rewards/rejected": -3.678256034851074,
      "step": 1774
    },
    {
      "epoch": 0.46,
      "grad_norm": 38.270694732666016,
      "kl": 0.0,
      "learning_rate": 2.677309604815493e-07,
      "logps/chosen": -201.98492431640625,
      "logps/rejected": -284.6060485839844,
      "loss": 0.3326,
      "rewards/chosen": 0.169302299618721,
      "rewards/margins": 2.283196210861206,
      "rewards/rejected": -2.113893985748291,
      "step": 1775
    },
    {
      "epoch": 0.46,
      "grad_norm": 27.302358627319336,
      "kl": 0.0,
      "learning_rate": 2.676001046846375e-07,
      "logps/chosen": -152.28955078125,
      "logps/rejected": -331.47662353515625,
      "loss": 0.1946,
      "rewards/chosen": 0.48952898383140564,
      "rewards/margins": 4.17086124420166,
      "rewards/rejected": -3.6813323497772217,
      "step": 1776
    },
    {
      "epoch": 0.47,
      "grad_norm": 28.06827163696289,
      "kl": 0.0,
      "learning_rate": 2.674692488877257e-07,
      "logps/chosen": -253.61843872070312,
      "logps/rejected": -256.72442626953125,
      "loss": 0.3883,
      "rewards/chosen": -0.4867596924304962,
      "rewards/margins": 3.532467842102051,
      "rewards/rejected": -4.019227504730225,
      "step": 1777
    },
    {
      "epoch": 0.47,
      "grad_norm": 28.32488250732422,
      "kl": 0.0,
      "learning_rate": 2.673383930908139e-07,
      "logps/chosen": -273.54400634765625,
      "logps/rejected": -182.26773071289062,
      "loss": 0.3289,
      "rewards/chosen": 0.13828414678573608,
      "rewards/margins": 2.294508934020996,
      "rewards/rejected": -2.1562247276306152,
      "step": 1778
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.519807815551758,
      "kl": 0.0,
      "learning_rate": 2.672075372939021e-07,
      "logps/chosen": -183.9783477783203,
      "logps/rejected": -324.3849792480469,
      "loss": 0.3371,
      "rewards/chosen": -0.7770126461982727,
      "rewards/margins": 5.614282131195068,
      "rewards/rejected": -6.391294956207275,
      "step": 1779
    },
    {
      "epoch": 0.47,
      "grad_norm": 35.09743881225586,
      "kl": 0.0,
      "learning_rate": 2.670766814969903e-07,
      "logps/chosen": -126.46038055419922,
      "logps/rejected": -287.030517578125,
      "loss": 0.3135,
      "rewards/chosen": 0.18409352004528046,
      "rewards/margins": 3.7861502170562744,
      "rewards/rejected": -3.6020567417144775,
      "step": 1780
    },
    {
      "epoch": 0.47,
      "grad_norm": 36.5549201965332,
      "kl": 0.0,
      "learning_rate": 2.669458257000785e-07,
      "logps/chosen": -218.15708923339844,
      "logps/rejected": -216.2779998779297,
      "loss": 0.2754,
      "rewards/chosen": 0.7770873308181763,
      "rewards/margins": 3.693025588989258,
      "rewards/rejected": -2.915938138961792,
      "step": 1781
    },
    {
      "epoch": 0.47,
      "grad_norm": 34.714717864990234,
      "kl": 0.0,
      "learning_rate": 2.668149699031667e-07,
      "logps/chosen": -266.4847412109375,
      "logps/rejected": -261.268798828125,
      "loss": 0.3335,
      "rewards/chosen": 0.46225249767303467,
      "rewards/margins": 2.131314992904663,
      "rewards/rejected": -1.6690624952316284,
      "step": 1782
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.801822662353516,
      "kl": 0.0,
      "learning_rate": 2.6668411410625493e-07,
      "logps/chosen": -214.0922393798828,
      "logps/rejected": -276.224609375,
      "loss": 0.3039,
      "rewards/chosen": 0.2218581587076187,
      "rewards/margins": 3.801252603530884,
      "rewards/rejected": -3.5793943405151367,
      "step": 1783
    },
    {
      "epoch": 0.47,
      "grad_norm": 27.524829864501953,
      "kl": 0.0,
      "learning_rate": 2.6655325830934313e-07,
      "logps/chosen": -186.962890625,
      "logps/rejected": -252.66331481933594,
      "loss": 0.2315,
      "rewards/chosen": -0.20698793232440948,
      "rewards/margins": 4.5711774826049805,
      "rewards/rejected": -4.778165340423584,
      "step": 1784
    },
    {
      "epoch": 0.47,
      "grad_norm": 28.168418884277344,
      "kl": 0.0,
      "learning_rate": 2.664224025124313e-07,
      "logps/chosen": -116.75592803955078,
      "logps/rejected": -254.7372283935547,
      "loss": 0.2833,
      "rewards/chosen": -0.1261283904314041,
      "rewards/margins": 1.8637630939483643,
      "rewards/rejected": -1.989891529083252,
      "step": 1785
    },
    {
      "epoch": 0.47,
      "grad_norm": 30.8496036529541,
      "kl": 0.0,
      "learning_rate": 2.662915467155195e-07,
      "logps/chosen": -305.5096130371094,
      "logps/rejected": -225.94699096679688,
      "loss": 0.1714,
      "rewards/chosen": 3.5320422649383545,
      "rewards/margins": 7.76275634765625,
      "rewards/rejected": -4.230713844299316,
      "step": 1786
    },
    {
      "epoch": 0.47,
      "grad_norm": 30.151742935180664,
      "kl": 0.0,
      "learning_rate": 2.6616069091860767e-07,
      "logps/chosen": -148.5421905517578,
      "logps/rejected": -241.40003967285156,
      "loss": 0.2474,
      "rewards/chosen": 1.5538842678070068,
      "rewards/margins": 4.725393295288086,
      "rewards/rejected": -3.171509027481079,
      "step": 1787
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.541250228881836,
      "kl": 0.0,
      "learning_rate": 2.6602983512169586e-07,
      "logps/chosen": -237.12210083007812,
      "logps/rejected": -302.9940490722656,
      "loss": 0.2254,
      "rewards/chosen": 0.5165008902549744,
      "rewards/margins": 7.595618724822998,
      "rewards/rejected": -7.079117774963379,
      "step": 1788
    },
    {
      "epoch": 0.47,
      "grad_norm": 26.707609176635742,
      "kl": 0.0,
      "learning_rate": 2.6589897932478406e-07,
      "logps/chosen": -168.9550018310547,
      "logps/rejected": -295.4699401855469,
      "loss": 0.2857,
      "rewards/chosen": 1.3008852005004883,
      "rewards/margins": 5.830302715301514,
      "rewards/rejected": -4.529417514801025,
      "step": 1789
    },
    {
      "epoch": 0.47,
      "grad_norm": 36.683982849121094,
      "kl": 0.0,
      "learning_rate": 2.6576812352787225e-07,
      "logps/chosen": -150.46591186523438,
      "logps/rejected": -191.10302734375,
      "loss": 0.361,
      "rewards/chosen": -0.044897012412548065,
      "rewards/margins": 2.294562339782715,
      "rewards/rejected": -2.3394594192504883,
      "step": 1790
    },
    {
      "epoch": 0.47,
      "grad_norm": 34.28142166137695,
      "kl": 0.0,
      "learning_rate": 2.6563726773096045e-07,
      "logps/chosen": -201.0179901123047,
      "logps/rejected": -262.7911071777344,
      "loss": 0.2706,
      "rewards/chosen": -0.44374069571495056,
      "rewards/margins": 6.126780986785889,
      "rewards/rejected": -6.570521831512451,
      "step": 1791
    },
    {
      "epoch": 0.47,
      "grad_norm": 28.13138771057129,
      "kl": 0.0,
      "learning_rate": 2.6550641193404865e-07,
      "logps/chosen": -272.98065185546875,
      "logps/rejected": -279.70684814453125,
      "loss": 0.2232,
      "rewards/chosen": 1.1704599857330322,
      "rewards/margins": 5.974841117858887,
      "rewards/rejected": -4.804380893707275,
      "step": 1792
    },
    {
      "epoch": 0.47,
      "grad_norm": 36.26626968383789,
      "kl": 0.0,
      "learning_rate": 2.6537555613713684e-07,
      "logps/chosen": -191.0433349609375,
      "logps/rejected": -271.6612854003906,
      "loss": 0.2392,
      "rewards/chosen": 1.1138032674789429,
      "rewards/margins": 4.200105667114258,
      "rewards/rejected": -3.0863025188446045,
      "step": 1793
    },
    {
      "epoch": 0.47,
      "grad_norm": 41.627620697021484,
      "kl": 0.0,
      "learning_rate": 2.6524470034022504e-07,
      "logps/chosen": -176.78253173828125,
      "logps/rejected": -245.73622131347656,
      "loss": 0.3471,
      "rewards/chosen": -1.435298204421997,
      "rewards/margins": 1.6505019664764404,
      "rewards/rejected": -3.0858001708984375,
      "step": 1794
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.305988311767578,
      "kl": 0.0,
      "learning_rate": 2.6511384454331323e-07,
      "logps/chosen": -209.2613983154297,
      "logps/rejected": -171.62149047851562,
      "loss": 0.2582,
      "rewards/chosen": 0.047622814774513245,
      "rewards/margins": 2.757302761077881,
      "rewards/rejected": -2.7096798419952393,
      "step": 1795
    },
    {
      "epoch": 0.47,
      "grad_norm": 33.22182083129883,
      "kl": 0.0,
      "learning_rate": 2.649829887464015e-07,
      "logps/chosen": -156.6744384765625,
      "logps/rejected": -246.99066162109375,
      "loss": 0.3714,
      "rewards/chosen": -1.121241569519043,
      "rewards/margins": 4.636855125427246,
      "rewards/rejected": -5.758096694946289,
      "step": 1796
    },
    {
      "epoch": 0.47,
      "grad_norm": 33.60297775268555,
      "kl": 0.0,
      "learning_rate": 2.648521329494897e-07,
      "logps/chosen": -240.12210083007812,
      "logps/rejected": -282.920654296875,
      "loss": 0.2713,
      "rewards/chosen": 1.0724228620529175,
      "rewards/margins": 4.727004528045654,
      "rewards/rejected": -3.6545817852020264,
      "step": 1797
    },
    {
      "epoch": 0.47,
      "grad_norm": 37.307891845703125,
      "kl": 0.0,
      "learning_rate": 2.647212771525779e-07,
      "logps/chosen": -168.95399475097656,
      "logps/rejected": -182.30612182617188,
      "loss": 0.1923,
      "rewards/chosen": 1.4341641664505005,
      "rewards/margins": 4.384321689605713,
      "rewards/rejected": -2.950157642364502,
      "step": 1798
    },
    {
      "epoch": 0.47,
      "grad_norm": 39.907894134521484,
      "kl": 0.0,
      "learning_rate": 2.6459042135566607e-07,
      "logps/chosen": -244.38983154296875,
      "logps/rejected": -295.0066833496094,
      "loss": 0.2557,
      "rewards/chosen": 1.7292518615722656,
      "rewards/margins": 4.0041303634643555,
      "rewards/rejected": -2.27487850189209,
      "step": 1799
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.53258514404297,
      "kl": 0.0,
      "learning_rate": 2.6445956555875427e-07,
      "logps/chosen": -147.8897705078125,
      "logps/rejected": -284.5010070800781,
      "loss": 0.3865,
      "rewards/chosen": -0.3604433536529541,
      "rewards/margins": 1.9473583698272705,
      "rewards/rejected": -2.3078017234802246,
      "step": 1800
    },
    {
      "epoch": 0.47,
      "grad_norm": 39.97542190551758,
      "kl": 0.0,
      "learning_rate": 2.6432870976184246e-07,
      "logps/chosen": -262.21710205078125,
      "logps/rejected": -261.5345458984375,
      "loss": 0.269,
      "rewards/chosen": 0.1343774050474167,
      "rewards/margins": 2.3693342208862305,
      "rewards/rejected": -2.234956741333008,
      "step": 1801
    },
    {
      "epoch": 0.47,
      "grad_norm": 42.28630828857422,
      "kl": 0.0,
      "learning_rate": 2.6419785396493066e-07,
      "logps/chosen": -169.6084442138672,
      "logps/rejected": -211.6229248046875,
      "loss": 0.4199,
      "rewards/chosen": -0.3721810281276703,
      "rewards/margins": 2.8687891960144043,
      "rewards/rejected": -3.2409701347351074,
      "step": 1802
    },
    {
      "epoch": 0.47,
      "grad_norm": 28.243871688842773,
      "kl": 0.0,
      "learning_rate": 2.640669981680188e-07,
      "logps/chosen": -220.0828857421875,
      "logps/rejected": -210.42112731933594,
      "loss": 0.3123,
      "rewards/chosen": -0.20898278057575226,
      "rewards/margins": 2.981920003890991,
      "rewards/rejected": -3.1909027099609375,
      "step": 1803
    },
    {
      "epoch": 0.47,
      "grad_norm": 29.4338436126709,
      "kl": 0.0,
      "learning_rate": 2.63936142371107e-07,
      "logps/chosen": -184.46368408203125,
      "logps/rejected": -190.14401245117188,
      "loss": 0.337,
      "rewards/chosen": 0.3699902296066284,
      "rewards/margins": 2.436811923980713,
      "rewards/rejected": -2.066821575164795,
      "step": 1804
    },
    {
      "epoch": 0.47,
      "grad_norm": 31.90092658996582,
      "kl": 0.0,
      "learning_rate": 2.638052865741952e-07,
      "logps/chosen": -219.99734497070312,
      "logps/rejected": -307.57318115234375,
      "loss": 0.2219,
      "rewards/chosen": 1.241980791091919,
      "rewards/margins": 6.160677909851074,
      "rewards/rejected": -4.918696880340576,
      "step": 1805
    },
    {
      "epoch": 0.47,
      "grad_norm": 34.10798645019531,
      "kl": 0.0,
      "learning_rate": 2.636744307772834e-07,
      "logps/chosen": -262.48138427734375,
      "logps/rejected": -194.05780029296875,
      "loss": 0.2215,
      "rewards/chosen": -1.38141667842865,
      "rewards/margins": 1.2182193994522095,
      "rewards/rejected": -2.5996360778808594,
      "step": 1806
    },
    {
      "epoch": 0.47,
      "grad_norm": 37.151607513427734,
      "kl": 0.0,
      "learning_rate": 2.635435749803716e-07,
      "logps/chosen": -222.68251037597656,
      "logps/rejected": -282.22100830078125,
      "loss": 0.197,
      "rewards/chosen": 0.9315857887268066,
      "rewards/margins": 4.866264820098877,
      "rewards/rejected": -3.9346790313720703,
      "step": 1807
    },
    {
      "epoch": 0.47,
      "grad_norm": 45.96467208862305,
      "kl": 0.0,
      "learning_rate": 2.634127191834598e-07,
      "logps/chosen": -260.59710693359375,
      "logps/rejected": -211.7723388671875,
      "loss": 0.3401,
      "rewards/chosen": 0.4457830488681793,
      "rewards/margins": 2.8025193214416504,
      "rewards/rejected": -2.356736183166504,
      "step": 1808
    },
    {
      "epoch": 0.47,
      "grad_norm": 36.53901672363281,
      "kl": 0.0,
      "learning_rate": 2.6328186338654803e-07,
      "logps/chosen": -229.66275024414062,
      "logps/rejected": -205.681396484375,
      "loss": 0.4063,
      "rewards/chosen": -0.6894323229789734,
      "rewards/margins": 1.5290522575378418,
      "rewards/rejected": -2.21848464012146,
      "step": 1809
    },
    {
      "epoch": 0.47,
      "grad_norm": 38.05723190307617,
      "kl": 0.0,
      "learning_rate": 2.6315100758963623e-07,
      "logps/chosen": -233.05426025390625,
      "logps/rejected": -192.11688232421875,
      "loss": 0.3905,
      "rewards/chosen": -0.518449068069458,
      "rewards/margins": 1.3643622398376465,
      "rewards/rejected": -1.8828113079071045,
      "step": 1810
    },
    {
      "epoch": 0.47,
      "grad_norm": 39.44814682006836,
      "kl": 0.0,
      "learning_rate": 2.630201517927244e-07,
      "logps/chosen": -165.95632934570312,
      "logps/rejected": -271.4263916015625,
      "loss": 0.2897,
      "rewards/chosen": 0.6428464651107788,
      "rewards/margins": 4.1496405601501465,
      "rewards/rejected": -3.5067942142486572,
      "step": 1811
    },
    {
      "epoch": 0.47,
      "grad_norm": 30.81804656982422,
      "kl": 0.0,
      "learning_rate": 2.628892959958126e-07,
      "logps/chosen": -235.5777587890625,
      "logps/rejected": -223.3262939453125,
      "loss": 0.2791,
      "rewards/chosen": 0.08995027095079422,
      "rewards/margins": 3.556304931640625,
      "rewards/rejected": -3.4663546085357666,
      "step": 1812
    },
    {
      "epoch": 0.47,
      "grad_norm": 33.08573913574219,
      "kl": 0.0,
      "learning_rate": 2.627584401989008e-07,
      "logps/chosen": -225.62367248535156,
      "logps/rejected": -210.410400390625,
      "loss": 0.2657,
      "rewards/chosen": -0.42150986194610596,
      "rewards/margins": 2.2237415313720703,
      "rewards/rejected": -2.645251512527466,
      "step": 1813
    },
    {
      "epoch": 0.47,
      "grad_norm": 32.53742218017578,
      "kl": 0.0,
      "learning_rate": 2.62627584401989e-07,
      "logps/chosen": -182.93365478515625,
      "logps/rejected": -269.56402587890625,
      "loss": 0.3082,
      "rewards/chosen": 0.7558895349502563,
      "rewards/margins": 4.601802349090576,
      "rewards/rejected": -3.8459129333496094,
      "step": 1814
    },
    {
      "epoch": 0.48,
      "grad_norm": 35.428428649902344,
      "kl": 0.0,
      "learning_rate": 2.624967286050772e-07,
      "logps/chosen": -179.17361450195312,
      "logps/rejected": -357.9075012207031,
      "loss": 0.3118,
      "rewards/chosen": -0.028548792004585266,
      "rewards/margins": 4.181148052215576,
      "rewards/rejected": -4.2096967697143555,
      "step": 1815
    },
    {
      "epoch": 0.48,
      "grad_norm": 32.03295135498047,
      "kl": 0.0,
      "learning_rate": 2.623658728081654e-07,
      "logps/chosen": -155.4049530029297,
      "logps/rejected": -197.88656616210938,
      "loss": 0.2179,
      "rewards/chosen": 1.8276541233062744,
      "rewards/margins": 4.273235321044922,
      "rewards/rejected": -2.4455809593200684,
      "step": 1816
    },
    {
      "epoch": 0.48,
      "grad_norm": 28.16794204711914,
      "kl": 0.0,
      "learning_rate": 2.622350170112536e-07,
      "logps/chosen": -196.6329803466797,
      "logps/rejected": -276.8796691894531,
      "loss": 0.3296,
      "rewards/chosen": -0.34935376048088074,
      "rewards/margins": 2.847820281982422,
      "rewards/rejected": -3.197174072265625,
      "step": 1817
    },
    {
      "epoch": 0.48,
      "grad_norm": 24.694307327270508,
      "kl": 0.0,
      "learning_rate": 2.6210416121434174e-07,
      "logps/chosen": -167.74771118164062,
      "logps/rejected": -248.10716247558594,
      "loss": 0.2452,
      "rewards/chosen": 0.8411151170730591,
      "rewards/margins": 4.5710296630859375,
      "rewards/rejected": -3.729914665222168,
      "step": 1818
    },
    {
      "epoch": 0.48,
      "grad_norm": 32.40667724609375,
      "kl": 0.0,
      "learning_rate": 2.6197330541742994e-07,
      "logps/chosen": -167.05648803710938,
      "logps/rejected": -260.3861083984375,
      "loss": 0.3702,
      "rewards/chosen": -0.3283267021179199,
      "rewards/margins": 1.5936791896820068,
      "rewards/rejected": -1.9220058917999268,
      "step": 1819
    },
    {
      "epoch": 0.48,
      "grad_norm": 29.981260299682617,
      "kl": 0.0,
      "learning_rate": 2.6184244962051814e-07,
      "logps/chosen": -205.59954833984375,
      "logps/rejected": -240.790771484375,
      "loss": 0.3203,
      "rewards/chosen": 0.49631327390670776,
      "rewards/margins": 3.0999534130096436,
      "rewards/rejected": -2.603640079498291,
      "step": 1820
    },
    {
      "epoch": 0.48,
      "grad_norm": 39.830020904541016,
      "kl": 0.0,
      "learning_rate": 2.6171159382360633e-07,
      "logps/chosen": -309.5193176269531,
      "logps/rejected": -225.8451690673828,
      "loss": 0.4059,
      "rewards/chosen": -0.7414736747741699,
      "rewards/margins": 1.5210020542144775,
      "rewards/rejected": -2.2624757289886475,
      "step": 1821
    },
    {
      "epoch": 0.48,
      "grad_norm": 28.099843978881836,
      "kl": 0.0,
      "learning_rate": 2.615807380266946e-07,
      "logps/chosen": -139.35086059570312,
      "logps/rejected": -258.74658203125,
      "loss": 0.3531,
      "rewards/chosen": 0.06932894885540009,
      "rewards/margins": 3.5908191204071045,
      "rewards/rejected": -3.5214900970458984,
      "step": 1822
    },
    {
      "epoch": 0.48,
      "grad_norm": 33.78807067871094,
      "kl": 0.0,
      "learning_rate": 2.614498822297828e-07,
      "logps/chosen": -235.4106903076172,
      "logps/rejected": -252.2579803466797,
      "loss": 0.2059,
      "rewards/chosen": 1.0817266702651978,
      "rewards/margins": 3.7266507148742676,
      "rewards/rejected": -2.6449241638183594,
      "step": 1823
    },
    {
      "epoch": 0.48,
      "grad_norm": 26.09669303894043,
      "kl": 0.0,
      "learning_rate": 2.61319026432871e-07,
      "logps/chosen": -201.91397094726562,
      "logps/rejected": -203.9112091064453,
      "loss": 0.2103,
      "rewards/chosen": 0.38464394211769104,
      "rewards/margins": 4.490221977233887,
      "rewards/rejected": -4.1055779457092285,
      "step": 1824
    },
    {
      "epoch": 0.48,
      "grad_norm": 32.747703552246094,
      "kl": 0.0,
      "learning_rate": 2.6118817063595917e-07,
      "logps/chosen": -130.88514709472656,
      "logps/rejected": -275.74755859375,
      "loss": 0.2718,
      "rewards/chosen": 0.1479092687368393,
      "rewards/margins": 5.113210201263428,
      "rewards/rejected": -4.965301036834717,
      "step": 1825
    },
    {
      "epoch": 0.48,
      "grad_norm": 33.221134185791016,
      "kl": 0.0,
      "learning_rate": 2.6105731483904737e-07,
      "logps/chosen": -268.6279296875,
      "logps/rejected": -209.37034606933594,
      "loss": 0.2728,
      "rewards/chosen": -0.01809675432741642,
      "rewards/margins": 3.1155669689178467,
      "rewards/rejected": -3.1336636543273926,
      "step": 1826
    },
    {
      "epoch": 0.48,
      "grad_norm": 33.30061340332031,
      "kl": 0.0,
      "learning_rate": 2.6092645904213556e-07,
      "logps/chosen": -270.9553527832031,
      "logps/rejected": -303.281005859375,
      "loss": 0.2219,
      "rewards/chosen": 1.86402428150177,
      "rewards/margins": 5.579380989074707,
      "rewards/rejected": -3.7153568267822266,
      "step": 1827
    },
    {
      "epoch": 0.48,
      "grad_norm": 35.51240921020508,
      "kl": 0.0,
      "learning_rate": 2.6079560324522376e-07,
      "logps/chosen": -212.87295532226562,
      "logps/rejected": -247.5990753173828,
      "loss": 0.2996,
      "rewards/chosen": 0.5374855399131775,
      "rewards/margins": 4.391886234283447,
      "rewards/rejected": -3.854400634765625,
      "step": 1828
    },
    {
      "epoch": 0.48,
      "grad_norm": 43.96573257446289,
      "kl": 0.0,
      "learning_rate": 2.6066474744831195e-07,
      "logps/chosen": -307.2767333984375,
      "logps/rejected": -211.69451904296875,
      "loss": 0.338,
      "rewards/chosen": -0.34222811460494995,
      "rewards/margins": 2.2243869304656982,
      "rewards/rejected": -2.566615104675293,
      "step": 1829
    },
    {
      "epoch": 0.48,
      "grad_norm": 34.17534255981445,
      "kl": 0.0,
      "learning_rate": 2.6053389165140015e-07,
      "logps/chosen": -189.63507080078125,
      "logps/rejected": -239.6513214111328,
      "loss": 0.2713,
      "rewards/chosen": -0.28839311003685,
      "rewards/margins": 3.142786741256714,
      "rewards/rejected": -3.4311797618865967,
      "step": 1830
    },
    {
      "epoch": 0.48,
      "grad_norm": 34.881080627441406,
      "kl": 0.0,
      "learning_rate": 2.6040303585448835e-07,
      "logps/chosen": -201.80218505859375,
      "logps/rejected": -237.1852569580078,
      "loss": 0.2662,
      "rewards/chosen": 0.23056668043136597,
      "rewards/margins": 3.316105604171753,
      "rewards/rejected": -3.085538864135742,
      "step": 1831
    },
    {
      "epoch": 0.48,
      "grad_norm": 35.58689880371094,
      "kl": 0.0,
      "learning_rate": 2.6027218005757654e-07,
      "logps/chosen": -205.9733123779297,
      "logps/rejected": -214.0948944091797,
      "loss": 0.3541,
      "rewards/chosen": 0.2090885192155838,
      "rewards/margins": 3.7775912284851074,
      "rewards/rejected": -3.56850266456604,
      "step": 1832
    },
    {
      "epoch": 0.48,
      "grad_norm": 46.46804428100586,
      "kl": 0.0,
      "learning_rate": 2.6014132426066474e-07,
      "logps/chosen": -243.96986389160156,
      "logps/rejected": -321.12237548828125,
      "loss": 0.2543,
      "rewards/chosen": 0.33036065101623535,
      "rewards/margins": 4.61346435546875,
      "rewards/rejected": -4.283103942871094,
      "step": 1833
    },
    {
      "epoch": 0.48,
      "grad_norm": 37.85405349731445,
      "kl": 0.0,
      "learning_rate": 2.600104684637529e-07,
      "logps/chosen": -262.3682556152344,
      "logps/rejected": -252.35533142089844,
      "loss": 0.2585,
      "rewards/chosen": 0.8365837335586548,
      "rewards/margins": 3.0271658897399902,
      "rewards/rejected": -2.190582036972046,
      "step": 1834
    },
    {
      "epoch": 0.48,
      "grad_norm": 39.19338607788086,
      "kl": 0.0,
      "learning_rate": 2.5987961266684113e-07,
      "logps/chosen": -260.9456481933594,
      "logps/rejected": -229.8648681640625,
      "loss": 0.3005,
      "rewards/chosen": 1.6565935611724854,
      "rewards/margins": 4.466734409332275,
      "rewards/rejected": -2.81014084815979,
      "step": 1835
    },
    {
      "epoch": 0.48,
      "grad_norm": 29.94069480895996,
      "kl": 0.0,
      "learning_rate": 2.5974875686992933e-07,
      "logps/chosen": -269.1368103027344,
      "logps/rejected": -189.29193115234375,
      "loss": 0.2599,
      "rewards/chosen": -0.17925623059272766,
      "rewards/margins": 3.1845247745513916,
      "rewards/rejected": -3.363780975341797,
      "step": 1836
    },
    {
      "epoch": 0.48,
      "grad_norm": 29.240211486816406,
      "kl": 0.0,
      "learning_rate": 2.596179010730175e-07,
      "logps/chosen": -244.43478393554688,
      "logps/rejected": -251.92523193359375,
      "loss": 0.3398,
      "rewards/chosen": -0.38684120774269104,
      "rewards/margins": 5.424040794372559,
      "rewards/rejected": -5.810882091522217,
      "step": 1837
    },
    {
      "epoch": 0.48,
      "grad_norm": 37.174434661865234,
      "kl": 0.0,
      "learning_rate": 2.594870452761057e-07,
      "logps/chosen": -188.85650634765625,
      "logps/rejected": -329.2757873535156,
      "loss": 0.4007,
      "rewards/chosen": -0.41793757677078247,
      "rewards/margins": 3.9053256511688232,
      "rewards/rejected": -4.323263168334961,
      "step": 1838
    },
    {
      "epoch": 0.48,
      "grad_norm": 41.535221099853516,
      "kl": 0.0,
      "learning_rate": 2.593561894791939e-07,
      "logps/chosen": -244.72862243652344,
      "logps/rejected": -299.87762451171875,
      "loss": 0.223,
      "rewards/chosen": 1.2217128276824951,
      "rewards/margins": 5.825761795043945,
      "rewards/rejected": -4.604048728942871,
      "step": 1839
    },
    {
      "epoch": 0.48,
      "grad_norm": 28.986780166625977,
      "kl": 0.0,
      "learning_rate": 2.592253336822821e-07,
      "logps/chosen": -204.30938720703125,
      "logps/rejected": -151.59121704101562,
      "loss": 0.3252,
      "rewards/chosen": 1.5701173543930054,
      "rewards/margins": 3.4767918586730957,
      "rewards/rejected": -1.9066746234893799,
      "step": 1840
    },
    {
      "epoch": 0.48,
      "grad_norm": 36.32473373413086,
      "kl": 0.0,
      "learning_rate": 2.590944778853703e-07,
      "logps/chosen": -207.05661010742188,
      "logps/rejected": -250.57901000976562,
      "loss": 0.4133,
      "rewards/chosen": -0.5402141809463501,
      "rewards/margins": 1.7857950925827026,
      "rewards/rejected": -2.3260092735290527,
      "step": 1841
    },
    {
      "epoch": 0.48,
      "grad_norm": 38.50576400756836,
      "kl": 0.0,
      "learning_rate": 2.589636220884585e-07,
      "logps/chosen": -277.8896484375,
      "logps/rejected": -229.23648071289062,
      "loss": 0.2772,
      "rewards/chosen": 0.07086262106895447,
      "rewards/margins": 4.372156620025635,
      "rewards/rejected": -4.301293849945068,
      "step": 1842
    },
    {
      "epoch": 0.48,
      "grad_norm": 31.36016845703125,
      "kl": 0.0,
      "learning_rate": 2.588327662915467e-07,
      "logps/chosen": -140.7576446533203,
      "logps/rejected": -238.2123260498047,
      "loss": 0.2594,
      "rewards/chosen": 1.0603423118591309,
      "rewards/margins": 2.653862476348877,
      "rewards/rejected": -1.593520164489746,
      "step": 1843
    },
    {
      "epoch": 0.48,
      "grad_norm": 38.02727127075195,
      "kl": 0.0,
      "learning_rate": 2.587019104946349e-07,
      "logps/chosen": -184.4716796875,
      "logps/rejected": -265.66009521484375,
      "loss": 0.3522,
      "rewards/chosen": -1.331416130065918,
      "rewards/margins": 2.9817352294921875,
      "rewards/rejected": -4.3131513595581055,
      "step": 1844
    },
    {
      "epoch": 0.48,
      "grad_norm": 33.963863372802734,
      "kl": 0.0,
      "learning_rate": 2.585710546977231e-07,
      "logps/chosen": -233.77108764648438,
      "logps/rejected": -203.09811401367188,
      "loss": 0.2908,
      "rewards/chosen": 0.03152484446763992,
      "rewards/margins": 4.830499649047852,
      "rewards/rejected": -4.798974990844727,
      "step": 1845
    },
    {
      "epoch": 0.48,
      "grad_norm": 32.82160186767578,
      "kl": 0.0,
      "learning_rate": 2.584401989008113e-07,
      "logps/chosen": -275.6189270019531,
      "logps/rejected": -189.2215118408203,
      "loss": 0.1936,
      "rewards/chosen": 2.4405970573425293,
      "rewards/margins": 6.280233860015869,
      "rewards/rejected": -3.83963680267334,
      "step": 1846
    },
    {
      "epoch": 0.48,
      "grad_norm": 28.793840408325195,
      "kl": 0.0,
      "learning_rate": 2.5830934310389954e-07,
      "logps/chosen": -202.758056640625,
      "logps/rejected": -209.53219604492188,
      "loss": 0.2467,
      "rewards/chosen": 0.4155222177505493,
      "rewards/margins": 4.262606620788574,
      "rewards/rejected": -3.8470845222473145,
      "step": 1847
    },
    {
      "epoch": 0.48,
      "grad_norm": 31.604778289794922,
      "kl": 0.0,
      "learning_rate": 2.5817848730698773e-07,
      "logps/chosen": -119.1492919921875,
      "logps/rejected": -248.53244018554688,
      "loss": 0.2821,
      "rewards/chosen": 1.3229519128799438,
      "rewards/margins": 3.7250118255615234,
      "rewards/rejected": -2.402060031890869,
      "step": 1848
    },
    {
      "epoch": 0.48,
      "grad_norm": 36.85615539550781,
      "kl": 0.0,
      "learning_rate": 2.580476315100759e-07,
      "logps/chosen": -279.9113464355469,
      "logps/rejected": -217.8267059326172,
      "loss": 0.2948,
      "rewards/chosen": 1.4072452783584595,
      "rewards/margins": 3.7849130630493164,
      "rewards/rejected": -2.3776676654815674,
      "step": 1849
    },
    {
      "epoch": 0.48,
      "grad_norm": 26.794940948486328,
      "kl": 0.0,
      "learning_rate": 2.5791677571316407e-07,
      "logps/chosen": -222.50216674804688,
      "logps/rejected": -312.8692321777344,
      "loss": 0.3357,
      "rewards/chosen": -0.4899716377258301,
      "rewards/margins": 5.189242362976074,
      "rewards/rejected": -5.679214000701904,
      "step": 1850
    },
    {
      "epoch": 0.48,
      "grad_norm": 31.417387008666992,
      "kl": 0.0,
      "learning_rate": 2.5778591991625227e-07,
      "logps/chosen": -93.98246002197266,
      "logps/rejected": -269.1598205566406,
      "loss": 0.3274,
      "rewards/chosen": -0.26879116892814636,
      "rewards/margins": 2.833065986633301,
      "rewards/rejected": -3.1018571853637695,
      "step": 1851
    },
    {
      "epoch": 0.48,
      "grad_norm": 38.49124526977539,
      "kl": 0.0,
      "learning_rate": 2.5765506411934046e-07,
      "logps/chosen": -267.2807312011719,
      "logps/rejected": -243.086669921875,
      "loss": 0.2353,
      "rewards/chosen": 2.298083782196045,
      "rewards/margins": 5.859939098358154,
      "rewards/rejected": -3.5618553161621094,
      "step": 1852
    },
    {
      "epoch": 0.48,
      "grad_norm": 39.02790832519531,
      "kl": 0.0,
      "learning_rate": 2.5752420832242866e-07,
      "logps/chosen": -215.70635986328125,
      "logps/rejected": -261.94781494140625,
      "loss": 0.3084,
      "rewards/chosen": 0.24146094918251038,
      "rewards/margins": 2.3436214923858643,
      "rewards/rejected": -2.1021604537963867,
      "step": 1853
    },
    {
      "epoch": 0.49,
      "grad_norm": 35.632415771484375,
      "kl": 0.0,
      "learning_rate": 2.5739335252551686e-07,
      "logps/chosen": -245.4604034423828,
      "logps/rejected": -216.57431030273438,
      "loss": 0.252,
      "rewards/chosen": 0.37408319115638733,
      "rewards/margins": 3.8141672611236572,
      "rewards/rejected": -3.4400839805603027,
      "step": 1854
    },
    {
      "epoch": 0.49,
      "grad_norm": 38.72943115234375,
      "kl": 0.0,
      "learning_rate": 2.5726249672860505e-07,
      "logps/chosen": -253.3831329345703,
      "logps/rejected": -220.69468688964844,
      "loss": 0.1547,
      "rewards/chosen": 0.9969064593315125,
      "rewards/margins": 5.076277732849121,
      "rewards/rejected": -4.079371452331543,
      "step": 1855
    },
    {
      "epoch": 0.49,
      "grad_norm": 33.00349807739258,
      "kl": 0.0,
      "learning_rate": 2.5713164093169325e-07,
      "logps/chosen": -178.433349609375,
      "logps/rejected": -243.36041259765625,
      "loss": 0.348,
      "rewards/chosen": -0.0945533812046051,
      "rewards/margins": 2.8740203380584717,
      "rewards/rejected": -2.968573808670044,
      "step": 1856
    },
    {
      "epoch": 0.49,
      "grad_norm": 23.94053077697754,
      "kl": 0.0,
      "learning_rate": 2.5700078513478144e-07,
      "logps/chosen": -238.0264434814453,
      "logps/rejected": -362.50677490234375,
      "loss": 0.2621,
      "rewards/chosen": 0.5840424299240112,
      "rewards/margins": 4.776507377624512,
      "rewards/rejected": -4.192464828491211,
      "step": 1857
    },
    {
      "epoch": 0.49,
      "grad_norm": 29.512399673461914,
      "kl": 0.0,
      "learning_rate": 2.5686992933786964e-07,
      "logps/chosen": -187.4696044921875,
      "logps/rejected": -281.3097839355469,
      "loss": 0.185,
      "rewards/chosen": 1.7165203094482422,
      "rewards/margins": 5.96409273147583,
      "rewards/rejected": -4.247572422027588,
      "step": 1858
    },
    {
      "epoch": 0.49,
      "grad_norm": 34.900535583496094,
      "kl": 0.0,
      "learning_rate": 2.5673907354095784e-07,
      "logps/chosen": -243.0150146484375,
      "logps/rejected": -188.5065155029297,
      "loss": 0.2916,
      "rewards/chosen": -0.3352580666542053,
      "rewards/margins": 2.3534703254699707,
      "rewards/rejected": -2.6887283325195312,
      "step": 1859
    },
    {
      "epoch": 0.49,
      "grad_norm": 37.05907440185547,
      "kl": 0.0,
      "learning_rate": 2.566082177440461e-07,
      "logps/chosen": -173.6273956298828,
      "logps/rejected": -222.750244140625,
      "loss": 0.32,
      "rewards/chosen": 1.3401601314544678,
      "rewards/margins": 3.5735867023468018,
      "rewards/rejected": -2.233426570892334,
      "step": 1860
    },
    {
      "epoch": 0.49,
      "grad_norm": 38.520572662353516,
      "kl": 0.0,
      "learning_rate": 2.564773619471343e-07,
      "logps/chosen": -198.89744567871094,
      "logps/rejected": -206.3017578125,
      "loss": 0.2883,
      "rewards/chosen": 0.6857025623321533,
      "rewards/margins": 2.900019884109497,
      "rewards/rejected": -2.2143173217773438,
      "step": 1861
    },
    {
      "epoch": 0.49,
      "grad_norm": 24.21077537536621,
      "kl": 0.0,
      "learning_rate": 2.563465061502225e-07,
      "logps/chosen": -205.73321533203125,
      "logps/rejected": -201.75450134277344,
      "loss": 0.2302,
      "rewards/chosen": 1.6169886589050293,
      "rewards/margins": 5.4056196212768555,
      "rewards/rejected": -3.788630962371826,
      "step": 1862
    },
    {
      "epoch": 0.49,
      "grad_norm": 27.92758560180664,
      "kl": 0.0,
      "learning_rate": 2.562156503533107e-07,
      "logps/chosen": -169.76693725585938,
      "logps/rejected": -300.3410949707031,
      "loss": 0.2816,
      "rewards/chosen": 1.1886805295944214,
      "rewards/margins": 6.535387992858887,
      "rewards/rejected": -5.346707344055176,
      "step": 1863
    },
    {
      "epoch": 0.49,
      "grad_norm": 40.24781036376953,
      "kl": 0.0,
      "learning_rate": 2.5608479455639887e-07,
      "logps/chosen": -308.61517333984375,
      "logps/rejected": -213.69386291503906,
      "loss": 0.3359,
      "rewards/chosen": -1.7393323183059692,
      "rewards/margins": 0.8687006235122681,
      "rewards/rejected": -2.6080329418182373,
      "step": 1864
    },
    {
      "epoch": 0.49,
      "grad_norm": 37.39738082885742,
      "kl": 0.0,
      "learning_rate": 2.55953938759487e-07,
      "logps/chosen": -237.5323486328125,
      "logps/rejected": -267.2157287597656,
      "loss": 0.314,
      "rewards/chosen": 1.9053438901901245,
      "rewards/margins": 5.274196147918701,
      "rewards/rejected": -3.368852138519287,
      "step": 1865
    },
    {
      "epoch": 0.49,
      "grad_norm": 27.616085052490234,
      "kl": 0.0,
      "learning_rate": 2.558230829625752e-07,
      "logps/chosen": -195.3375244140625,
      "logps/rejected": -274.4885559082031,
      "loss": 0.2278,
      "rewards/chosen": 1.3364429473876953,
      "rewards/margins": 3.6697354316711426,
      "rewards/rejected": -2.3332924842834473,
      "step": 1866
    },
    {
      "epoch": 0.49,
      "grad_norm": 31.84263801574707,
      "kl": 0.0,
      "learning_rate": 2.556922271656634e-07,
      "logps/chosen": -255.9803924560547,
      "logps/rejected": -363.9133605957031,
      "loss": 0.2285,
      "rewards/chosen": 0.1768454760313034,
      "rewards/margins": 5.845008373260498,
      "rewards/rejected": -5.668162822723389,
      "step": 1867
    },
    {
      "epoch": 0.49,
      "grad_norm": 24.04751968383789,
      "kl": 0.0,
      "learning_rate": 2.555613713687516e-07,
      "logps/chosen": -240.5629425048828,
      "logps/rejected": -328.81195068359375,
      "loss": 0.1552,
      "rewards/chosen": 0.5476657748222351,
      "rewards/margins": 5.483957767486572,
      "rewards/rejected": -4.9362921714782715,
      "step": 1868
    },
    {
      "epoch": 0.49,
      "grad_norm": 48.056461334228516,
      "kl": 0.0,
      "learning_rate": 2.554305155718398e-07,
      "logps/chosen": -198.63595581054688,
      "logps/rejected": -255.7877655029297,
      "loss": 0.2434,
      "rewards/chosen": 1.6382763385772705,
      "rewards/margins": 4.527665138244629,
      "rewards/rejected": -2.8893887996673584,
      "step": 1869
    },
    {
      "epoch": 0.49,
      "grad_norm": 32.82905578613281,
      "kl": 0.0,
      "learning_rate": 2.55299659774928e-07,
      "logps/chosen": -203.86563110351562,
      "logps/rejected": -264.9864501953125,
      "loss": 0.2917,
      "rewards/chosen": -0.517257809638977,
      "rewards/margins": 2.667818069458008,
      "rewards/rejected": -3.1850759983062744,
      "step": 1870
    },
    {
      "epoch": 0.49,
      "grad_norm": 25.104469299316406,
      "kl": 0.0,
      "learning_rate": 2.551688039780162e-07,
      "logps/chosen": -129.41441345214844,
      "logps/rejected": -279.703857421875,
      "loss": 0.3841,
      "rewards/chosen": -0.38443830609321594,
      "rewards/margins": 3.2106776237487793,
      "rewards/rejected": -3.595115900039673,
      "step": 1871
    },
    {
      "epoch": 0.49,
      "grad_norm": 29.48250961303711,
      "kl": 0.0,
      "learning_rate": 2.550379481811044e-07,
      "logps/chosen": -251.05799865722656,
      "logps/rejected": -246.1929168701172,
      "loss": 0.1636,
      "rewards/chosen": 2.388549566268921,
      "rewards/margins": 5.352682590484619,
      "rewards/rejected": -2.9641330242156982,
      "step": 1872
    },
    {
      "epoch": 0.49,
      "grad_norm": 40.20368957519531,
      "kl": 0.0,
      "learning_rate": 2.5490709238419264e-07,
      "logps/chosen": -231.3787384033203,
      "logps/rejected": -276.19232177734375,
      "loss": 0.2379,
      "rewards/chosen": 1.4038299322128296,
      "rewards/margins": 5.262545585632324,
      "rewards/rejected": -3.858715534210205,
      "step": 1873
    },
    {
      "epoch": 0.49,
      "grad_norm": 24.63336181640625,
      "kl": 0.0,
      "learning_rate": 2.5477623658728083e-07,
      "logps/chosen": -157.3682403564453,
      "logps/rejected": -257.8160400390625,
      "loss": 0.1499,
      "rewards/chosen": 2.0958898067474365,
      "rewards/margins": 5.802646160125732,
      "rewards/rejected": -3.706756353378296,
      "step": 1874
    },
    {
      "epoch": 0.49,
      "grad_norm": 27.928619384765625,
      "kl": 0.0,
      "learning_rate": 2.5464538079036903e-07,
      "logps/chosen": -163.24595642089844,
      "logps/rejected": -290.6197204589844,
      "loss": 0.1987,
      "rewards/chosen": 0.922084391117096,
      "rewards/margins": 4.095943450927734,
      "rewards/rejected": -3.173859119415283,
      "step": 1875
    },
    {
      "epoch": 0.49,
      "grad_norm": 36.277679443359375,
      "kl": 0.0,
      "learning_rate": 2.545145249934572e-07,
      "logps/chosen": -198.1995849609375,
      "logps/rejected": -264.11358642578125,
      "loss": 0.2921,
      "rewards/chosen": -0.16467052698135376,
      "rewards/margins": 4.354956150054932,
      "rewards/rejected": -4.519626617431641,
      "step": 1876
    },
    {
      "epoch": 0.49,
      "grad_norm": 38.40446853637695,
      "kl": 0.0,
      "learning_rate": 2.543836691965454e-07,
      "logps/chosen": -187.0970458984375,
      "logps/rejected": -296.1387634277344,
      "loss": 0.257,
      "rewards/chosen": 0.20785601437091827,
      "rewards/margins": 4.458132743835449,
      "rewards/rejected": -4.250276565551758,
      "step": 1877
    },
    {
      "epoch": 0.49,
      "grad_norm": 39.18281936645508,
      "kl": 0.0,
      "learning_rate": 2.542528133996336e-07,
      "logps/chosen": -186.9453125,
      "logps/rejected": -246.5155487060547,
      "loss": 0.3307,
      "rewards/chosen": -0.34305471181869507,
      "rewards/margins": 1.5182170867919922,
      "rewards/rejected": -1.8612717390060425,
      "step": 1878
    },
    {
      "epoch": 0.49,
      "grad_norm": 27.042150497436523,
      "kl": 0.0,
      "learning_rate": 2.541219576027218e-07,
      "logps/chosen": -189.15548706054688,
      "logps/rejected": -219.12112426757812,
      "loss": 0.1848,
      "rewards/chosen": 1.8135871887207031,
      "rewards/margins": 5.438680171966553,
      "rewards/rejected": -3.6250929832458496,
      "step": 1879
    },
    {
      "epoch": 0.49,
      "grad_norm": 42.695098876953125,
      "kl": 0.0,
      "learning_rate": 2.5399110180580995e-07,
      "logps/chosen": -186.45420837402344,
      "logps/rejected": -312.56304931640625,
      "loss": 0.2228,
      "rewards/chosen": 0.7028865814208984,
      "rewards/margins": 4.877556324005127,
      "rewards/rejected": -4.1746697425842285,
      "step": 1880
    },
    {
      "epoch": 0.49,
      "grad_norm": 38.940799713134766,
      "kl": 0.0,
      "learning_rate": 2.5386024600889815e-07,
      "logps/chosen": -347.6214599609375,
      "logps/rejected": -166.71929931640625,
      "loss": 0.2926,
      "rewards/chosen": -2.7828896045684814,
      "rewards/margins": -0.96140456199646,
      "rewards/rejected": -1.8214850425720215,
      "step": 1881
    },
    {
      "epoch": 0.49,
      "grad_norm": 30.760910034179688,
      "kl": 0.0,
      "learning_rate": 2.5372939021198635e-07,
      "logps/chosen": -205.28262329101562,
      "logps/rejected": -237.68161010742188,
      "loss": 0.3346,
      "rewards/chosen": -0.400518000125885,
      "rewards/margins": 4.106273174285889,
      "rewards/rejected": -4.506791114807129,
      "step": 1882
    },
    {
      "epoch": 0.49,
      "grad_norm": 39.819332122802734,
      "kl": 0.0,
      "learning_rate": 2.5359853441507454e-07,
      "logps/chosen": -278.8540344238281,
      "logps/rejected": -275.9216003417969,
      "loss": 0.2436,
      "rewards/chosen": 1.4045870304107666,
      "rewards/margins": 2.893503189086914,
      "rewards/rejected": -1.488916039466858,
      "step": 1883
    },
    {
      "epoch": 0.49,
      "grad_norm": 31.362224578857422,
      "kl": 0.0,
      "learning_rate": 2.5346767861816274e-07,
      "logps/chosen": -208.73666381835938,
      "logps/rejected": -221.90631103515625,
      "loss": 0.2985,
      "rewards/chosen": 1.8967316150665283,
      "rewards/margins": 6.240492820739746,
      "rewards/rejected": -4.343760967254639,
      "step": 1884
    },
    {
      "epoch": 0.49,
      "grad_norm": 31.99814224243164,
      "kl": 0.0,
      "learning_rate": 2.5333682282125094e-07,
      "logps/chosen": -212.91397094726562,
      "logps/rejected": -280.9819030761719,
      "loss": 0.3056,
      "rewards/chosen": -0.8252590894699097,
      "rewards/margins": 3.0601096153259277,
      "rewards/rejected": -3.885368824005127,
      "step": 1885
    },
    {
      "epoch": 0.49,
      "grad_norm": 36.41081237792969,
      "kl": 0.0,
      "learning_rate": 2.532059670243392e-07,
      "logps/chosen": -218.00038146972656,
      "logps/rejected": -217.99234008789062,
      "loss": 0.2384,
      "rewards/chosen": 0.8008521199226379,
      "rewards/margins": 4.729746341705322,
      "rewards/rejected": -3.92889404296875,
      "step": 1886
    },
    {
      "epoch": 0.49,
      "grad_norm": 37.86720657348633,
      "kl": 0.0,
      "learning_rate": 2.530751112274274e-07,
      "logps/chosen": -269.68780517578125,
      "logps/rejected": -251.1253204345703,
      "loss": 0.2796,
      "rewards/chosen": 0.7860704660415649,
      "rewards/margins": 3.951198101043701,
      "rewards/rejected": -3.165127754211426,
      "step": 1887
    },
    {
      "epoch": 0.49,
      "grad_norm": 31.15866470336914,
      "kl": 0.0,
      "learning_rate": 2.529442554305156e-07,
      "logps/chosen": -224.89462280273438,
      "logps/rejected": -279.7936706542969,
      "loss": 0.2781,
      "rewards/chosen": 0.3493259847164154,
      "rewards/margins": 5.08610200881958,
      "rewards/rejected": -4.736775875091553,
      "step": 1888
    },
    {
      "epoch": 0.49,
      "grad_norm": 34.115726470947266,
      "kl": 0.0,
      "learning_rate": 2.5281339963360377e-07,
      "logps/chosen": -234.70863342285156,
      "logps/rejected": -174.17152404785156,
      "loss": 0.2653,
      "rewards/chosen": 1.474958062171936,
      "rewards/margins": 3.463298797607422,
      "rewards/rejected": -1.9883406162261963,
      "step": 1889
    },
    {
      "epoch": 0.49,
      "grad_norm": 36.890350341796875,
      "kl": 0.0,
      "learning_rate": 2.5268254383669197e-07,
      "logps/chosen": -204.939208984375,
      "logps/rejected": -262.5244140625,
      "loss": 0.2881,
      "rewards/chosen": 0.6666855812072754,
      "rewards/margins": 4.215951919555664,
      "rewards/rejected": -3.5492665767669678,
      "step": 1890
    },
    {
      "epoch": 0.49,
      "grad_norm": 33.597843170166016,
      "kl": 0.0,
      "learning_rate": 2.5255168803978016e-07,
      "logps/chosen": -148.05075073242188,
      "logps/rejected": -116.547607421875,
      "loss": 0.2184,
      "rewards/chosen": 0.7623181939125061,
      "rewards/margins": 2.7595696449279785,
      "rewards/rejected": -1.9972515106201172,
      "step": 1891
    },
    {
      "epoch": 0.5,
      "grad_norm": 35.052284240722656,
      "kl": 0.0,
      "learning_rate": 2.5242083224286836e-07,
      "logps/chosen": -166.26486206054688,
      "logps/rejected": -197.2524871826172,
      "loss": 0.296,
      "rewards/chosen": 0.15562376379966736,
      "rewards/margins": 2.3366987705230713,
      "rewards/rejected": -2.181075096130371,
      "step": 1892
    },
    {
      "epoch": 0.5,
      "grad_norm": 32.913978576660156,
      "kl": 0.0,
      "learning_rate": 2.5228997644595656e-07,
      "logps/chosen": -143.8211669921875,
      "logps/rejected": -236.24700927734375,
      "loss": 0.3891,
      "rewards/chosen": 0.37257999181747437,
      "rewards/margins": 2.2927846908569336,
      "rewards/rejected": -1.9202046394348145,
      "step": 1893
    },
    {
      "epoch": 0.5,
      "grad_norm": 30.473838806152344,
      "kl": 0.0,
      "learning_rate": 2.5215912064904475e-07,
      "logps/chosen": -192.12429809570312,
      "logps/rejected": -227.31834411621094,
      "loss": 0.2552,
      "rewards/chosen": 2.3903250694274902,
      "rewards/margins": 6.382128715515137,
      "rewards/rejected": -3.9918034076690674,
      "step": 1894
    },
    {
      "epoch": 0.5,
      "grad_norm": 39.04948043823242,
      "kl": 0.0,
      "learning_rate": 2.5202826485213295e-07,
      "logps/chosen": -233.09219360351562,
      "logps/rejected": -185.82948303222656,
      "loss": 0.3997,
      "rewards/chosen": -0.4558795392513275,
      "rewards/margins": 2.3500428199768066,
      "rewards/rejected": -2.805922269821167,
      "step": 1895
    },
    {
      "epoch": 0.5,
      "grad_norm": 34.34318161010742,
      "kl": 0.0,
      "learning_rate": 2.518974090552211e-07,
      "logps/chosen": -171.54519653320312,
      "logps/rejected": -185.059814453125,
      "loss": 0.2155,
      "rewards/chosen": 2.059077739715576,
      "rewards/margins": 5.530086994171143,
      "rewards/rejected": -3.4710092544555664,
      "step": 1896
    },
    {
      "epoch": 0.5,
      "grad_norm": 38.1912956237793,
      "kl": 0.0,
      "learning_rate": 2.517665532583093e-07,
      "logps/chosen": -270.6682434082031,
      "logps/rejected": -306.77264404296875,
      "loss": 0.3807,
      "rewards/chosen": 1.2388911247253418,
      "rewards/margins": 3.8831653594970703,
      "rewards/rejected": -2.6442742347717285,
      "step": 1897
    },
    {
      "epoch": 0.5,
      "grad_norm": 30.48811912536621,
      "kl": 0.0,
      "learning_rate": 2.5163569746139754e-07,
      "logps/chosen": -209.73785400390625,
      "logps/rejected": -205.18353271484375,
      "loss": 0.3538,
      "rewards/chosen": 1.776361107826233,
      "rewards/margins": 3.798926830291748,
      "rewards/rejected": -2.0225658416748047,
      "step": 1898
    },
    {
      "epoch": 0.5,
      "grad_norm": 46.357425689697266,
      "kl": 0.0,
      "learning_rate": 2.5150484166448573e-07,
      "logps/chosen": -147.869140625,
      "logps/rejected": -235.92543029785156,
      "loss": 0.1959,
      "rewards/chosen": 1.779581069946289,
      "rewards/margins": 4.554662704467773,
      "rewards/rejected": -2.7750816345214844,
      "step": 1899
    },
    {
      "epoch": 0.5,
      "grad_norm": 40.692535400390625,
      "kl": 0.0,
      "learning_rate": 2.5137398586757393e-07,
      "logps/chosen": -180.42958068847656,
      "logps/rejected": -214.12777709960938,
      "loss": 0.38,
      "rewards/chosen": -0.06748411059379578,
      "rewards/margins": 1.6282284259796143,
      "rewards/rejected": -1.6957125663757324,
      "step": 1900
    },
    {
      "epoch": 0.5,
      "grad_norm": 37.360252380371094,
      "kl": 0.0,
      "learning_rate": 2.512431300706621e-07,
      "logps/chosen": -148.78396606445312,
      "logps/rejected": -240.42269897460938,
      "loss": 0.2743,
      "rewards/chosen": 0.6462602615356445,
      "rewards/margins": 3.7638111114501953,
      "rewards/rejected": -3.117550849914551,
      "step": 1901
    },
    {
      "epoch": 0.5,
      "grad_norm": 31.662979125976562,
      "kl": 0.0,
      "learning_rate": 2.511122742737503e-07,
      "logps/chosen": -178.69883728027344,
      "logps/rejected": -310.7706298828125,
      "loss": 0.2792,
      "rewards/chosen": 0.750849723815918,
      "rewards/margins": 4.345252990722656,
      "rewards/rejected": -3.594403028488159,
      "step": 1902
    },
    {
      "epoch": 0.5,
      "grad_norm": 34.83212661743164,
      "kl": 0.0,
      "learning_rate": 2.509814184768385e-07,
      "logps/chosen": -272.0216979980469,
      "logps/rejected": -305.064208984375,
      "loss": 0.231,
      "rewards/chosen": 0.39868250489234924,
      "rewards/margins": 4.167855739593506,
      "rewards/rejected": -3.7691731452941895,
      "step": 1903
    },
    {
      "epoch": 0.5,
      "grad_norm": 47.436729431152344,
      "kl": 0.0,
      "learning_rate": 2.508505626799267e-07,
      "logps/chosen": -206.93185424804688,
      "logps/rejected": -327.25689697265625,
      "loss": 0.3349,
      "rewards/chosen": 0.07472336292266846,
      "rewards/margins": 3.3958563804626465,
      "rewards/rejected": -3.3211331367492676,
      "step": 1904
    },
    {
      "epoch": 0.5,
      "grad_norm": 27.917442321777344,
      "kl": 0.0,
      "learning_rate": 2.507197068830149e-07,
      "logps/chosen": -222.82229614257812,
      "logps/rejected": -137.33285522460938,
      "loss": 0.2192,
      "rewards/chosen": -0.539939284324646,
      "rewards/margins": 2.519014835357666,
      "rewards/rejected": -3.0589540004730225,
      "step": 1905
    },
    {
      "epoch": 0.5,
      "grad_norm": 33.432159423828125,
      "kl": 0.0,
      "learning_rate": 2.505888510861031e-07,
      "logps/chosen": -246.05108642578125,
      "logps/rejected": -261.0732727050781,
      "loss": 0.3226,
      "rewards/chosen": 0.5959483981132507,
      "rewards/margins": 2.7738301753997803,
      "rewards/rejected": -2.1778817176818848,
      "step": 1906
    },
    {
      "epoch": 0.5,
      "grad_norm": 36.27235412597656,
      "kl": 0.0,
      "learning_rate": 2.504579952891913e-07,
      "logps/chosen": -227.25546264648438,
      "logps/rejected": -291.0820007324219,
      "loss": 0.2343,
      "rewards/chosen": 0.1316702663898468,
      "rewards/margins": 5.5357184410095215,
      "rewards/rejected": -5.404047966003418,
      "step": 1907
    },
    {
      "epoch": 0.5,
      "grad_norm": 45.1259765625,
      "kl": 0.0,
      "learning_rate": 2.503271394922795e-07,
      "logps/chosen": -191.4156951904297,
      "logps/rejected": -201.90771484375,
      "loss": 0.3261,
      "rewards/chosen": 1.4199970960617065,
      "rewards/margins": 3.0559487342834473,
      "rewards/rejected": -1.6359517574310303,
      "step": 1908
    },
    {
      "epoch": 0.5,
      "grad_norm": 43.99619674682617,
      "kl": 0.0,
      "learning_rate": 2.501962836953677e-07,
      "logps/chosen": -210.96713256835938,
      "logps/rejected": -218.38539123535156,
      "loss": 0.2506,
      "rewards/chosen": 0.7000874876976013,
      "rewards/margins": 4.751967906951904,
      "rewards/rejected": -4.051880359649658,
      "step": 1909
    },
    {
      "epoch": 0.5,
      "grad_norm": 34.98705291748047,
      "kl": 0.0,
      "learning_rate": 2.500654278984559e-07,
      "logps/chosen": -138.10946655273438,
      "logps/rejected": -166.25003051757812,
      "loss": 0.4363,
      "rewards/chosen": -0.312425434589386,
      "rewards/margins": 1.5524520874023438,
      "rewards/rejected": -1.864877462387085,
      "step": 1910
    },
    {
      "epoch": 0.5,
      "grad_norm": 33.36783981323242,
      "kl": 0.0,
      "learning_rate": 2.499345721015441e-07,
      "logps/chosen": -225.1962890625,
      "logps/rejected": -365.34783935546875,
      "loss": 0.2549,
      "rewards/chosen": -0.11971984803676605,
      "rewards/margins": 4.3747735023498535,
      "rewards/rejected": -4.49449348449707,
      "step": 1911
    },
    {
      "epoch": 0.5,
      "grad_norm": 33.521934509277344,
      "kl": 0.0,
      "learning_rate": 2.498037163046323e-07,
      "logps/chosen": -195.3630828857422,
      "logps/rejected": -289.0223693847656,
      "loss": 0.3226,
      "rewards/chosen": -0.13952356576919556,
      "rewards/margins": 2.2491676807403564,
      "rewards/rejected": -2.3886911869049072,
      "step": 1912
    },
    {
      "epoch": 0.5,
      "grad_norm": 31.397092819213867,
      "kl": 0.0,
      "learning_rate": 2.496728605077205e-07,
      "logps/chosen": -268.779541015625,
      "logps/rejected": -265.0915222167969,
      "loss": 0.2835,
      "rewards/chosen": -1.829577088356018,
      "rewards/margins": 3.003638744354248,
      "rewards/rejected": -4.833215713500977,
      "step": 1913
    },
    {
      "epoch": 0.5,
      "grad_norm": 37.29655075073242,
      "kl": 0.0,
      "learning_rate": 2.495420047108087e-07,
      "logps/chosen": -223.605712890625,
      "logps/rejected": -172.59487915039062,
      "loss": 0.2707,
      "rewards/chosen": -0.2785741686820984,
      "rewards/margins": 3.410541534423828,
      "rewards/rejected": -3.6891157627105713,
      "step": 1914
    },
    {
      "epoch": 0.5,
      "grad_norm": 28.963512420654297,
      "kl": 0.0,
      "learning_rate": 2.4941114891389687e-07,
      "logps/chosen": -220.93190002441406,
      "logps/rejected": -276.0335388183594,
      "loss": 0.2289,
      "rewards/chosen": 0.43622493743896484,
      "rewards/margins": 5.225882053375244,
      "rewards/rejected": -4.789657115936279,
      "step": 1915
    },
    {
      "epoch": 0.5,
      "grad_norm": 32.86867141723633,
      "kl": 0.0,
      "learning_rate": 2.4928029311698507e-07,
      "logps/chosen": -170.21336364746094,
      "logps/rejected": -254.4930419921875,
      "loss": 0.3359,
      "rewards/chosen": -0.02794739231467247,
      "rewards/margins": 3.419856548309326,
      "rewards/rejected": -3.4478039741516113,
      "step": 1916
    },
    {
      "epoch": 0.5,
      "grad_norm": 29.831283569335938,
      "kl": 0.0,
      "learning_rate": 2.4914943732007326e-07,
      "logps/chosen": -217.07911682128906,
      "logps/rejected": -244.5806427001953,
      "loss": 0.3747,
      "rewards/chosen": 0.47169598937034607,
      "rewards/margins": 3.60986065864563,
      "rewards/rejected": -3.138164758682251,
      "step": 1917
    },
    {
      "epoch": 0.5,
      "grad_norm": 29.670040130615234,
      "kl": 0.0,
      "learning_rate": 2.4901858152316146e-07,
      "logps/chosen": -168.487548828125,
      "logps/rejected": -217.94676208496094,
      "loss": 0.2863,
      "rewards/chosen": -0.3014002740383148,
      "rewards/margins": 3.647155284881592,
      "rewards/rejected": -3.9485554695129395,
      "step": 1918
    },
    {
      "epoch": 0.5,
      "grad_norm": 27.14519691467285,
      "kl": 0.0,
      "learning_rate": 2.4888772572624966e-07,
      "logps/chosen": -136.63169860839844,
      "logps/rejected": -230.33084106445312,
      "loss": 0.2316,
      "rewards/chosen": 0.6357839107513428,
      "rewards/margins": 6.009737968444824,
      "rewards/rejected": -5.373953819274902,
      "step": 1919
    },
    {
      "epoch": 0.5,
      "grad_norm": 50.06220245361328,
      "kl": 0.0,
      "learning_rate": 2.4875686992933785e-07,
      "logps/chosen": -264.7572326660156,
      "logps/rejected": -206.96310424804688,
      "loss": 0.3289,
      "rewards/chosen": -1.4153270721435547,
      "rewards/margins": 1.3842098712921143,
      "rewards/rejected": -2.799536943435669,
      "step": 1920
    },
    {
      "epoch": 0.5,
      "grad_norm": 37.91456604003906,
      "kl": 0.0,
      "learning_rate": 2.4862601413242605e-07,
      "logps/chosen": -232.99554443359375,
      "logps/rejected": -212.722412109375,
      "loss": 0.2997,
      "rewards/chosen": 1.4048177003860474,
      "rewards/margins": 2.8434221744537354,
      "rewards/rejected": -1.438604474067688,
      "step": 1921
    },
    {
      "epoch": 0.5,
      "grad_norm": 26.27916145324707,
      "kl": 0.0,
      "learning_rate": 2.4849515833551424e-07,
      "logps/chosen": -194.2124786376953,
      "logps/rejected": -242.9920654296875,
      "loss": 0.2118,
      "rewards/chosen": 0.6471109390258789,
      "rewards/margins": 4.211012840270996,
      "rewards/rejected": -3.563901662826538,
      "step": 1922
    },
    {
      "epoch": 0.5,
      "grad_norm": 33.186946868896484,
      "kl": 0.0,
      "learning_rate": 2.4836430253860244e-07,
      "logps/chosen": -223.27500915527344,
      "logps/rejected": -232.27818298339844,
      "loss": 0.3531,
      "rewards/chosen": 0.3251649737358093,
      "rewards/margins": 3.1628851890563965,
      "rewards/rejected": -2.8377201557159424,
      "step": 1923
    },
    {
      "epoch": 0.5,
      "grad_norm": 31.578989028930664,
      "kl": 0.0,
      "learning_rate": 2.4823344674169064e-07,
      "logps/chosen": -218.21646118164062,
      "logps/rejected": -155.0374298095703,
      "loss": 0.2636,
      "rewards/chosen": 0.9165531992912292,
      "rewards/margins": 3.4226291179656982,
      "rewards/rejected": -2.506075859069824,
      "step": 1924
    },
    {
      "epoch": 0.5,
      "grad_norm": 32.22088623046875,
      "kl": 0.0,
      "learning_rate": 2.4810259094477883e-07,
      "logps/chosen": -186.41725158691406,
      "logps/rejected": -283.95074462890625,
      "loss": 0.2782,
      "rewards/chosen": 0.27410784363746643,
      "rewards/margins": 3.7751660346984863,
      "rewards/rejected": -3.5010581016540527,
      "step": 1925
    },
    {
      "epoch": 0.5,
      "grad_norm": 31.215028762817383,
      "kl": 0.0,
      "learning_rate": 2.4797173514786703e-07,
      "logps/chosen": -230.7908172607422,
      "logps/rejected": -231.13796997070312,
      "loss": 0.2081,
      "rewards/chosen": 0.9891664385795593,
      "rewards/margins": 3.502727746963501,
      "rewards/rejected": -2.513561248779297,
      "step": 1926
    },
    {
      "epoch": 0.5,
      "grad_norm": 30.78005027770996,
      "kl": 0.0,
      "learning_rate": 2.478408793509552e-07,
      "logps/chosen": -268.7662048339844,
      "logps/rejected": -245.4067840576172,
      "loss": 0.2993,
      "rewards/chosen": 1.0699352025985718,
      "rewards/margins": 5.307055473327637,
      "rewards/rejected": -4.237120151519775,
      "step": 1927
    },
    {
      "epoch": 0.5,
      "grad_norm": 36.56181335449219,
      "kl": 0.0,
      "learning_rate": 2.477100235540434e-07,
      "logps/chosen": -168.37936401367188,
      "logps/rejected": -200.1322021484375,
      "loss": 0.3212,
      "rewards/chosen": 1.214407205581665,
      "rewards/margins": 3.4486401081085205,
      "rewards/rejected": -2.2342329025268555,
      "step": 1928
    },
    {
      "epoch": 0.5,
      "grad_norm": 32.369869232177734,
      "kl": 0.0,
      "learning_rate": 2.475791677571316e-07,
      "logps/chosen": -172.96864318847656,
      "logps/rejected": -308.66448974609375,
      "loss": 0.2533,
      "rewards/chosen": 1.525896430015564,
      "rewards/margins": 5.966020107269287,
      "rewards/rejected": -4.440123558044434,
      "step": 1929
    },
    {
      "epoch": 0.51,
      "grad_norm": 38.35368728637695,
      "kl": 0.0,
      "learning_rate": 2.474483119602198e-07,
      "logps/chosen": -219.0493927001953,
      "logps/rejected": -232.3117218017578,
      "loss": 0.2026,
      "rewards/chosen": 2.0987181663513184,
      "rewards/margins": 6.051818370819092,
      "rewards/rejected": -3.9531002044677734,
      "step": 1930
    },
    {
      "epoch": 0.51,
      "grad_norm": 30.660757064819336,
      "kl": 0.0,
      "learning_rate": 2.47317456163308e-07,
      "logps/chosen": -157.80386352539062,
      "logps/rejected": -218.20924377441406,
      "loss": 0.3078,
      "rewards/chosen": 1.5564416646957397,
      "rewards/margins": 4.841548442840576,
      "rewards/rejected": -3.285106897354126,
      "step": 1931
    },
    {
      "epoch": 0.51,
      "grad_norm": 25.03354835510254,
      "kl": 0.0,
      "learning_rate": 2.471866003663962e-07,
      "logps/chosen": -246.04446411132812,
      "logps/rejected": -182.98593139648438,
      "loss": 0.3044,
      "rewards/chosen": 0.08799563348293304,
      "rewards/margins": 3.25834321975708,
      "rewards/rejected": -3.1703476905822754,
      "step": 1932
    },
    {
      "epoch": 0.51,
      "grad_norm": 38.93879699707031,
      "kl": 0.0,
      "learning_rate": 2.470557445694844e-07,
      "logps/chosen": -197.60214233398438,
      "logps/rejected": -254.27537536621094,
      "loss": 0.3131,
      "rewards/chosen": 0.9349378943443298,
      "rewards/margins": 4.313072204589844,
      "rewards/rejected": -3.378134250640869,
      "step": 1933
    },
    {
      "epoch": 0.51,
      "grad_norm": 36.97758483886719,
      "kl": 0.0,
      "learning_rate": 2.469248887725726e-07,
      "logps/chosen": -185.9903564453125,
      "logps/rejected": -231.436767578125,
      "loss": 0.3016,
      "rewards/chosen": 1.0533219575881958,
      "rewards/margins": 3.887955665588379,
      "rewards/rejected": -2.8346335887908936,
      "step": 1934
    },
    {
      "epoch": 0.51,
      "grad_norm": 28.87652015686035,
      "kl": 0.0,
      "learning_rate": 2.467940329756608e-07,
      "logps/chosen": -187.3426055908203,
      "logps/rejected": -204.7564239501953,
      "loss": 0.3289,
      "rewards/chosen": 0.27323493361473083,
      "rewards/margins": 3.4943530559539795,
      "rewards/rejected": -3.221118211746216,
      "step": 1935
    },
    {
      "epoch": 0.51,
      "grad_norm": 26.652814865112305,
      "kl": 0.0,
      "learning_rate": 2.4666317717874904e-07,
      "logps/chosen": -243.4410858154297,
      "logps/rejected": -184.99859619140625,
      "loss": 0.2302,
      "rewards/chosen": 1.6804203987121582,
      "rewards/margins": 4.532587051391602,
      "rewards/rejected": -2.8521664142608643,
      "step": 1936
    },
    {
      "epoch": 0.51,
      "grad_norm": 41.8165168762207,
      "kl": 0.0,
      "learning_rate": 2.4653232138183724e-07,
      "logps/chosen": -239.2473907470703,
      "logps/rejected": -203.16265869140625,
      "loss": 0.2688,
      "rewards/chosen": 0.7100369930267334,
      "rewards/margins": 3.5892412662506104,
      "rewards/rejected": -2.879204273223877,
      "step": 1937
    },
    {
      "epoch": 0.51,
      "grad_norm": 40.11600875854492,
      "kl": 0.0,
      "learning_rate": 2.464014655849254e-07,
      "logps/chosen": -173.17868041992188,
      "logps/rejected": -304.7061462402344,
      "loss": 0.2404,
      "rewards/chosen": 0.33373430371284485,
      "rewards/margins": 5.660027503967285,
      "rewards/rejected": -5.326292991638184,
      "step": 1938
    },
    {
      "epoch": 0.51,
      "grad_norm": 38.29595184326172,
      "kl": 0.0,
      "learning_rate": 2.462706097880136e-07,
      "logps/chosen": -195.17929077148438,
      "logps/rejected": -195.31170654296875,
      "loss": 0.3822,
      "rewards/chosen": 0.5516433119773865,
      "rewards/margins": 3.5008597373962402,
      "rewards/rejected": -2.949216365814209,
      "step": 1939
    },
    {
      "epoch": 0.51,
      "grad_norm": 28.157041549682617,
      "kl": 0.0,
      "learning_rate": 2.4613975399110177e-07,
      "logps/chosen": -164.0011749267578,
      "logps/rejected": -276.69927978515625,
      "loss": 0.2932,
      "rewards/chosen": 0.31394287943840027,
      "rewards/margins": 3.9898202419281006,
      "rewards/rejected": -3.675877332687378,
      "step": 1940
    },
    {
      "epoch": 0.51,
      "grad_norm": 26.90674591064453,
      "kl": 0.0,
      "learning_rate": 2.4600889819418997e-07,
      "logps/chosen": -157.42709350585938,
      "logps/rejected": -193.3916015625,
      "loss": 0.2054,
      "rewards/chosen": 0.030436119064688683,
      "rewards/margins": 3.6355412006378174,
      "rewards/rejected": -3.60510516166687,
      "step": 1941
    },
    {
      "epoch": 0.51,
      "grad_norm": 37.283145904541016,
      "kl": 0.0,
      "learning_rate": 2.4587804239727817e-07,
      "logps/chosen": -256.7583923339844,
      "logps/rejected": -203.76365661621094,
      "loss": 0.305,
      "rewards/chosen": 1.2997283935546875,
      "rewards/margins": 2.928647994995117,
      "rewards/rejected": -1.6289196014404297,
      "step": 1942
    },
    {
      "epoch": 0.51,
      "grad_norm": 36.678279876708984,
      "kl": 0.0,
      "learning_rate": 2.457471866003664e-07,
      "logps/chosen": -220.28211975097656,
      "logps/rejected": -180.16119384765625,
      "loss": 0.3362,
      "rewards/chosen": 0.16220007836818695,
      "rewards/margins": 2.24269700050354,
      "rewards/rejected": -2.0804970264434814,
      "step": 1943
    },
    {
      "epoch": 0.51,
      "grad_norm": 28.942739486694336,
      "kl": 0.0,
      "learning_rate": 2.456163308034546e-07,
      "logps/chosen": -156.3596649169922,
      "logps/rejected": -227.49583435058594,
      "loss": 0.2208,
      "rewards/chosen": 0.7010388374328613,
      "rewards/margins": 3.153454303741455,
      "rewards/rejected": -2.4524154663085938,
      "step": 1944
    },
    {
      "epoch": 0.51,
      "grad_norm": 26.555570602416992,
      "kl": 0.0,
      "learning_rate": 2.454854750065428e-07,
      "logps/chosen": -144.39047241210938,
      "logps/rejected": -280.34881591796875,
      "loss": 0.2719,
      "rewards/chosen": 1.1340817213058472,
      "rewards/margins": 4.722748279571533,
      "rewards/rejected": -3.5886664390563965,
      "step": 1945
    },
    {
      "epoch": 0.51,
      "grad_norm": 39.58638000488281,
      "kl": 0.0,
      "learning_rate": 2.4535461920963095e-07,
      "logps/chosen": -204.01065063476562,
      "logps/rejected": -162.374267578125,
      "loss": 0.2145,
      "rewards/chosen": -0.44569912552833557,
      "rewards/margins": 3.055454969406128,
      "rewards/rejected": -3.5011541843414307,
      "step": 1946
    },
    {
      "epoch": 0.51,
      "grad_norm": 43.68654251098633,
      "kl": 0.0,
      "learning_rate": 2.4522376341271915e-07,
      "logps/chosen": -295.45184326171875,
      "logps/rejected": -255.74960327148438,
      "loss": 0.3089,
      "rewards/chosen": 0.3848971128463745,
      "rewards/margins": 3.178168773651123,
      "rewards/rejected": -2.793271541595459,
      "step": 1947
    },
    {
      "epoch": 0.51,
      "grad_norm": 32.19191360473633,
      "kl": 0.0,
      "learning_rate": 2.4509290761580734e-07,
      "logps/chosen": -210.06488037109375,
      "logps/rejected": -217.54795837402344,
      "loss": 0.3451,
      "rewards/chosen": 0.453418493270874,
      "rewards/margins": 4.383719444274902,
      "rewards/rejected": -3.9303009510040283,
      "step": 1948
    },
    {
      "epoch": 0.51,
      "grad_norm": 35.40753173828125,
      "kl": 0.0,
      "learning_rate": 2.449620518188956e-07,
      "logps/chosen": -277.6213684082031,
      "logps/rejected": -188.61752319335938,
      "loss": 0.2245,
      "rewards/chosen": 2.0205652713775635,
      "rewards/margins": 4.447739601135254,
      "rewards/rejected": -2.4271743297576904,
      "step": 1949
    },
    {
      "epoch": 0.51,
      "grad_norm": 34.40073776245117,
      "kl": 0.0,
      "learning_rate": 2.448311960219838e-07,
      "logps/chosen": -233.07374572753906,
      "logps/rejected": -211.34178161621094,
      "loss": 0.2372,
      "rewards/chosen": 0.8760273456573486,
      "rewards/margins": 3.128060817718506,
      "rewards/rejected": -2.2520334720611572,
      "step": 1950
    },
    {
      "epoch": 0.51,
      "grad_norm": 31.40346908569336,
      "kl": 0.0,
      "learning_rate": 2.44700340225072e-07,
      "logps/chosen": -213.6735076904297,
      "logps/rejected": -291.5716552734375,
      "loss": 0.2644,
      "rewards/chosen": 0.4644780457019806,
      "rewards/margins": 4.4314141273498535,
      "rewards/rejected": -3.9669361114501953,
      "step": 1951
    },
    {
      "epoch": 0.51,
      "grad_norm": 29.35367774963379,
      "kl": 0.0,
      "learning_rate": 2.445694844281602e-07,
      "logps/chosen": -187.635986328125,
      "logps/rejected": -354.99658203125,
      "loss": 0.2588,
      "rewards/chosen": 1.0316107273101807,
      "rewards/margins": 6.271823883056641,
      "rewards/rejected": -5.240213394165039,
      "step": 1952
    },
    {
      "epoch": 0.51,
      "grad_norm": 34.39826583862305,
      "kl": 0.0,
      "learning_rate": 2.444386286312484e-07,
      "logps/chosen": -279.107666015625,
      "logps/rejected": -217.9149169921875,
      "loss": 0.2955,
      "rewards/chosen": -0.4340619146823883,
      "rewards/margins": 1.6407526731491089,
      "rewards/rejected": -2.074814558029175,
      "step": 1953
    },
    {
      "epoch": 0.51,
      "grad_norm": 36.55764389038086,
      "kl": 0.0,
      "learning_rate": 2.443077728343365e-07,
      "logps/chosen": -185.63668823242188,
      "logps/rejected": -254.9978790283203,
      "loss": 0.2592,
      "rewards/chosen": 0.41983142495155334,
      "rewards/margins": 5.603705883026123,
      "rewards/rejected": -5.183874607086182,
      "step": 1954
    },
    {
      "epoch": 0.51,
      "grad_norm": 39.05849075317383,
      "kl": 0.0,
      "learning_rate": 2.441769170374247e-07,
      "logps/chosen": -125.34358215332031,
      "logps/rejected": -198.91729736328125,
      "loss": 0.2841,
      "rewards/chosen": 1.3333288431167603,
      "rewards/margins": 3.365635871887207,
      "rewards/rejected": -2.0323069095611572,
      "step": 1955
    },
    {
      "epoch": 0.51,
      "grad_norm": 39.851844787597656,
      "kl": 0.0,
      "learning_rate": 2.4404606124051296e-07,
      "logps/chosen": -264.37908935546875,
      "logps/rejected": -268.1690673828125,
      "loss": 0.325,
      "rewards/chosen": 1.1693298816680908,
      "rewards/margins": 3.3857524394989014,
      "rewards/rejected": -2.2164225578308105,
      "step": 1956
    },
    {
      "epoch": 0.51,
      "grad_norm": 31.71510887145996,
      "kl": 0.0,
      "learning_rate": 2.4391520544360116e-07,
      "logps/chosen": -243.44525146484375,
      "logps/rejected": -193.47544860839844,
      "loss": 0.2813,
      "rewards/chosen": -1.3031715154647827,
      "rewards/margins": 1.8220585584640503,
      "rewards/rejected": -3.125230073928833,
      "step": 1957
    },
    {
      "epoch": 0.51,
      "grad_norm": 33.640567779541016,
      "kl": 0.0,
      "learning_rate": 2.4378434964668936e-07,
      "logps/chosen": -217.40499877929688,
      "logps/rejected": -174.684326171875,
      "loss": 0.283,
      "rewards/chosen": 0.24121162295341492,
      "rewards/margins": 2.657731294631958,
      "rewards/rejected": -2.4165196418762207,
      "step": 1958
    },
    {
      "epoch": 0.51,
      "grad_norm": 35.96133041381836,
      "kl": 0.0,
      "learning_rate": 2.4365349384977755e-07,
      "logps/chosen": -214.48333740234375,
      "logps/rejected": -197.01153564453125,
      "loss": 0.1731,
      "rewards/chosen": 1.1828800439834595,
      "rewards/margins": 3.951263904571533,
      "rewards/rejected": -2.7683839797973633,
      "step": 1959
    },
    {
      "epoch": 0.51,
      "grad_norm": 33.87704849243164,
      "kl": 0.0,
      "learning_rate": 2.4352263805286575e-07,
      "logps/chosen": -209.52987670898438,
      "logps/rejected": -191.13827514648438,
      "loss": 0.3245,
      "rewards/chosen": 0.3229031562805176,
      "rewards/margins": 2.702648162841797,
      "rewards/rejected": -2.3797450065612793,
      "step": 1960
    },
    {
      "epoch": 0.51,
      "grad_norm": 32.43837356567383,
      "kl": 0.0,
      "learning_rate": 2.433917822559539e-07,
      "logps/chosen": -142.2443389892578,
      "logps/rejected": -217.48886108398438,
      "loss": 0.3319,
      "rewards/chosen": -0.2942158579826355,
      "rewards/margins": 3.3216006755828857,
      "rewards/rejected": -3.615816593170166,
      "step": 1961
    },
    {
      "epoch": 0.51,
      "grad_norm": 48.2795524597168,
      "kl": 0.0,
      "learning_rate": 2.4326092645904214e-07,
      "logps/chosen": -225.362548828125,
      "logps/rejected": -381.4612731933594,
      "loss": 0.2663,
      "rewards/chosen": 0.8754479885101318,
      "rewards/margins": 6.36458683013916,
      "rewards/rejected": -5.489138603210449,
      "step": 1962
    },
    {
      "epoch": 0.51,
      "grad_norm": 53.284034729003906,
      "kl": 0.0,
      "learning_rate": 2.4313007066213034e-07,
      "logps/chosen": -186.80157470703125,
      "logps/rejected": -194.40985107421875,
      "loss": 0.3662,
      "rewards/chosen": 0.3198693096637726,
      "rewards/margins": 2.4725139141082764,
      "rewards/rejected": -2.152644634246826,
      "step": 1963
    },
    {
      "epoch": 0.51,
      "grad_norm": 26.204788208007812,
      "kl": 0.0,
      "learning_rate": 2.4299921486521853e-07,
      "logps/chosen": -235.0901641845703,
      "logps/rejected": -214.7635955810547,
      "loss": 0.1917,
      "rewards/chosen": 0.1780397891998291,
      "rewards/margins": 4.566169738769531,
      "rewards/rejected": -4.388129711151123,
      "step": 1964
    },
    {
      "epoch": 0.51,
      "grad_norm": 34.75926208496094,
      "kl": 0.0,
      "learning_rate": 2.4286835906830673e-07,
      "logps/chosen": -254.1787567138672,
      "logps/rejected": -289.5296325683594,
      "loss": 0.1977,
      "rewards/chosen": -0.25179824233055115,
      "rewards/margins": 3.519669771194458,
      "rewards/rejected": -3.771467924118042,
      "step": 1965
    },
    {
      "epoch": 0.51,
      "grad_norm": 26.714811325073242,
      "kl": 0.0,
      "learning_rate": 2.427375032713949e-07,
      "logps/chosen": -163.204833984375,
      "logps/rejected": -167.9423828125,
      "loss": 0.2762,
      "rewards/chosen": 0.9019087553024292,
      "rewards/margins": 4.088323593139648,
      "rewards/rejected": -3.186414957046509,
      "step": 1966
    },
    {
      "epoch": 0.51,
      "grad_norm": 29.82887840270996,
      "kl": 0.0,
      "learning_rate": 2.426066474744831e-07,
      "logps/chosen": -211.63839721679688,
      "logps/rejected": -247.35690307617188,
      "loss": 0.2539,
      "rewards/chosen": 1.2622199058532715,
      "rewards/margins": 5.667209625244141,
      "rewards/rejected": -4.404989719390869,
      "step": 1967
    },
    {
      "epoch": 0.52,
      "grad_norm": 32.24669647216797,
      "kl": 0.0,
      "learning_rate": 2.424757916775713e-07,
      "logps/chosen": -313.1291198730469,
      "logps/rejected": -282.4967956542969,
      "loss": 0.1948,
      "rewards/chosen": 0.557210385799408,
      "rewards/margins": 5.114065647125244,
      "rewards/rejected": -4.556855201721191,
      "step": 1968
    },
    {
      "epoch": 0.52,
      "grad_norm": 33.87299728393555,
      "kl": 0.0,
      "learning_rate": 2.423449358806595e-07,
      "logps/chosen": -227.81365966796875,
      "logps/rejected": -222.52450561523438,
      "loss": 0.3365,
      "rewards/chosen": -0.6963803768157959,
      "rewards/margins": 2.368621587753296,
      "rewards/rejected": -3.065001964569092,
      "step": 1969
    },
    {
      "epoch": 0.52,
      "grad_norm": 43.42044448852539,
      "kl": 0.0,
      "learning_rate": 2.422140800837477e-07,
      "logps/chosen": -228.2924041748047,
      "logps/rejected": -372.4189453125,
      "loss": 0.4293,
      "rewards/chosen": -0.8277515172958374,
      "rewards/margins": 1.1175405979156494,
      "rewards/rejected": -1.9452921152114868,
      "step": 1970
    },
    {
      "epoch": 0.52,
      "grad_norm": 33.60139846801758,
      "kl": 0.0,
      "learning_rate": 2.420832242868359e-07,
      "logps/chosen": -259.1628723144531,
      "logps/rejected": -381.32757568359375,
      "loss": 0.2342,
      "rewards/chosen": 1.377594232559204,
      "rewards/margins": 5.173920631408691,
      "rewards/rejected": -3.7963266372680664,
      "step": 1971
    },
    {
      "epoch": 0.52,
      "grad_norm": 33.57109069824219,
      "kl": 0.0,
      "learning_rate": 2.419523684899241e-07,
      "logps/chosen": -205.43441772460938,
      "logps/rejected": -240.46177673339844,
      "loss": 0.2658,
      "rewards/chosen": -0.9131128787994385,
      "rewards/margins": 2.3757729530334473,
      "rewards/rejected": -3.2888858318328857,
      "step": 1972
    },
    {
      "epoch": 0.52,
      "grad_norm": 42.60993576049805,
      "kl": 0.0,
      "learning_rate": 2.418215126930123e-07,
      "logps/chosen": -217.70196533203125,
      "logps/rejected": -162.82742309570312,
      "loss": 0.3996,
      "rewards/chosen": 0.12623104453086853,
      "rewards/margins": 0.7846701145172119,
      "rewards/rejected": -0.658439040184021,
      "step": 1973
    },
    {
      "epoch": 0.52,
      "grad_norm": 34.66673278808594,
      "kl": 0.0,
      "learning_rate": 2.416906568961005e-07,
      "logps/chosen": -217.13880920410156,
      "logps/rejected": -273.5459899902344,
      "loss": 0.3047,
      "rewards/chosen": 0.6674838066101074,
      "rewards/margins": 3.589686632156372,
      "rewards/rejected": -2.9222028255462646,
      "step": 1974
    },
    {
      "epoch": 0.52,
      "grad_norm": 32.35926055908203,
      "kl": 0.0,
      "learning_rate": 2.415598010991887e-07,
      "logps/chosen": -344.4316711425781,
      "logps/rejected": -287.30889892578125,
      "loss": 0.1567,
      "rewards/chosen": 2.5555834770202637,
      "rewards/margins": 7.409801959991455,
      "rewards/rejected": -4.854218482971191,
      "step": 1975
    },
    {
      "epoch": 0.52,
      "grad_norm": 29.469270706176758,
      "kl": 0.0,
      "learning_rate": 2.414289453022769e-07,
      "logps/chosen": -267.01593017578125,
      "logps/rejected": -216.3834686279297,
      "loss": 0.2763,
      "rewards/chosen": -0.6539653539657593,
      "rewards/margins": 0.95839524269104,
      "rewards/rejected": -1.6123605966567993,
      "step": 1976
    },
    {
      "epoch": 0.52,
      "grad_norm": 44.44554901123047,
      "kl": 0.0,
      "learning_rate": 2.412980895053651e-07,
      "logps/chosen": -265.7334899902344,
      "logps/rejected": -310.4556579589844,
      "loss": 0.2784,
      "rewards/chosen": -0.21513530611991882,
      "rewards/margins": 4.380249977111816,
      "rewards/rejected": -4.5953850746154785,
      "step": 1977
    },
    {
      "epoch": 0.52,
      "grad_norm": 25.45243263244629,
      "kl": 0.0,
      "learning_rate": 2.411672337084533e-07,
      "logps/chosen": -185.72653198242188,
      "logps/rejected": -275.7221374511719,
      "loss": 0.2038,
      "rewards/chosen": 1.7371035814285278,
      "rewards/margins": 5.086659908294678,
      "rewards/rejected": -3.3495562076568604,
      "step": 1978
    },
    {
      "epoch": 0.52,
      "grad_norm": 25.313249588012695,
      "kl": 0.0,
      "learning_rate": 2.410363779115415e-07,
      "logps/chosen": -195.99685668945312,
      "logps/rejected": -190.29600524902344,
      "loss": 0.1704,
      "rewards/chosen": 2.642261266708374,
      "rewards/margins": 6.334344387054443,
      "rewards/rejected": -3.6920831203460693,
      "step": 1979
    },
    {
      "epoch": 0.52,
      "grad_norm": 32.64925765991211,
      "kl": 0.0,
      "learning_rate": 2.4090552211462967e-07,
      "logps/chosen": -182.5782470703125,
      "logps/rejected": -299.02032470703125,
      "loss": 0.3535,
      "rewards/chosen": -0.3439632058143616,
      "rewards/margins": 3.923374652862549,
      "rewards/rejected": -4.267337799072266,
      "step": 1980
    },
    {
      "epoch": 0.52,
      "grad_norm": 26.917070388793945,
      "kl": 0.0,
      "learning_rate": 2.4077466631771787e-07,
      "logps/chosen": -154.9143829345703,
      "logps/rejected": -252.35838317871094,
      "loss": 0.354,
      "rewards/chosen": -0.6761507987976074,
      "rewards/margins": 5.603362083435059,
      "rewards/rejected": -6.279512882232666,
      "step": 1981
    },
    {
      "epoch": 0.52,
      "grad_norm": 39.749664306640625,
      "kl": 0.0,
      "learning_rate": 2.4064381052080606e-07,
      "logps/chosen": -228.64013671875,
      "logps/rejected": -169.25645446777344,
      "loss": 0.3207,
      "rewards/chosen": 0.6242986917495728,
      "rewards/margins": 2.633760929107666,
      "rewards/rejected": -2.009462356567383,
      "step": 1982
    },
    {
      "epoch": 0.52,
      "grad_norm": 40.571128845214844,
      "kl": 0.0,
      "learning_rate": 2.4051295472389426e-07,
      "logps/chosen": -202.51095581054688,
      "logps/rejected": -189.1254119873047,
      "loss": 0.366,
      "rewards/chosen": 0.6944351196289062,
      "rewards/margins": 2.6559300422668457,
      "rewards/rejected": -1.96149480342865,
      "step": 1983
    },
    {
      "epoch": 0.52,
      "grad_norm": 27.367752075195312,
      "kl": 0.0,
      "learning_rate": 2.4038209892698245e-07,
      "logps/chosen": -159.35598754882812,
      "logps/rejected": -217.9905548095703,
      "loss": 0.209,
      "rewards/chosen": 1.7716405391693115,
      "rewards/margins": 5.866267204284668,
      "rewards/rejected": -4.094626426696777,
      "step": 1984
    },
    {
      "epoch": 0.52,
      "grad_norm": 36.464874267578125,
      "kl": 0.0,
      "learning_rate": 2.4025124313007065e-07,
      "logps/chosen": -159.65773010253906,
      "logps/rejected": -226.83433532714844,
      "loss": 0.2122,
      "rewards/chosen": 1.215417742729187,
      "rewards/margins": 3.594834804534912,
      "rewards/rejected": -2.3794169425964355,
      "step": 1985
    },
    {
      "epoch": 0.52,
      "grad_norm": 36.60029220581055,
      "kl": 0.0,
      "learning_rate": 2.4012038733315885e-07,
      "logps/chosen": -210.5471649169922,
      "logps/rejected": -252.05821228027344,
      "loss": 0.3524,
      "rewards/chosen": -0.19484469294548035,
      "rewards/margins": 1.9677438735961914,
      "rewards/rejected": -2.162588596343994,
      "step": 1986
    },
    {
      "epoch": 0.52,
      "grad_norm": 39.36339569091797,
      "kl": 0.0,
      "learning_rate": 2.3998953153624704e-07,
      "logps/chosen": -206.82473754882812,
      "logps/rejected": -286.58221435546875,
      "loss": 0.3572,
      "rewards/chosen": 1.0052763223648071,
      "rewards/margins": 3.1620450019836426,
      "rewards/rejected": -2.156768798828125,
      "step": 1987
    },
    {
      "epoch": 0.52,
      "grad_norm": 40.26015090942383,
      "kl": 0.0,
      "learning_rate": 2.3985867573933524e-07,
      "logps/chosen": -249.87423706054688,
      "logps/rejected": -213.3260498046875,
      "loss": 0.2833,
      "rewards/chosen": -0.04886872321367264,
      "rewards/margins": 2.056130886077881,
      "rewards/rejected": -2.104999542236328,
      "step": 1988
    },
    {
      "epoch": 0.52,
      "grad_norm": 35.690330505371094,
      "kl": 0.0,
      "learning_rate": 2.3972781994242343e-07,
      "logps/chosen": -246.7486572265625,
      "logps/rejected": -248.2810821533203,
      "loss": 0.3034,
      "rewards/chosen": -0.03401753306388855,
      "rewards/margins": 1.7448437213897705,
      "rewards/rejected": -1.7788612842559814,
      "step": 1989
    },
    {
      "epoch": 0.52,
      "grad_norm": 29.125974655151367,
      "kl": 0.0,
      "learning_rate": 2.3959696414551163e-07,
      "logps/chosen": -183.20132446289062,
      "logps/rejected": -304.69183349609375,
      "loss": 0.1834,
      "rewards/chosen": 2.362597942352295,
      "rewards/margins": 5.717899322509766,
      "rewards/rejected": -3.3553013801574707,
      "step": 1990
    },
    {
      "epoch": 0.52,
      "grad_norm": 47.09006881713867,
      "kl": 0.0,
      "learning_rate": 2.3946610834859983e-07,
      "logps/chosen": -255.5098876953125,
      "logps/rejected": -304.6872863769531,
      "loss": 0.2865,
      "rewards/chosen": 0.13334283232688904,
      "rewards/margins": 4.940859794616699,
      "rewards/rejected": -4.807517051696777,
      "step": 1991
    },
    {
      "epoch": 0.52,
      "grad_norm": 30.52964210510254,
      "kl": 0.0,
      "learning_rate": 2.39335252551688e-07,
      "logps/chosen": -208.18450927734375,
      "logps/rejected": -293.5606384277344,
      "loss": 0.1683,
      "rewards/chosen": 1.081275224685669,
      "rewards/margins": 6.437618255615234,
      "rewards/rejected": -5.356342792510986,
      "step": 1992
    },
    {
      "epoch": 0.52,
      "grad_norm": 35.834815979003906,
      "kl": 0.0,
      "learning_rate": 2.392043967547762e-07,
      "logps/chosen": -233.533203125,
      "logps/rejected": -275.6778259277344,
      "loss": 0.2766,
      "rewards/chosen": 1.9862544536590576,
      "rewards/margins": 7.9831438064575195,
      "rewards/rejected": -5.996889591217041,
      "step": 1993
    },
    {
      "epoch": 0.52,
      "grad_norm": 41.54240798950195,
      "kl": 0.0,
      "learning_rate": 2.390735409578644e-07,
      "logps/chosen": -175.21755981445312,
      "logps/rejected": -177.35218811035156,
      "loss": 0.272,
      "rewards/chosen": 1.9858801364898682,
      "rewards/margins": 5.943943977355957,
      "rewards/rejected": -3.958063840866089,
      "step": 1994
    },
    {
      "epoch": 0.52,
      "grad_norm": 33.839927673339844,
      "kl": 0.0,
      "learning_rate": 2.389426851609526e-07,
      "logps/chosen": -187.34593200683594,
      "logps/rejected": -250.72268676757812,
      "loss": 0.3413,
      "rewards/chosen": 0.8257482051849365,
      "rewards/margins": 4.029740333557129,
      "rewards/rejected": -3.2039921283721924,
      "step": 1995
    },
    {
      "epoch": 0.52,
      "grad_norm": 34.77016830444336,
      "kl": 0.0,
      "learning_rate": 2.388118293640408e-07,
      "logps/chosen": -247.8369903564453,
      "logps/rejected": -240.98202514648438,
      "loss": 0.2067,
      "rewards/chosen": 2.7072157859802246,
      "rewards/margins": 6.077670097351074,
      "rewards/rejected": -3.3704545497894287,
      "step": 1996
    },
    {
      "epoch": 0.52,
      "grad_norm": 35.83280944824219,
      "kl": 0.0,
      "learning_rate": 2.38680973567129e-07,
      "logps/chosen": -189.8119354248047,
      "logps/rejected": -286.9617004394531,
      "loss": 0.2035,
      "rewards/chosen": 0.5847533345222473,
      "rewards/margins": 3.042118549346924,
      "rewards/rejected": -2.4573652744293213,
      "step": 1997
    },
    {
      "epoch": 0.52,
      "grad_norm": 30.720943450927734,
      "kl": 0.0,
      "learning_rate": 2.385501177702172e-07,
      "logps/chosen": -263.184814453125,
      "logps/rejected": -183.3404998779297,
      "loss": 0.3176,
      "rewards/chosen": -0.8872728943824768,
      "rewards/margins": 4.405872344970703,
      "rewards/rejected": -5.293145179748535,
      "step": 1998
    },
    {
      "epoch": 0.52,
      "grad_norm": 36.902984619140625,
      "kl": 0.0,
      "learning_rate": 2.384192619733054e-07,
      "logps/chosen": -342.3307800292969,
      "logps/rejected": -220.6918487548828,
      "loss": 0.2243,
      "rewards/chosen": 2.224860668182373,
      "rewards/margins": 7.383242130279541,
      "rewards/rejected": -5.158381462097168,
      "step": 1999
    },
    {
      "epoch": 0.52,
      "grad_norm": 31.698623657226562,
      "kl": 0.0,
      "learning_rate": 2.382884061763936e-07,
      "logps/chosen": -281.65863037109375,
      "logps/rejected": -207.4318084716797,
      "loss": 0.3269,
      "rewards/chosen": 2.86019229888916,
      "rewards/margins": 4.098602294921875,
      "rewards/rejected": -1.2384099960327148,
      "step": 2000
    },
    {
      "epoch": 0.52,
      "grad_norm": 35.31499099731445,
      "kl": 0.0,
      "learning_rate": 2.381575503794818e-07,
      "logps/chosen": -230.57676696777344,
      "logps/rejected": -317.56005859375,
      "loss": 0.25,
      "rewards/chosen": 1.5604783296585083,
      "rewards/margins": 4.952313423156738,
      "rewards/rejected": -3.3918352127075195,
      "step": 2001
    },
    {
      "epoch": 0.52,
      "grad_norm": 31.48984146118164,
      "kl": 0.0,
      "learning_rate": 2.3802669458256998e-07,
      "logps/chosen": -212.90480041503906,
      "logps/rejected": -433.2575988769531,
      "loss": 0.2678,
      "rewards/chosen": 0.2492951601743698,
      "rewards/margins": 3.0489494800567627,
      "rewards/rejected": -2.799654245376587,
      "step": 2002
    },
    {
      "epoch": 0.52,
      "grad_norm": 41.93503952026367,
      "kl": 0.0,
      "learning_rate": 2.3789583878565818e-07,
      "logps/chosen": -235.5082244873047,
      "logps/rejected": -195.15249633789062,
      "loss": 0.3205,
      "rewards/chosen": 0.1757785975933075,
      "rewards/margins": 1.9791057109832764,
      "rewards/rejected": -1.8033270835876465,
      "step": 2003
    },
    {
      "epoch": 0.52,
      "grad_norm": 24.260042190551758,
      "kl": 0.0,
      "learning_rate": 2.377649829887464e-07,
      "logps/chosen": -269.15264892578125,
      "logps/rejected": -240.45811462402344,
      "loss": 0.2631,
      "rewards/chosen": 0.8864780068397522,
      "rewards/margins": 4.7506513595581055,
      "rewards/rejected": -3.864173173904419,
      "step": 2004
    },
    {
      "epoch": 0.52,
      "grad_norm": 24.483179092407227,
      "kl": 0.0,
      "learning_rate": 2.376341271918346e-07,
      "logps/chosen": -153.36785888671875,
      "logps/rejected": -258.3946838378906,
      "loss": 0.2444,
      "rewards/chosen": -0.7696568369865417,
      "rewards/margins": 3.203350067138672,
      "rewards/rejected": -3.9730069637298584,
      "step": 2005
    },
    {
      "epoch": 0.52,
      "grad_norm": 39.20796203613281,
      "kl": 0.0,
      "learning_rate": 2.375032713949228e-07,
      "logps/chosen": -132.20347595214844,
      "logps/rejected": -303.88922119140625,
      "loss": 0.2446,
      "rewards/chosen": 0.35151904821395874,
      "rewards/margins": 3.453958511352539,
      "rewards/rejected": -3.1024394035339355,
      "step": 2006
    },
    {
      "epoch": 0.53,
      "grad_norm": 31.88499641418457,
      "kl": 0.0,
      "learning_rate": 2.37372415598011e-07,
      "logps/chosen": -227.24574279785156,
      "logps/rejected": -222.66433715820312,
      "loss": 0.297,
      "rewards/chosen": -1.9475051164627075,
      "rewards/margins": 1.7535721063613892,
      "rewards/rejected": -3.7010772228240967,
      "step": 2007
    },
    {
      "epoch": 0.53,
      "grad_norm": 36.38870620727539,
      "kl": 0.0,
      "learning_rate": 2.3724155980109916e-07,
      "logps/chosen": -213.55064392089844,
      "logps/rejected": -368.173095703125,
      "loss": 0.2498,
      "rewards/chosen": 0.9353142380714417,
      "rewards/margins": 6.506036758422852,
      "rewards/rejected": -5.570722579956055,
      "step": 2008
    },
    {
      "epoch": 0.53,
      "grad_norm": 34.4116325378418,
      "kl": 0.0,
      "learning_rate": 2.3711070400418736e-07,
      "logps/chosen": -186.77804565429688,
      "logps/rejected": -120.74478912353516,
      "loss": 0.3031,
      "rewards/chosen": 0.9410187005996704,
      "rewards/margins": 3.602015972137451,
      "rewards/rejected": -2.660997152328491,
      "step": 2009
    },
    {
      "epoch": 0.53,
      "grad_norm": 34.9754753112793,
      "kl": 0.0,
      "learning_rate": 2.3697984820727558e-07,
      "logps/chosen": -235.28622436523438,
      "logps/rejected": -229.31048583984375,
      "loss": 0.275,
      "rewards/chosen": 0.7856313586235046,
      "rewards/margins": 3.691113233566284,
      "rewards/rejected": -2.9054818153381348,
      "step": 2010
    },
    {
      "epoch": 0.53,
      "grad_norm": 32.74319076538086,
      "kl": 0.0,
      "learning_rate": 2.3684899241036378e-07,
      "logps/chosen": -200.742431640625,
      "logps/rejected": -311.3709411621094,
      "loss": 0.2691,
      "rewards/chosen": 0.4585956931114197,
      "rewards/margins": 3.353062152862549,
      "rewards/rejected": -2.8944664001464844,
      "step": 2011
    },
    {
      "epoch": 0.53,
      "grad_norm": 38.48353958129883,
      "kl": 0.0,
      "learning_rate": 2.3671813661345197e-07,
      "logps/chosen": -240.978759765625,
      "logps/rejected": -241.54025268554688,
      "loss": 0.2153,
      "rewards/chosen": 1.544427514076233,
      "rewards/margins": 5.172610759735107,
      "rewards/rejected": -3.628183126449585,
      "step": 2012
    },
    {
      "epoch": 0.53,
      "grad_norm": 29.140609741210938,
      "kl": 0.0,
      "learning_rate": 2.3658728081654017e-07,
      "logps/chosen": -243.23280334472656,
      "logps/rejected": -175.568115234375,
      "loss": 0.3854,
      "rewards/chosen": -0.7309738397598267,
      "rewards/margins": 1.0130811929702759,
      "rewards/rejected": -1.7440550327301025,
      "step": 2013
    },
    {
      "epoch": 0.53,
      "grad_norm": 36.87703323364258,
      "kl": 0.0,
      "learning_rate": 2.3645642501962836e-07,
      "logps/chosen": -300.458251953125,
      "logps/rejected": -286.5090026855469,
      "loss": 0.3353,
      "rewards/chosen": 1.6609442234039307,
      "rewards/margins": 4.945582389831543,
      "rewards/rejected": -3.2846384048461914,
      "step": 2014
    },
    {
      "epoch": 0.53,
      "grad_norm": 46.88383865356445,
      "kl": 0.0,
      "learning_rate": 2.3632556922271653e-07,
      "logps/chosen": -124.17665100097656,
      "logps/rejected": -381.4186096191406,
      "loss": 0.4301,
      "rewards/chosen": -0.28269121050834656,
      "rewards/margins": 2.143458127975464,
      "rewards/rejected": -2.426149368286133,
      "step": 2015
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.590614318847656,
      "kl": 0.0,
      "learning_rate": 2.3619471342580473e-07,
      "logps/chosen": -165.1231689453125,
      "logps/rejected": -216.70668029785156,
      "loss": 0.363,
      "rewards/chosen": -0.5194066166877747,
      "rewards/margins": 3.826328992843628,
      "rewards/rejected": -4.345735549926758,
      "step": 2016
    },
    {
      "epoch": 0.53,
      "grad_norm": 28.982065200805664,
      "kl": 0.0,
      "learning_rate": 2.3606385762889295e-07,
      "logps/chosen": -106.90189361572266,
      "logps/rejected": -241.30169677734375,
      "loss": 0.2629,
      "rewards/chosen": 1.0548369884490967,
      "rewards/margins": 2.681565999984741,
      "rewards/rejected": -1.6267290115356445,
      "step": 2017
    },
    {
      "epoch": 0.53,
      "grad_norm": 40.08494186401367,
      "kl": 0.0,
      "learning_rate": 2.3593300183198115e-07,
      "logps/chosen": -279.9969482421875,
      "logps/rejected": -170.47064208984375,
      "loss": 0.2288,
      "rewards/chosen": 2.9944510459899902,
      "rewards/margins": 5.17792272567749,
      "rewards/rejected": -2.1834716796875,
      "step": 2018
    },
    {
      "epoch": 0.53,
      "grad_norm": 59.07280349731445,
      "kl": 0.0,
      "learning_rate": 2.3580214603506934e-07,
      "logps/chosen": -207.0133514404297,
      "logps/rejected": -319.8271484375,
      "loss": 0.3363,
      "rewards/chosen": 0.8179835081100464,
      "rewards/margins": 3.3634209632873535,
      "rewards/rejected": -2.5454375743865967,
      "step": 2019
    },
    {
      "epoch": 0.53,
      "grad_norm": 34.716835021972656,
      "kl": 0.0,
      "learning_rate": 2.3567129023815754e-07,
      "logps/chosen": -285.7257080078125,
      "logps/rejected": -236.57012939453125,
      "loss": 0.3969,
      "rewards/chosen": -0.056455135345458984,
      "rewards/margins": 3.922239065170288,
      "rewards/rejected": -3.978694200515747,
      "step": 2020
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.732534408569336,
      "kl": 0.0,
      "learning_rate": 2.3554043444124574e-07,
      "logps/chosen": -191.0402069091797,
      "logps/rejected": -263.2533264160156,
      "loss": 0.2471,
      "rewards/chosen": 0.4087807834148407,
      "rewards/margins": 4.085519790649414,
      "rewards/rejected": -3.67673921585083,
      "step": 2021
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.43977165222168,
      "kl": 0.0,
      "learning_rate": 2.3540957864433393e-07,
      "logps/chosen": -137.8092498779297,
      "logps/rejected": -281.7326965332031,
      "loss": 0.1874,
      "rewards/chosen": 0.5577578544616699,
      "rewards/margins": 4.101840496063232,
      "rewards/rejected": -3.5440826416015625,
      "step": 2022
    },
    {
      "epoch": 0.53,
      "grad_norm": 34.099342346191406,
      "kl": 0.0,
      "learning_rate": 2.3527872284742213e-07,
      "logps/chosen": -203.07217407226562,
      "logps/rejected": -312.94195556640625,
      "loss": 0.381,
      "rewards/chosen": 0.886745810508728,
      "rewards/margins": 2.430570602416992,
      "rewards/rejected": -1.5438249111175537,
      "step": 2023
    },
    {
      "epoch": 0.53,
      "grad_norm": 35.84495162963867,
      "kl": 0.0,
      "learning_rate": 2.3514786705051032e-07,
      "logps/chosen": -228.089111328125,
      "logps/rejected": -355.3266296386719,
      "loss": 0.1612,
      "rewards/chosen": 0.7639932036399841,
      "rewards/margins": 4.759679317474365,
      "rewards/rejected": -3.9956860542297363,
      "step": 2024
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.479677200317383,
      "kl": 0.0,
      "learning_rate": 2.3501701125359852e-07,
      "logps/chosen": -221.3619842529297,
      "logps/rejected": -189.91445922851562,
      "loss": 0.1739,
      "rewards/chosen": 1.7698726654052734,
      "rewards/margins": 4.553598403930664,
      "rewards/rejected": -2.7837259769439697,
      "step": 2025
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.979618072509766,
      "kl": 0.0,
      "learning_rate": 2.3488615545668672e-07,
      "logps/chosen": -182.97657775878906,
      "logps/rejected": -242.29254150390625,
      "loss": 0.2574,
      "rewards/chosen": 2.3239691257476807,
      "rewards/margins": 6.539175033569336,
      "rewards/rejected": -4.215206146240234,
      "step": 2026
    },
    {
      "epoch": 0.53,
      "grad_norm": 34.30022430419922,
      "kl": 0.0,
      "learning_rate": 2.347552996597749e-07,
      "logps/chosen": -222.5321807861328,
      "logps/rejected": -224.00125122070312,
      "loss": 0.2416,
      "rewards/chosen": 0.36926016211509705,
      "rewards/margins": 4.0446457862854,
      "rewards/rejected": -3.6753857135772705,
      "step": 2027
    },
    {
      "epoch": 0.53,
      "grad_norm": 31.23023223876953,
      "kl": 0.0,
      "learning_rate": 2.346244438628631e-07,
      "logps/chosen": -190.40428161621094,
      "logps/rejected": -179.3951873779297,
      "loss": 0.2639,
      "rewards/chosen": 2.5482754707336426,
      "rewards/margins": 5.174324989318848,
      "rewards/rejected": -2.626049757003784,
      "step": 2028
    },
    {
      "epoch": 0.53,
      "grad_norm": 40.01976013183594,
      "kl": 0.0,
      "learning_rate": 2.3449358806595133e-07,
      "logps/chosen": -181.07247924804688,
      "logps/rejected": -252.67135620117188,
      "loss": 0.2582,
      "rewards/chosen": 1.9590002298355103,
      "rewards/margins": 4.202696323394775,
      "rewards/rejected": -2.2436962127685547,
      "step": 2029
    },
    {
      "epoch": 0.53,
      "grad_norm": 32.337642669677734,
      "kl": 0.0,
      "learning_rate": 2.3436273226903953e-07,
      "logps/chosen": -259.14239501953125,
      "logps/rejected": -253.57275390625,
      "loss": 0.3372,
      "rewards/chosen": 0.023569345474243164,
      "rewards/margins": 3.4797098636627197,
      "rewards/rejected": -3.4561405181884766,
      "step": 2030
    },
    {
      "epoch": 0.53,
      "grad_norm": 36.94684982299805,
      "kl": 0.0,
      "learning_rate": 2.342318764721277e-07,
      "logps/chosen": -193.77459716796875,
      "logps/rejected": -276.48101806640625,
      "loss": 0.3417,
      "rewards/chosen": 1.8443284034729004,
      "rewards/margins": 5.706932067871094,
      "rewards/rejected": -3.8626036643981934,
      "step": 2031
    },
    {
      "epoch": 0.53,
      "grad_norm": 26.014928817749023,
      "kl": 0.0,
      "learning_rate": 2.341010206752159e-07,
      "logps/chosen": -182.404296875,
      "logps/rejected": -355.6706848144531,
      "loss": 0.2146,
      "rewards/chosen": 1.1796051263809204,
      "rewards/margins": 5.5185322761535645,
      "rewards/rejected": -4.338927268981934,
      "step": 2032
    },
    {
      "epoch": 0.53,
      "grad_norm": 31.432388305664062,
      "kl": 0.0,
      "learning_rate": 2.339701648783041e-07,
      "logps/chosen": -187.7768096923828,
      "logps/rejected": -205.853759765625,
      "loss": 0.21,
      "rewards/chosen": 1.4805999994277954,
      "rewards/margins": 4.251741409301758,
      "rewards/rejected": -2.771141529083252,
      "step": 2033
    },
    {
      "epoch": 0.53,
      "grad_norm": 38.272300720214844,
      "kl": 0.0,
      "learning_rate": 2.3383930908139229e-07,
      "logps/chosen": -227.22837829589844,
      "logps/rejected": -221.27516174316406,
      "loss": 0.3342,
      "rewards/chosen": 0.6269685626029968,
      "rewards/margins": 2.780792474746704,
      "rewards/rejected": -2.1538238525390625,
      "step": 2034
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.997676849365234,
      "kl": 0.0,
      "learning_rate": 2.3370845328448048e-07,
      "logps/chosen": -223.91384887695312,
      "logps/rejected": -177.48019409179688,
      "loss": 0.2491,
      "rewards/chosen": -0.05373985320329666,
      "rewards/margins": 2.6699798107147217,
      "rewards/rejected": -2.723719596862793,
      "step": 2035
    },
    {
      "epoch": 0.53,
      "grad_norm": 37.9314079284668,
      "kl": 0.0,
      "learning_rate": 2.335775974875687e-07,
      "logps/chosen": -276.69659423828125,
      "logps/rejected": -320.3892517089844,
      "loss": 0.3087,
      "rewards/chosen": 1.2256563901901245,
      "rewards/margins": 4.2694549560546875,
      "rewards/rejected": -3.0437984466552734,
      "step": 2036
    },
    {
      "epoch": 0.53,
      "grad_norm": 30.487159729003906,
      "kl": 0.0,
      "learning_rate": 2.334467416906569e-07,
      "logps/chosen": -151.29331970214844,
      "logps/rejected": -225.7119598388672,
      "loss": 0.2399,
      "rewards/chosen": 1.0992461442947388,
      "rewards/margins": 4.55507230758667,
      "rewards/rejected": -3.4558262825012207,
      "step": 2037
    },
    {
      "epoch": 0.53,
      "grad_norm": 33.59539031982422,
      "kl": 0.0,
      "learning_rate": 2.333158858937451e-07,
      "logps/chosen": -179.46453857421875,
      "logps/rejected": -236.49371337890625,
      "loss": 0.3458,
      "rewards/chosen": 0.380405068397522,
      "rewards/margins": 3.274649143218994,
      "rewards/rejected": -2.8942441940307617,
      "step": 2038
    },
    {
      "epoch": 0.53,
      "grad_norm": 26.793468475341797,
      "kl": 0.0,
      "learning_rate": 2.3318503009683327e-07,
      "logps/chosen": -223.2361602783203,
      "logps/rejected": -212.54644775390625,
      "loss": 0.2385,
      "rewards/chosen": 1.6074248552322388,
      "rewards/margins": 3.5587990283966064,
      "rewards/rejected": -1.9513741731643677,
      "step": 2039
    },
    {
      "epoch": 0.53,
      "grad_norm": 25.713003158569336,
      "kl": 0.0,
      "learning_rate": 2.3305417429992146e-07,
      "logps/chosen": -252.0117645263672,
      "logps/rejected": -209.342529296875,
      "loss": 0.2839,
      "rewards/chosen": -0.6636760830879211,
      "rewards/margins": 3.629101037979126,
      "rewards/rejected": -4.292777061462402,
      "step": 2040
    },
    {
      "epoch": 0.53,
      "grad_norm": 26.062259674072266,
      "kl": 0.0,
      "learning_rate": 2.3292331850300966e-07,
      "logps/chosen": -199.529296875,
      "logps/rejected": -284.480712890625,
      "loss": 0.1446,
      "rewards/chosen": 3.976663112640381,
      "rewards/margins": 10.23494815826416,
      "rewards/rejected": -6.258285045623779,
      "step": 2041
    },
    {
      "epoch": 0.53,
      "grad_norm": 40.40095520019531,
      "kl": 0.0,
      "learning_rate": 2.3279246270609788e-07,
      "logps/chosen": -225.0578155517578,
      "logps/rejected": -151.56898498535156,
      "loss": 0.306,
      "rewards/chosen": -0.09351640939712524,
      "rewards/margins": 3.9907732009887695,
      "rewards/rejected": -4.08428955078125,
      "step": 2042
    },
    {
      "epoch": 0.53,
      "grad_norm": 43.919803619384766,
      "kl": 0.0,
      "learning_rate": 2.3266160690918608e-07,
      "logps/chosen": -231.98822021484375,
      "logps/rejected": -263.60198974609375,
      "loss": 0.3163,
      "rewards/chosen": 0.3000814914703369,
      "rewards/margins": 5.877663612365723,
      "rewards/rejected": -5.577582359313965,
      "step": 2043
    },
    {
      "epoch": 0.53,
      "grad_norm": 39.102787017822266,
      "kl": 0.0,
      "learning_rate": 2.3253075111227427e-07,
      "logps/chosen": -257.454833984375,
      "logps/rejected": -203.32229614257812,
      "loss": 0.2321,
      "rewards/chosen": 0.9678674936294556,
      "rewards/margins": 5.673610210418701,
      "rewards/rejected": -4.705742835998535,
      "step": 2044
    },
    {
      "epoch": 0.54,
      "grad_norm": 25.78129005432129,
      "kl": 0.0,
      "learning_rate": 2.3239989531536247e-07,
      "logps/chosen": -182.70339965820312,
      "logps/rejected": -234.2674102783203,
      "loss": 0.1706,
      "rewards/chosen": 1.4865150451660156,
      "rewards/margins": 5.233919620513916,
      "rewards/rejected": -3.7474045753479004,
      "step": 2045
    },
    {
      "epoch": 0.54,
      "grad_norm": 35.66816329956055,
      "kl": 0.0,
      "learning_rate": 2.3226903951845064e-07,
      "logps/chosen": -217.499267578125,
      "logps/rejected": -160.31105041503906,
      "loss": 0.309,
      "rewards/chosen": -0.4123508632183075,
      "rewards/margins": 2.4263663291931152,
      "rewards/rejected": -2.838717222213745,
      "step": 2046
    },
    {
      "epoch": 0.54,
      "grad_norm": 51.080989837646484,
      "kl": 0.0,
      "learning_rate": 2.3213818372153883e-07,
      "logps/chosen": -257.92095947265625,
      "logps/rejected": -225.64663696289062,
      "loss": 0.3738,
      "rewards/chosen": 0.7672978639602661,
      "rewards/margins": 2.8424081802368164,
      "rewards/rejected": -2.0751101970672607,
      "step": 2047
    },
    {
      "epoch": 0.54,
      "grad_norm": 36.70692443847656,
      "kl": 0.0,
      "learning_rate": 2.3200732792462706e-07,
      "logps/chosen": -291.3275146484375,
      "logps/rejected": -155.7629852294922,
      "loss": 0.366,
      "rewards/chosen": 1.1201237440109253,
      "rewards/margins": 2.4310898780822754,
      "rewards/rejected": -1.3109660148620605,
      "step": 2048
    },
    {
      "epoch": 0.54,
      "grad_norm": 40.8702392578125,
      "kl": 0.0,
      "learning_rate": 2.3187647212771525e-07,
      "logps/chosen": -267.50341796875,
      "logps/rejected": -201.24693298339844,
      "loss": 0.3107,
      "rewards/chosen": 1.1985299587249756,
      "rewards/margins": 4.256721496582031,
      "rewards/rejected": -3.0581917762756348,
      "step": 2049
    },
    {
      "epoch": 0.54,
      "grad_norm": 51.903621673583984,
      "kl": 0.0,
      "learning_rate": 2.3174561633080345e-07,
      "logps/chosen": -294.9173278808594,
      "logps/rejected": -271.4747314453125,
      "loss": 0.2116,
      "rewards/chosen": 0.5645659565925598,
      "rewards/margins": 3.08552885055542,
      "rewards/rejected": -2.520962953567505,
      "step": 2050
    },
    {
      "epoch": 0.54,
      "grad_norm": 36.193016052246094,
      "kl": 0.0,
      "learning_rate": 2.3161476053389165e-07,
      "logps/chosen": -345.0677490234375,
      "logps/rejected": -302.47991943359375,
      "loss": 0.1589,
      "rewards/chosen": 2.930142641067505,
      "rewards/margins": 6.179523468017578,
      "rewards/rejected": -3.2493808269500732,
      "step": 2051
    },
    {
      "epoch": 0.54,
      "grad_norm": 35.836212158203125,
      "kl": 0.0,
      "learning_rate": 2.3148390473697984e-07,
      "logps/chosen": -274.1582336425781,
      "logps/rejected": -256.15203857421875,
      "loss": 0.357,
      "rewards/chosen": 0.9071843028068542,
      "rewards/margins": 2.8665997982025146,
      "rewards/rejected": -1.9594155550003052,
      "step": 2052
    },
    {
      "epoch": 0.54,
      "grad_norm": 33.03522872924805,
      "kl": 0.0,
      "learning_rate": 2.3135304894006804e-07,
      "logps/chosen": -148.79110717773438,
      "logps/rejected": -246.0092315673828,
      "loss": 0.2571,
      "rewards/chosen": 1.2804070711135864,
      "rewards/margins": 2.9049646854400635,
      "rewards/rejected": -1.624557614326477,
      "step": 2053
    },
    {
      "epoch": 0.54,
      "grad_norm": 26.965436935424805,
      "kl": 0.0,
      "learning_rate": 2.312221931431562e-07,
      "logps/chosen": -234.40638732910156,
      "logps/rejected": -242.86436462402344,
      "loss": 0.1664,
      "rewards/chosen": 0.8637860417366028,
      "rewards/margins": 4.613651275634766,
      "rewards/rejected": -3.7498650550842285,
      "step": 2054
    },
    {
      "epoch": 0.54,
      "grad_norm": 37.29470443725586,
      "kl": 0.0,
      "learning_rate": 2.3109133734624443e-07,
      "logps/chosen": -172.05210876464844,
      "logps/rejected": -238.47952270507812,
      "loss": 0.2839,
      "rewards/chosen": 0.9011194705963135,
      "rewards/margins": 3.0700299739837646,
      "rewards/rejected": -2.168910503387451,
      "step": 2055
    },
    {
      "epoch": 0.54,
      "grad_norm": 25.493192672729492,
      "kl": 0.0,
      "learning_rate": 2.3096048154933263e-07,
      "logps/chosen": -215.99545288085938,
      "logps/rejected": -243.0108184814453,
      "loss": 0.3314,
      "rewards/chosen": 0.6691614389419556,
      "rewards/margins": 4.748775959014893,
      "rewards/rejected": -4.079614639282227,
      "step": 2056
    },
    {
      "epoch": 0.54,
      "grad_norm": 39.543739318847656,
      "kl": 0.0,
      "learning_rate": 2.3082962575242082e-07,
      "logps/chosen": -262.2772521972656,
      "logps/rejected": -238.32432556152344,
      "loss": 0.4089,
      "rewards/chosen": -0.20746786892414093,
      "rewards/margins": 1.0184450149536133,
      "rewards/rejected": -1.2259129285812378,
      "step": 2057
    },
    {
      "epoch": 0.54,
      "grad_norm": 41.07046890258789,
      "kl": 0.0,
      "learning_rate": 2.3069876995550902e-07,
      "logps/chosen": -245.2674560546875,
      "logps/rejected": -271.6712646484375,
      "loss": 0.337,
      "rewards/chosen": 0.7155461311340332,
      "rewards/margins": 3.8527755737304688,
      "rewards/rejected": -3.1372294425964355,
      "step": 2058
    },
    {
      "epoch": 0.54,
      "grad_norm": 34.27461624145508,
      "kl": 0.0,
      "learning_rate": 2.3056791415859721e-07,
      "logps/chosen": -137.6658935546875,
      "logps/rejected": -287.7002868652344,
      "loss": 0.3097,
      "rewards/chosen": 0.8914886116981506,
      "rewards/margins": 4.4790730476379395,
      "rewards/rejected": -3.5875844955444336,
      "step": 2059
    },
    {
      "epoch": 0.54,
      "grad_norm": 34.30635452270508,
      "kl": 0.0,
      "learning_rate": 2.304370583616854e-07,
      "logps/chosen": -132.6312255859375,
      "logps/rejected": -329.7675476074219,
      "loss": 0.3,
      "rewards/chosen": 0.9626002311706543,
      "rewards/margins": 3.7751975059509277,
      "rewards/rejected": -2.8125972747802734,
      "step": 2060
    },
    {
      "epoch": 0.54,
      "grad_norm": 30.666933059692383,
      "kl": 0.0,
      "learning_rate": 2.3030620256477363e-07,
      "logps/chosen": -176.3401336669922,
      "logps/rejected": -278.4073486328125,
      "loss": 0.2786,
      "rewards/chosen": 0.10558772087097168,
      "rewards/margins": 2.623535394668579,
      "rewards/rejected": -2.5179476737976074,
      "step": 2061
    },
    {
      "epoch": 0.54,
      "grad_norm": 27.88566780090332,
      "kl": 0.0,
      "learning_rate": 2.301753467678618e-07,
      "logps/chosen": -157.7092742919922,
      "logps/rejected": -206.13185119628906,
      "loss": 0.3859,
      "rewards/chosen": 1.1426632404327393,
      "rewards/margins": 2.981151580810547,
      "rewards/rejected": -1.8384883403778076,
      "step": 2062
    },
    {
      "epoch": 0.54,
      "grad_norm": 30.36982536315918,
      "kl": 0.0,
      "learning_rate": 2.3004449097095e-07,
      "logps/chosen": -191.57711791992188,
      "logps/rejected": -152.2157440185547,
      "loss": 0.2586,
      "rewards/chosen": 1.6766765117645264,
      "rewards/margins": 3.6210598945617676,
      "rewards/rejected": -1.9443833827972412,
      "step": 2063
    },
    {
      "epoch": 0.54,
      "grad_norm": 38.566322326660156,
      "kl": 0.0,
      "learning_rate": 2.299136351740382e-07,
      "logps/chosen": -198.1544189453125,
      "logps/rejected": -227.01773071289062,
      "loss": 0.3586,
      "rewards/chosen": 0.406218022108078,
      "rewards/margins": 3.210958957672119,
      "rewards/rejected": -2.8047409057617188,
      "step": 2064
    },
    {
      "epoch": 0.54,
      "grad_norm": 36.628318786621094,
      "kl": 0.0,
      "learning_rate": 2.297827793771264e-07,
      "logps/chosen": -145.6483154296875,
      "logps/rejected": -219.84780883789062,
      "loss": 0.3084,
      "rewards/chosen": 0.7460767030715942,
      "rewards/margins": 3.5765395164489746,
      "rewards/rejected": -2.830462694168091,
      "step": 2065
    },
    {
      "epoch": 0.54,
      "grad_norm": 23.9617919921875,
      "kl": 0.0,
      "learning_rate": 2.296519235802146e-07,
      "logps/chosen": -118.81352996826172,
      "logps/rejected": -207.17298889160156,
      "loss": 0.3238,
      "rewards/chosen": 0.7397387623786926,
      "rewards/margins": 2.863344192504883,
      "rewards/rejected": -2.123605489730835,
      "step": 2066
    },
    {
      "epoch": 0.54,
      "grad_norm": 40.05659866333008,
      "kl": 0.0,
      "learning_rate": 2.295210677833028e-07,
      "logps/chosen": -155.1406707763672,
      "logps/rejected": -207.32810974121094,
      "loss": 0.2911,
      "rewards/chosen": 0.2956763803958893,
      "rewards/margins": 2.1852457523345947,
      "rewards/rejected": -1.8895692825317383,
      "step": 2067
    },
    {
      "epoch": 0.54,
      "grad_norm": 38.00597381591797,
      "kl": 0.0,
      "learning_rate": 2.29390211986391e-07,
      "logps/chosen": -222.1738739013672,
      "logps/rejected": -213.74107360839844,
      "loss": 0.2449,
      "rewards/chosen": 1.4015142917633057,
      "rewards/margins": 3.836189031600952,
      "rewards/rejected": -2.4346747398376465,
      "step": 2068
    },
    {
      "epoch": 0.54,
      "grad_norm": 39.29631805419922,
      "kl": 0.0,
      "learning_rate": 2.292593561894792e-07,
      "logps/chosen": -289.01568603515625,
      "logps/rejected": -258.3611755371094,
      "loss": 0.3935,
      "rewards/chosen": -0.3644639253616333,
      "rewards/margins": 3.4178175926208496,
      "rewards/rejected": -3.7822813987731934,
      "step": 2069
    },
    {
      "epoch": 0.54,
      "grad_norm": 32.35689163208008,
      "kl": 0.0,
      "learning_rate": 2.2912850039256737e-07,
      "logps/chosen": -187.1671142578125,
      "logps/rejected": -373.15692138671875,
      "loss": 0.3169,
      "rewards/chosen": 0.5017662644386292,
      "rewards/margins": 3.3714005947113037,
      "rewards/rejected": -2.8696343898773193,
      "step": 2070
    },
    {
      "epoch": 0.54,
      "grad_norm": 32.022483825683594,
      "kl": 0.0,
      "learning_rate": 2.2899764459565557e-07,
      "logps/chosen": -270.5025634765625,
      "logps/rejected": -257.8646240234375,
      "loss": 0.3022,
      "rewards/chosen": 0.6621522903442383,
      "rewards/margins": 4.498281478881836,
      "rewards/rejected": -3.8361291885375977,
      "step": 2071
    },
    {
      "epoch": 0.54,
      "grad_norm": 28.791711807250977,
      "kl": 0.0,
      "learning_rate": 2.2886678879874376e-07,
      "logps/chosen": -190.7532501220703,
      "logps/rejected": -294.8187561035156,
      "loss": 0.3575,
      "rewards/chosen": -1.2482961416244507,
      "rewards/margins": 0.8909574747085571,
      "rewards/rejected": -2.139253616333008,
      "step": 2072
    },
    {
      "epoch": 0.54,
      "grad_norm": 34.729156494140625,
      "kl": 0.0,
      "learning_rate": 2.2873593300183196e-07,
      "logps/chosen": -209.54147338867188,
      "logps/rejected": -211.24008178710938,
      "loss": 0.3454,
      "rewards/chosen": 2.8879120349884033,
      "rewards/margins": 4.048269748687744,
      "rewards/rejected": -1.1603578329086304,
      "step": 2073
    },
    {
      "epoch": 0.54,
      "grad_norm": 34.23164367675781,
      "kl": 0.0,
      "learning_rate": 2.2860507720492018e-07,
      "logps/chosen": -121.15304565429688,
      "logps/rejected": -223.7058868408203,
      "loss": 0.282,
      "rewards/chosen": 0.3577978014945984,
      "rewards/margins": 2.298954486846924,
      "rewards/rejected": -1.9411567449569702,
      "step": 2074
    },
    {
      "epoch": 0.54,
      "grad_norm": 30.46665382385254,
      "kl": 0.0,
      "learning_rate": 2.2847422140800838e-07,
      "logps/chosen": -169.19219970703125,
      "logps/rejected": -232.3114013671875,
      "loss": 0.3508,
      "rewards/chosen": 0.5962676405906677,
      "rewards/margins": 2.8509838581085205,
      "rewards/rejected": -2.254716157913208,
      "step": 2075
    },
    {
      "epoch": 0.54,
      "grad_norm": 24.444530487060547,
      "kl": 0.0,
      "learning_rate": 2.2834336561109657e-07,
      "logps/chosen": -166.1984100341797,
      "logps/rejected": -266.15338134765625,
      "loss": 0.1984,
      "rewards/chosen": 1.1371859312057495,
      "rewards/margins": 5.206028938293457,
      "rewards/rejected": -4.068842887878418,
      "step": 2076
    },
    {
      "epoch": 0.54,
      "grad_norm": 43.3023796081543,
      "kl": 0.0,
      "learning_rate": 2.2821250981418474e-07,
      "logps/chosen": -162.20846557617188,
      "logps/rejected": -226.7047882080078,
      "loss": 0.2923,
      "rewards/chosen": 0.9513958692550659,
      "rewards/margins": 2.763209581375122,
      "rewards/rejected": -1.8118137121200562,
      "step": 2077
    },
    {
      "epoch": 0.54,
      "grad_norm": 43.34999465942383,
      "kl": 0.0,
      "learning_rate": 2.2808165401727294e-07,
      "logps/chosen": -253.7343292236328,
      "logps/rejected": -234.84107971191406,
      "loss": 0.2351,
      "rewards/chosen": 0.7632766962051392,
      "rewards/margins": 4.0219950675964355,
      "rewards/rejected": -3.258718490600586,
      "step": 2078
    },
    {
      "epoch": 0.54,
      "grad_norm": 32.680931091308594,
      "kl": 0.0,
      "learning_rate": 2.2795079822036114e-07,
      "logps/chosen": -121.70050048828125,
      "logps/rejected": -294.01043701171875,
      "loss": 0.3483,
      "rewards/chosen": 1.3794186115264893,
      "rewards/margins": 3.346170663833618,
      "rewards/rejected": -1.966752052307129,
      "step": 2079
    },
    {
      "epoch": 0.54,
      "grad_norm": 33.10261917114258,
      "kl": 0.0,
      "learning_rate": 2.2781994242344936e-07,
      "logps/chosen": -243.3766632080078,
      "logps/rejected": -152.2445526123047,
      "loss": 0.2724,
      "rewards/chosen": 1.2106717824935913,
      "rewards/margins": 4.09446907043457,
      "rewards/rejected": -2.8837971687316895,
      "step": 2080
    },
    {
      "epoch": 0.54,
      "grad_norm": 35.59720230102539,
      "kl": 0.0,
      "learning_rate": 2.2768908662653755e-07,
      "logps/chosen": -259.7003173828125,
      "logps/rejected": -189.43997192382812,
      "loss": 0.255,
      "rewards/chosen": 0.9261613488197327,
      "rewards/margins": 3.8873603343963623,
      "rewards/rejected": -2.9611990451812744,
      "step": 2081
    },
    {
      "epoch": 0.54,
      "grad_norm": 37.25908660888672,
      "kl": 0.0,
      "learning_rate": 2.2755823082962575e-07,
      "logps/chosen": -278.11016845703125,
      "logps/rejected": -288.7530517578125,
      "loss": 0.2594,
      "rewards/chosen": 2.6176414489746094,
      "rewards/margins": 7.874884605407715,
      "rewards/rejected": -5.2572431564331055,
      "step": 2082
    },
    {
      "epoch": 0.55,
      "grad_norm": 42.3132438659668,
      "kl": 0.0,
      "learning_rate": 2.2742737503271395e-07,
      "logps/chosen": -223.02183532714844,
      "logps/rejected": -247.94920349121094,
      "loss": 0.2972,
      "rewards/chosen": 0.5573528409004211,
      "rewards/margins": 4.124536991119385,
      "rewards/rejected": -3.5671842098236084,
      "step": 2083
    },
    {
      "epoch": 0.55,
      "grad_norm": 43.50983810424805,
      "kl": 0.0,
      "learning_rate": 2.2729651923580214e-07,
      "logps/chosen": -209.89193725585938,
      "logps/rejected": -259.04034423828125,
      "loss": 0.3302,
      "rewards/chosen": -0.0854308009147644,
      "rewards/margins": 4.501287937164307,
      "rewards/rejected": -4.586718559265137,
      "step": 2084
    },
    {
      "epoch": 0.55,
      "grad_norm": 34.85049057006836,
      "kl": 0.0,
      "learning_rate": 2.271656634388903e-07,
      "logps/chosen": -175.64346313476562,
      "logps/rejected": -311.0813293457031,
      "loss": 0.1309,
      "rewards/chosen": 0.5914903283119202,
      "rewards/margins": 6.049227714538574,
      "rewards/rejected": -5.457737445831299,
      "step": 2085
    },
    {
      "epoch": 0.55,
      "grad_norm": 32.070823669433594,
      "kl": 0.0,
      "learning_rate": 2.270348076419785e-07,
      "logps/chosen": -309.7897033691406,
      "logps/rejected": -279.0724792480469,
      "loss": 0.2345,
      "rewards/chosen": -0.2845461070537567,
      "rewards/margins": 2.9219210147857666,
      "rewards/rejected": -3.2064671516418457,
      "step": 2086
    },
    {
      "epoch": 0.55,
      "grad_norm": 36.27039337158203,
      "kl": 0.0,
      "learning_rate": 2.2690395184506673e-07,
      "logps/chosen": -227.92718505859375,
      "logps/rejected": -302.98028564453125,
      "loss": 0.3127,
      "rewards/chosen": 2.268153190612793,
      "rewards/margins": 4.073305130004883,
      "rewards/rejected": -1.8051517009735107,
      "step": 2087
    },
    {
      "epoch": 0.55,
      "grad_norm": 28.964229583740234,
      "kl": 0.0,
      "learning_rate": 2.2677309604815493e-07,
      "logps/chosen": -165.77523803710938,
      "logps/rejected": -276.9595031738281,
      "loss": 0.2121,
      "rewards/chosen": 1.7321010828018188,
      "rewards/margins": 5.551499843597412,
      "rewards/rejected": -3.819398880004883,
      "step": 2088
    },
    {
      "epoch": 0.55,
      "grad_norm": 41.056583404541016,
      "kl": 0.0,
      "learning_rate": 2.2664224025124312e-07,
      "logps/chosen": -261.00238037109375,
      "logps/rejected": -283.1644287109375,
      "loss": 0.3032,
      "rewards/chosen": -0.3032938838005066,
      "rewards/margins": 3.9948885440826416,
      "rewards/rejected": -4.298182487487793,
      "step": 2089
    },
    {
      "epoch": 0.55,
      "grad_norm": 38.75579071044922,
      "kl": 0.0,
      "learning_rate": 2.2651138445433132e-07,
      "logps/chosen": -139.2954864501953,
      "logps/rejected": -168.7962646484375,
      "loss": 0.3067,
      "rewards/chosen": 0.5684501528739929,
      "rewards/margins": 3.4893176555633545,
      "rewards/rejected": -2.920867443084717,
      "step": 2090
    },
    {
      "epoch": 0.55,
      "grad_norm": 30.218616485595703,
      "kl": 0.0,
      "learning_rate": 2.2638052865741952e-07,
      "logps/chosen": -153.24710083007812,
      "logps/rejected": -190.56497192382812,
      "loss": 0.2842,
      "rewards/chosen": 1.887859582901001,
      "rewards/margins": 4.847748756408691,
      "rewards/rejected": -2.9598894119262695,
      "step": 2091
    },
    {
      "epoch": 0.55,
      "grad_norm": 37.584842681884766,
      "kl": 0.0,
      "learning_rate": 2.262496728605077e-07,
      "logps/chosen": -269.6733703613281,
      "logps/rejected": -246.76669311523438,
      "loss": 0.274,
      "rewards/chosen": 0.8811774253845215,
      "rewards/margins": 4.343387603759766,
      "rewards/rejected": -3.462209939956665,
      "step": 2092
    },
    {
      "epoch": 0.55,
      "grad_norm": 31.532167434692383,
      "kl": 0.0,
      "learning_rate": 2.261188170635959e-07,
      "logps/chosen": -225.5703887939453,
      "logps/rejected": -302.420654296875,
      "loss": 0.2572,
      "rewards/chosen": -0.5330899357795715,
      "rewards/margins": 3.7925503253936768,
      "rewards/rejected": -4.3256402015686035,
      "step": 2093
    },
    {
      "epoch": 0.55,
      "grad_norm": 36.89503479003906,
      "kl": 0.0,
      "learning_rate": 2.259879612666841e-07,
      "logps/chosen": -272.6938781738281,
      "logps/rejected": -233.2865753173828,
      "loss": 0.2564,
      "rewards/chosen": 0.8622837066650391,
      "rewards/margins": 3.173910617828369,
      "rewards/rejected": -2.31162691116333,
      "step": 2094
    },
    {
      "epoch": 0.55,
      "grad_norm": 42.40085983276367,
      "kl": 0.0,
      "learning_rate": 2.258571054697723e-07,
      "logps/chosen": -177.05694580078125,
      "logps/rejected": -227.2252197265625,
      "loss": 0.3312,
      "rewards/chosen": 1.544360637664795,
      "rewards/margins": 2.9416725635528564,
      "rewards/rejected": -1.3973119258880615,
      "step": 2095
    },
    {
      "epoch": 0.55,
      "grad_norm": 38.93367004394531,
      "kl": 0.0,
      "learning_rate": 2.257262496728605e-07,
      "logps/chosen": -229.6371307373047,
      "logps/rejected": -266.69879150390625,
      "loss": 0.3313,
      "rewards/chosen": 1.1398550271987915,
      "rewards/margins": 5.1608171463012695,
      "rewards/rejected": -4.020962238311768,
      "step": 2096
    },
    {
      "epoch": 0.55,
      "grad_norm": 34.740570068359375,
      "kl": 0.0,
      "learning_rate": 2.255953938759487e-07,
      "logps/chosen": -147.47230529785156,
      "logps/rejected": -291.76287841796875,
      "loss": 0.2119,
      "rewards/chosen": 1.6616230010986328,
      "rewards/margins": 5.7488112449646,
      "rewards/rejected": -4.087188243865967,
      "step": 2097
    },
    {
      "epoch": 0.55,
      "grad_norm": 39.187652587890625,
      "kl": 0.0,
      "learning_rate": 2.254645380790369e-07,
      "logps/chosen": -196.1621856689453,
      "logps/rejected": -310.6141357421875,
      "loss": 0.2301,
      "rewards/chosen": 1.0132759809494019,
      "rewards/margins": 4.632660865783691,
      "rewards/rejected": -3.619384765625,
      "step": 2098
    },
    {
      "epoch": 0.55,
      "grad_norm": 28.46122932434082,
      "kl": 0.0,
      "learning_rate": 2.253336822821251e-07,
      "logps/chosen": -231.9829559326172,
      "logps/rejected": -218.917236328125,
      "loss": 0.2882,
      "rewards/chosen": -0.4349358379840851,
      "rewards/margins": 3.075749158859253,
      "rewards/rejected": -3.5106849670410156,
      "step": 2099
    },
    {
      "epoch": 0.55,
      "grad_norm": 28.59282875061035,
      "kl": 0.0,
      "learning_rate": 2.252028264852133e-07,
      "logps/chosen": -183.28744506835938,
      "logps/rejected": -218.506591796875,
      "loss": 0.2559,
      "rewards/chosen": 1.6624376773834229,
      "rewards/margins": 3.909499168395996,
      "rewards/rejected": -2.2470614910125732,
      "step": 2100
    },
    {
      "epoch": 0.55,
      "grad_norm": 37.74734878540039,
      "kl": 0.0,
      "learning_rate": 2.2507197068830148e-07,
      "logps/chosen": -339.4798889160156,
      "logps/rejected": -272.19970703125,
      "loss": 0.2443,
      "rewards/chosen": 2.859189033508301,
      "rewards/margins": 6.35294246673584,
      "rewards/rejected": -3.49375319480896,
      "step": 2101
    },
    {
      "epoch": 0.55,
      "grad_norm": 30.180173873901367,
      "kl": 0.0,
      "learning_rate": 2.2494111489138967e-07,
      "logps/chosen": -261.21405029296875,
      "logps/rejected": -268.70416259765625,
      "loss": 0.1482,
      "rewards/chosen": 1.0338104963302612,
      "rewards/margins": 5.0800395011901855,
      "rewards/rejected": -4.046228885650635,
      "step": 2102
    },
    {
      "epoch": 0.55,
      "grad_norm": 31.450355529785156,
      "kl": 0.0,
      "learning_rate": 2.2481025909447787e-07,
      "logps/chosen": -139.55465698242188,
      "logps/rejected": -239.27542114257812,
      "loss": 0.2741,
      "rewards/chosen": 0.44167831540107727,
      "rewards/margins": 3.0485122203826904,
      "rewards/rejected": -2.6068339347839355,
      "step": 2103
    },
    {
      "epoch": 0.55,
      "grad_norm": 47.74028396606445,
      "kl": 0.0,
      "learning_rate": 2.2467940329756606e-07,
      "logps/chosen": -274.1092529296875,
      "logps/rejected": -224.17535400390625,
      "loss": 0.3558,
      "rewards/chosen": -0.06706207990646362,
      "rewards/margins": 2.3114514350891113,
      "rewards/rejected": -2.3785135746002197,
      "step": 2104
    },
    {
      "epoch": 0.55,
      "grad_norm": 37.560523986816406,
      "kl": 0.0,
      "learning_rate": 2.2454854750065426e-07,
      "logps/chosen": -214.400146484375,
      "logps/rejected": -277.4588317871094,
      "loss": 0.1941,
      "rewards/chosen": 0.6747554540634155,
      "rewards/margins": 4.1736907958984375,
      "rewards/rejected": -3.4989354610443115,
      "step": 2105
    },
    {
      "epoch": 0.55,
      "grad_norm": 31.483280181884766,
      "kl": 0.0,
      "learning_rate": 2.2441769170374248e-07,
      "logps/chosen": -216.14453125,
      "logps/rejected": -244.1451416015625,
      "loss": 0.3242,
      "rewards/chosen": 0.3833763301372528,
      "rewards/margins": 4.8582916259765625,
      "rewards/rejected": -4.474915504455566,
      "step": 2106
    },
    {
      "epoch": 0.55,
      "grad_norm": 26.778085708618164,
      "kl": 0.0,
      "learning_rate": 2.2428683590683068e-07,
      "logps/chosen": -176.0457305908203,
      "logps/rejected": -272.55145263671875,
      "loss": 0.1509,
      "rewards/chosen": 1.2023561000823975,
      "rewards/margins": 4.90726375579834,
      "rewards/rejected": -3.7049076557159424,
      "step": 2107
    },
    {
      "epoch": 0.55,
      "grad_norm": 39.008323669433594,
      "kl": 0.0,
      "learning_rate": 2.2415598010991885e-07,
      "logps/chosen": -220.84750366210938,
      "logps/rejected": -272.07684326171875,
      "loss": 0.3226,
      "rewards/chosen": 1.5920367240905762,
      "rewards/margins": 4.589495658874512,
      "rewards/rejected": -2.9974586963653564,
      "step": 2108
    },
    {
      "epoch": 0.55,
      "grad_norm": 34.33102798461914,
      "kl": 0.0,
      "learning_rate": 2.2402512431300705e-07,
      "logps/chosen": -156.10195922851562,
      "logps/rejected": -184.01315307617188,
      "loss": 0.3066,
      "rewards/chosen": 1.0926826000213623,
      "rewards/margins": 3.315559148788452,
      "rewards/rejected": -2.22287654876709,
      "step": 2109
    },
    {
      "epoch": 0.55,
      "grad_norm": 35.68468475341797,
      "kl": 0.0,
      "learning_rate": 2.2389426851609524e-07,
      "logps/chosen": -184.97727966308594,
      "logps/rejected": -323.71539306640625,
      "loss": 0.2343,
      "rewards/chosen": 0.557456910610199,
      "rewards/margins": 4.741086006164551,
      "rewards/rejected": -4.183629035949707,
      "step": 2110
    },
    {
      "epoch": 0.55,
      "grad_norm": 40.77687072753906,
      "kl": 0.0,
      "learning_rate": 2.2376341271918344e-07,
      "logps/chosen": -243.48583984375,
      "logps/rejected": -239.84727478027344,
      "loss": 0.3424,
      "rewards/chosen": -0.6909189224243164,
      "rewards/margins": 1.2094755172729492,
      "rewards/rejected": -1.9003944396972656,
      "step": 2111
    },
    {
      "epoch": 0.55,
      "grad_norm": 33.72917556762695,
      "kl": 0.0,
      "learning_rate": 2.2363255692227166e-07,
      "logps/chosen": -245.06240844726562,
      "logps/rejected": -277.0432434082031,
      "loss": 0.2041,
      "rewards/chosen": 0.7709704637527466,
      "rewards/margins": 4.738353729248047,
      "rewards/rejected": -3.96738338470459,
      "step": 2112
    },
    {
      "epoch": 0.55,
      "grad_norm": 25.89661979675293,
      "kl": 0.0,
      "learning_rate": 2.2350170112535986e-07,
      "logps/chosen": -227.06411743164062,
      "logps/rejected": -211.87493896484375,
      "loss": 0.2515,
      "rewards/chosen": 2.6791181564331055,
      "rewards/margins": 5.6696014404296875,
      "rewards/rejected": -2.990483045578003,
      "step": 2113
    },
    {
      "epoch": 0.55,
      "grad_norm": 39.84913635253906,
      "kl": 0.0,
      "learning_rate": 2.2337084532844805e-07,
      "logps/chosen": -160.59805297851562,
      "logps/rejected": -188.2767333984375,
      "loss": 0.3441,
      "rewards/chosen": -0.2964557111263275,
      "rewards/margins": 1.1534799337387085,
      "rewards/rejected": -1.4499356746673584,
      "step": 2114
    },
    {
      "epoch": 0.55,
      "grad_norm": 36.11937713623047,
      "kl": 0.0,
      "learning_rate": 2.2323998953153625e-07,
      "logps/chosen": -204.82492065429688,
      "logps/rejected": -214.0093536376953,
      "loss": 0.2517,
      "rewards/chosen": 1.5596061944961548,
      "rewards/margins": 4.491985321044922,
      "rewards/rejected": -2.9323790073394775,
      "step": 2115
    },
    {
      "epoch": 0.55,
      "grad_norm": 39.927921295166016,
      "kl": 0.0,
      "learning_rate": 2.2310913373462442e-07,
      "logps/chosen": -169.34048461914062,
      "logps/rejected": -200.87672424316406,
      "loss": 0.3675,
      "rewards/chosen": -0.863287091255188,
      "rewards/margins": 0.387457013130188,
      "rewards/rejected": -1.250744104385376,
      "step": 2116
    },
    {
      "epoch": 0.55,
      "grad_norm": 24.469409942626953,
      "kl": 0.0,
      "learning_rate": 2.2297827793771261e-07,
      "logps/chosen": -157.0042724609375,
      "logps/rejected": -232.30807495117188,
      "loss": 0.2429,
      "rewards/chosen": -0.11154678463935852,
      "rewards/margins": 5.050269603729248,
      "rewards/rejected": -5.161816596984863,
      "step": 2117
    },
    {
      "epoch": 0.55,
      "grad_norm": 38.49171447753906,
      "kl": 0.0,
      "learning_rate": 2.228474221408008e-07,
      "logps/chosen": -289.55157470703125,
      "logps/rejected": -282.3967590332031,
      "loss": 0.2361,
      "rewards/chosen": 1.7181639671325684,
      "rewards/margins": 4.490333557128906,
      "rewards/rejected": -2.772169589996338,
      "step": 2118
    },
    {
      "epoch": 0.55,
      "grad_norm": 26.931310653686523,
      "kl": 0.0,
      "learning_rate": 2.2271656634388903e-07,
      "logps/chosen": -276.5685119628906,
      "logps/rejected": -287.1419677734375,
      "loss": 0.2312,
      "rewards/chosen": 1.2769911289215088,
      "rewards/margins": 5.967591285705566,
      "rewards/rejected": -4.690600395202637,
      "step": 2119
    },
    {
      "epoch": 0.55,
      "grad_norm": 42.65892791748047,
      "kl": 0.0,
      "learning_rate": 2.2258571054697723e-07,
      "logps/chosen": -200.8690643310547,
      "logps/rejected": -269.5424499511719,
      "loss": 0.2979,
      "rewards/chosen": -0.08438437432050705,
      "rewards/margins": 4.158094882965088,
      "rewards/rejected": -4.24247932434082,
      "step": 2120
    },
    {
      "epoch": 0.56,
      "grad_norm": 31.81760025024414,
      "kl": 0.0,
      "learning_rate": 2.2245485475006542e-07,
      "logps/chosen": -218.39447021484375,
      "logps/rejected": -310.91448974609375,
      "loss": 0.273,
      "rewards/chosen": 0.7956982254981995,
      "rewards/margins": 4.268186092376709,
      "rewards/rejected": -3.472487688064575,
      "step": 2121
    },
    {
      "epoch": 0.56,
      "grad_norm": 27.217666625976562,
      "kl": 0.0,
      "learning_rate": 2.2232399895315362e-07,
      "logps/chosen": -173.05572509765625,
      "logps/rejected": -205.1007080078125,
      "loss": 0.3612,
      "rewards/chosen": 0.7939292788505554,
      "rewards/margins": 3.3084661960601807,
      "rewards/rejected": -2.5145368576049805,
      "step": 2122
    },
    {
      "epoch": 0.56,
      "grad_norm": 39.9664306640625,
      "kl": 0.0,
      "learning_rate": 2.2219314315624182e-07,
      "logps/chosen": -266.86346435546875,
      "logps/rejected": -286.49920654296875,
      "loss": 0.3312,
      "rewards/chosen": 0.009727433323860168,
      "rewards/margins": 2.185347080230713,
      "rewards/rejected": -2.175619602203369,
      "step": 2123
    },
    {
      "epoch": 0.56,
      "grad_norm": 33.432987213134766,
      "kl": 0.0,
      "learning_rate": 2.2206228735933e-07,
      "logps/chosen": -192.0679931640625,
      "logps/rejected": -248.40536499023438,
      "loss": 0.1883,
      "rewards/chosen": 0.7708672881126404,
      "rewards/margins": 3.6535582542419434,
      "rewards/rejected": -2.882690906524658,
      "step": 2124
    },
    {
      "epoch": 0.56,
      "grad_norm": 37.950958251953125,
      "kl": 0.0,
      "learning_rate": 2.219314315624182e-07,
      "logps/chosen": -255.85194396972656,
      "logps/rejected": -309.26995849609375,
      "loss": 0.3021,
      "rewards/chosen": 1.773488998413086,
      "rewards/margins": 3.9557957649230957,
      "rewards/rejected": -2.1823067665100098,
      "step": 2125
    },
    {
      "epoch": 0.56,
      "grad_norm": 39.088714599609375,
      "kl": 0.0,
      "learning_rate": 2.218005757655064e-07,
      "logps/chosen": -227.0677490234375,
      "logps/rejected": -164.189697265625,
      "loss": 0.3448,
      "rewards/chosen": 0.35238975286483765,
      "rewards/margins": 2.7368478775024414,
      "rewards/rejected": -2.384458065032959,
      "step": 2126
    },
    {
      "epoch": 0.56,
      "grad_norm": 40.480316162109375,
      "kl": 0.0,
      "learning_rate": 2.216697199685946e-07,
      "logps/chosen": -213.88058471679688,
      "logps/rejected": -249.59548950195312,
      "loss": 0.4055,
      "rewards/chosen": 1.3990250825881958,
      "rewards/margins": 2.4990382194519043,
      "rewards/rejected": -1.100013017654419,
      "step": 2127
    },
    {
      "epoch": 0.56,
      "grad_norm": 38.71848678588867,
      "kl": 0.0,
      "learning_rate": 2.215388641716828e-07,
      "logps/chosen": -307.7563781738281,
      "logps/rejected": -276.81884765625,
      "loss": 0.2757,
      "rewards/chosen": -0.06617091596126556,
      "rewards/margins": 3.0361528396606445,
      "rewards/rejected": -3.1023237705230713,
      "step": 2128
    },
    {
      "epoch": 0.56,
      "grad_norm": 33.18585205078125,
      "kl": 0.0,
      "learning_rate": 2.21408008374771e-07,
      "logps/chosen": -220.02284240722656,
      "logps/rejected": -183.52706909179688,
      "loss": 0.1851,
      "rewards/chosen": 0.7266839146614075,
      "rewards/margins": 3.259298324584961,
      "rewards/rejected": -2.5326144695281982,
      "step": 2129
    },
    {
      "epoch": 0.56,
      "grad_norm": 32.663631439208984,
      "kl": 0.0,
      "learning_rate": 2.212771525778592e-07,
      "logps/chosen": -164.1979217529297,
      "logps/rejected": -256.0262145996094,
      "loss": 0.1787,
      "rewards/chosen": 0.20368297398090363,
      "rewards/margins": 4.717043399810791,
      "rewards/rejected": -4.513360500335693,
      "step": 2130
    },
    {
      "epoch": 0.56,
      "grad_norm": 28.846590042114258,
      "kl": 0.0,
      "learning_rate": 2.2114629678094736e-07,
      "logps/chosen": -181.72128295898438,
      "logps/rejected": -220.4137420654297,
      "loss": 0.2067,
      "rewards/chosen": 2.2399544715881348,
      "rewards/margins": 6.073826789855957,
      "rewards/rejected": -3.8338725566864014,
      "step": 2131
    },
    {
      "epoch": 0.56,
      "grad_norm": 36.972412109375,
      "kl": 0.0,
      "learning_rate": 2.2101544098403558e-07,
      "logps/chosen": -114.09268951416016,
      "logps/rejected": -245.62338256835938,
      "loss": 0.3456,
      "rewards/chosen": 0.64730304479599,
      "rewards/margins": 3.1397600173950195,
      "rewards/rejected": -2.4924569129943848,
      "step": 2132
    },
    {
      "epoch": 0.56,
      "grad_norm": 35.26288604736328,
      "kl": 0.0,
      "learning_rate": 2.2088458518712378e-07,
      "logps/chosen": -257.6791687011719,
      "logps/rejected": -192.79925537109375,
      "loss": 0.3397,
      "rewards/chosen": -0.32979992032051086,
      "rewards/margins": 1.9963352680206299,
      "rewards/rejected": -2.3261351585388184,
      "step": 2133
    },
    {
      "epoch": 0.56,
      "grad_norm": 36.57280349731445,
      "kl": 0.0,
      "learning_rate": 2.2075372939021197e-07,
      "logps/chosen": -187.70989990234375,
      "logps/rejected": -201.96804809570312,
      "loss": 0.3463,
      "rewards/chosen": 0.014570653438568115,
      "rewards/margins": 3.578090190887451,
      "rewards/rejected": -3.5635194778442383,
      "step": 2134
    },
    {
      "epoch": 0.56,
      "grad_norm": 27.505809783935547,
      "kl": 0.0,
      "learning_rate": 2.2062287359330017e-07,
      "logps/chosen": -225.48959350585938,
      "logps/rejected": -231.11915588378906,
      "loss": 0.2318,
      "rewards/chosen": 1.2916991710662842,
      "rewards/margins": 5.916680335998535,
      "rewards/rejected": -4.62498140335083,
      "step": 2135
    },
    {
      "epoch": 0.56,
      "grad_norm": 30.881328582763672,
      "kl": 0.0,
      "learning_rate": 2.2049201779638837e-07,
      "logps/chosen": -192.57763671875,
      "logps/rejected": -132.3449249267578,
      "loss": 0.309,
      "rewards/chosen": 0.8033540844917297,
      "rewards/margins": 3.7982852458953857,
      "rewards/rejected": -2.994931221008301,
      "step": 2136
    },
    {
      "epoch": 0.56,
      "grad_norm": 30.890676498413086,
      "kl": 0.0,
      "learning_rate": 2.2036116199947656e-07,
      "logps/chosen": -285.72076416015625,
      "logps/rejected": -222.05982971191406,
      "loss": 0.2203,
      "rewards/chosen": 2.0986266136169434,
      "rewards/margins": 6.601768970489502,
      "rewards/rejected": -4.503142356872559,
      "step": 2137
    },
    {
      "epoch": 0.56,
      "grad_norm": 38.768924713134766,
      "kl": 0.0,
      "learning_rate": 2.2023030620256479e-07,
      "logps/chosen": -226.85989379882812,
      "logps/rejected": -310.0096740722656,
      "loss": 0.2693,
      "rewards/chosen": -0.08474735170602798,
      "rewards/margins": 4.027078628540039,
      "rewards/rejected": -4.111825942993164,
      "step": 2138
    },
    {
      "epoch": 0.56,
      "grad_norm": 34.64961242675781,
      "kl": 0.0,
      "learning_rate": 2.2009945040565295e-07,
      "logps/chosen": -316.72369384765625,
      "logps/rejected": -335.4961242675781,
      "loss": 0.2341,
      "rewards/chosen": 1.2335299253463745,
      "rewards/margins": 3.577603816986084,
      "rewards/rejected": -2.344074010848999,
      "step": 2139
    },
    {
      "epoch": 0.56,
      "grad_norm": 31.96160316467285,
      "kl": 0.0,
      "learning_rate": 2.1996859460874115e-07,
      "logps/chosen": -184.9946746826172,
      "logps/rejected": -189.07225036621094,
      "loss": 0.1965,
      "rewards/chosen": 2.3256113529205322,
      "rewards/margins": 5.768891334533691,
      "rewards/rejected": -3.44327974319458,
      "step": 2140
    },
    {
      "epoch": 0.56,
      "grad_norm": 32.98160171508789,
      "kl": 0.0,
      "learning_rate": 2.1983773881182935e-07,
      "logps/chosen": -187.4897003173828,
      "logps/rejected": -235.82472229003906,
      "loss": 0.2849,
      "rewards/chosen": 0.5114994049072266,
      "rewards/margins": 3.3172054290771484,
      "rewards/rejected": -2.805706024169922,
      "step": 2141
    },
    {
      "epoch": 0.56,
      "grad_norm": 25.43107795715332,
      "kl": 0.0,
      "learning_rate": 2.1970688301491754e-07,
      "logps/chosen": -217.7528839111328,
      "logps/rejected": -294.7716979980469,
      "loss": 0.1704,
      "rewards/chosen": 1.8277029991149902,
      "rewards/margins": 5.8614821434021,
      "rewards/rejected": -4.033779144287109,
      "step": 2142
    },
    {
      "epoch": 0.56,
      "grad_norm": 32.24444580078125,
      "kl": 0.0,
      "learning_rate": 2.1957602721800574e-07,
      "logps/chosen": -240.55447387695312,
      "logps/rejected": -229.03414916992188,
      "loss": 0.3044,
      "rewards/chosen": -0.8243075609207153,
      "rewards/margins": 2.479674816131592,
      "rewards/rejected": -3.3039822578430176,
      "step": 2143
    },
    {
      "epoch": 0.56,
      "grad_norm": 39.09797668457031,
      "kl": 0.0,
      "learning_rate": 2.1944517142109396e-07,
      "logps/chosen": -180.61624145507812,
      "logps/rejected": -211.71670532226562,
      "loss": 0.3454,
      "rewards/chosen": 0.7784052491188049,
      "rewards/margins": 2.5519487857818604,
      "rewards/rejected": -1.7735434770584106,
      "step": 2144
    },
    {
      "epoch": 0.56,
      "grad_norm": 22.509733200073242,
      "kl": 0.0,
      "learning_rate": 2.1931431562418216e-07,
      "logps/chosen": -154.87033081054688,
      "logps/rejected": -232.0869903564453,
      "loss": 0.3044,
      "rewards/chosen": 1.8187867403030396,
      "rewards/margins": 5.667130470275879,
      "rewards/rejected": -3.848343849182129,
      "step": 2145
    },
    {
      "epoch": 0.56,
      "grad_norm": 40.67682647705078,
      "kl": 0.0,
      "learning_rate": 2.1918345982727035e-07,
      "logps/chosen": -254.24398803710938,
      "logps/rejected": -272.7815246582031,
      "loss": 0.2777,
      "rewards/chosen": 2.3531463146209717,
      "rewards/margins": 5.830312728881836,
      "rewards/rejected": -3.4771666526794434,
      "step": 2146
    },
    {
      "epoch": 0.56,
      "grad_norm": 30.30360984802246,
      "kl": 0.0,
      "learning_rate": 2.1905260403035852e-07,
      "logps/chosen": -165.32546997070312,
      "logps/rejected": -269.3461608886719,
      "loss": 0.2567,
      "rewards/chosen": 1.1869643926620483,
      "rewards/margins": 4.445183753967285,
      "rewards/rejected": -3.2582192420959473,
      "step": 2147
    },
    {
      "epoch": 0.56,
      "grad_norm": 42.13438415527344,
      "kl": 0.0,
      "learning_rate": 2.1892174823344672e-07,
      "logps/chosen": -197.6481170654297,
      "logps/rejected": -216.67141723632812,
      "loss": 0.3744,
      "rewards/chosen": 0.4272662401199341,
      "rewards/margins": 3.3712263107299805,
      "rewards/rejected": -2.943960189819336,
      "step": 2148
    },
    {
      "epoch": 0.56,
      "grad_norm": 37.454105377197266,
      "kl": 0.0,
      "learning_rate": 2.1879089243653492e-07,
      "logps/chosen": -222.81515502929688,
      "logps/rejected": -316.2408447265625,
      "loss": 0.3504,
      "rewards/chosen": -0.2285047173500061,
      "rewards/margins": 2.9904603958129883,
      "rewards/rejected": -3.2189650535583496,
      "step": 2149
    },
    {
      "epoch": 0.56,
      "grad_norm": 42.05679702758789,
      "kl": 0.0,
      "learning_rate": 2.186600366396231e-07,
      "logps/chosen": -171.63548278808594,
      "logps/rejected": -273.6712341308594,
      "loss": 0.2923,
      "rewards/chosen": 1.511362075805664,
      "rewards/margins": 4.281908988952637,
      "rewards/rejected": -2.7705469131469727,
      "step": 2150
    },
    {
      "epoch": 0.56,
      "grad_norm": 37.90392303466797,
      "kl": 0.0,
      "learning_rate": 2.1852918084271133e-07,
      "logps/chosen": -242.9505615234375,
      "logps/rejected": -183.59271240234375,
      "loss": 0.2654,
      "rewards/chosen": 0.4527307450771332,
      "rewards/margins": 2.6310439109802246,
      "rewards/rejected": -2.1783132553100586,
      "step": 2151
    },
    {
      "epoch": 0.56,
      "grad_norm": 33.639251708984375,
      "kl": 0.0,
      "learning_rate": 2.1839832504579953e-07,
      "logps/chosen": -163.69761657714844,
      "logps/rejected": -191.52780151367188,
      "loss": 0.2613,
      "rewards/chosen": 1.0045702457427979,
      "rewards/margins": 3.827319860458374,
      "rewards/rejected": -2.822749614715576,
      "step": 2152
    },
    {
      "epoch": 0.56,
      "grad_norm": 33.028106689453125,
      "kl": 0.0,
      "learning_rate": 2.1826746924888773e-07,
      "logps/chosen": -192.94677734375,
      "logps/rejected": -217.50164794921875,
      "loss": 0.339,
      "rewards/chosen": 0.38689547777175903,
      "rewards/margins": 3.0231199264526367,
      "rewards/rejected": -2.6362245082855225,
      "step": 2153
    },
    {
      "epoch": 0.56,
      "grad_norm": 27.515506744384766,
      "kl": 0.0,
      "learning_rate": 2.1813661345197592e-07,
      "logps/chosen": -169.6422119140625,
      "logps/rejected": -232.17977905273438,
      "loss": 0.3014,
      "rewards/chosen": 0.4662325084209442,
      "rewards/margins": 4.453876495361328,
      "rewards/rejected": -3.9876441955566406,
      "step": 2154
    },
    {
      "epoch": 0.56,
      "grad_norm": 34.339027404785156,
      "kl": 0.0,
      "learning_rate": 2.180057576550641e-07,
      "logps/chosen": -213.93386840820312,
      "logps/rejected": -202.92160034179688,
      "loss": 0.3042,
      "rewards/chosen": 2.6573381423950195,
      "rewards/margins": 5.09763240814209,
      "rewards/rejected": -2.4402945041656494,
      "step": 2155
    },
    {
      "epoch": 0.56,
      "grad_norm": 28.567813873291016,
      "kl": 0.0,
      "learning_rate": 2.178749018581523e-07,
      "logps/chosen": -205.0366973876953,
      "logps/rejected": -156.337646484375,
      "loss": 0.1926,
      "rewards/chosen": 1.1674195528030396,
      "rewards/margins": 3.3237528800964355,
      "rewards/rejected": -2.1563334465026855,
      "step": 2156
    },
    {
      "epoch": 0.56,
      "grad_norm": 34.4302864074707,
      "kl": 0.0,
      "learning_rate": 2.177440460612405e-07,
      "logps/chosen": -169.4579315185547,
      "logps/rejected": -209.5000457763672,
      "loss": 0.2903,
      "rewards/chosen": -0.6242586374282837,
      "rewards/margins": 4.042919635772705,
      "rewards/rejected": -4.667178153991699,
      "step": 2157
    },
    {
      "epoch": 0.56,
      "grad_norm": 30.366474151611328,
      "kl": 0.0,
      "learning_rate": 2.176131902643287e-07,
      "logps/chosen": -273.88360595703125,
      "logps/rejected": -211.7207794189453,
      "loss": 0.2764,
      "rewards/chosen": 1.9383443593978882,
      "rewards/margins": 4.192324161529541,
      "rewards/rejected": -2.2539799213409424,
      "step": 2158
    },
    {
      "epoch": 0.57,
      "grad_norm": 32.51573181152344,
      "kl": 0.0,
      "learning_rate": 2.174823344674169e-07,
      "logps/chosen": -298.5272216796875,
      "logps/rejected": -213.78758239746094,
      "loss": 0.2315,
      "rewards/chosen": 2.326655626296997,
      "rewards/margins": 5.482968330383301,
      "rewards/rejected": -3.1563124656677246,
      "step": 2159
    },
    {
      "epoch": 0.57,
      "grad_norm": 21.78364372253418,
      "kl": 0.0,
      "learning_rate": 2.173514786705051e-07,
      "logps/chosen": -234.16297912597656,
      "logps/rejected": -159.34295654296875,
      "loss": 0.2376,
      "rewards/chosen": 0.8480652570724487,
      "rewards/margins": 6.228734016418457,
      "rewards/rejected": -5.380668640136719,
      "step": 2160
    },
    {
      "epoch": 0.57,
      "grad_norm": 34.532020568847656,
      "kl": 0.0,
      "learning_rate": 2.172206228735933e-07,
      "logps/chosen": -224.65281677246094,
      "logps/rejected": -256.1437683105469,
      "loss": 0.216,
      "rewards/chosen": 2.745820999145508,
      "rewards/margins": 6.271589279174805,
      "rewards/rejected": -3.525768280029297,
      "step": 2161
    },
    {
      "epoch": 0.57,
      "grad_norm": 40.12411117553711,
      "kl": 0.0,
      "learning_rate": 2.1708976707668146e-07,
      "logps/chosen": -147.53533935546875,
      "logps/rejected": -218.46353149414062,
      "loss": 0.2874,
      "rewards/chosen": 0.935888946056366,
      "rewards/margins": 2.4985463619232178,
      "rewards/rejected": -1.5626574754714966,
      "step": 2162
    },
    {
      "epoch": 0.57,
      "grad_norm": 33.51218795776367,
      "kl": 0.0,
      "learning_rate": 2.1695891127976966e-07,
      "logps/chosen": -185.3155059814453,
      "logps/rejected": -369.5777893066406,
      "loss": 0.1757,
      "rewards/chosen": 1.2122989892959595,
      "rewards/margins": 4.9565558433532715,
      "rewards/rejected": -3.7442567348480225,
      "step": 2163
    },
    {
      "epoch": 0.57,
      "grad_norm": 39.717586517333984,
      "kl": 0.0,
      "learning_rate": 2.1682805548285788e-07,
      "logps/chosen": -229.6589813232422,
      "logps/rejected": -243.6190948486328,
      "loss": 0.2637,
      "rewards/chosen": 0.4361046850681305,
      "rewards/margins": 4.181896209716797,
      "rewards/rejected": -3.745791435241699,
      "step": 2164
    },
    {
      "epoch": 0.57,
      "grad_norm": 41.20941162109375,
      "kl": 0.0,
      "learning_rate": 2.1669719968594608e-07,
      "logps/chosen": -254.28562927246094,
      "logps/rejected": -211.9898681640625,
      "loss": 0.2955,
      "rewards/chosen": 1.7442975044250488,
      "rewards/margins": 4.571514129638672,
      "rewards/rejected": -2.827216386795044,
      "step": 2165
    },
    {
      "epoch": 0.57,
      "grad_norm": 178.1888885498047,
      "kl": 0.0,
      "learning_rate": 2.1656634388903428e-07,
      "logps/chosen": -139.78665161132812,
      "logps/rejected": -312.6597595214844,
      "loss": 0.3272,
      "rewards/chosen": 1.3665469884872437,
      "rewards/margins": 4.133112907409668,
      "rewards/rejected": -2.7665657997131348,
      "step": 2166
    },
    {
      "epoch": 0.57,
      "grad_norm": 33.31022644042969,
      "kl": 0.0,
      "learning_rate": 2.1643548809212247e-07,
      "logps/chosen": -225.217529296875,
      "logps/rejected": -217.964111328125,
      "loss": 0.287,
      "rewards/chosen": -0.30057814717292786,
      "rewards/margins": 2.33773136138916,
      "rewards/rejected": -2.6383094787597656,
      "step": 2167
    },
    {
      "epoch": 0.57,
      "grad_norm": 27.65751075744629,
      "kl": 0.0,
      "learning_rate": 2.1630463229521067e-07,
      "logps/chosen": -194.6341552734375,
      "logps/rejected": -216.58169555664062,
      "loss": 0.2897,
      "rewards/chosen": 1.8614850044250488,
      "rewards/margins": 5.929368019104004,
      "rewards/rejected": -4.067883014678955,
      "step": 2168
    },
    {
      "epoch": 0.57,
      "grad_norm": 35.12147903442383,
      "kl": 0.0,
      "learning_rate": 2.1617377649829886e-07,
      "logps/chosen": -152.90708923339844,
      "logps/rejected": -152.86331176757812,
      "loss": 0.3569,
      "rewards/chosen": -0.22007495164871216,
      "rewards/margins": 1.8580245971679688,
      "rewards/rejected": -2.078099489212036,
      "step": 2169
    },
    {
      "epoch": 0.57,
      "grad_norm": 34.59441375732422,
      "kl": 0.0,
      "learning_rate": 2.1604292070138706e-07,
      "logps/chosen": -160.6201171875,
      "logps/rejected": -378.0250244140625,
      "loss": 0.2663,
      "rewards/chosen": 2.978060245513916,
      "rewards/margins": 10.298988342285156,
      "rewards/rejected": -7.320927619934082,
      "step": 2170
    },
    {
      "epoch": 0.57,
      "grad_norm": 27.22705078125,
      "kl": 0.0,
      "learning_rate": 2.1591206490447526e-07,
      "logps/chosen": -199.74655151367188,
      "logps/rejected": -290.22235107421875,
      "loss": 0.1602,
      "rewards/chosen": 1.652679681777954,
      "rewards/margins": 6.4878082275390625,
      "rewards/rejected": -4.8351287841796875,
      "step": 2171
    },
    {
      "epoch": 0.57,
      "grad_norm": 31.888912200927734,
      "kl": 0.0,
      "learning_rate": 2.1578120910756345e-07,
      "logps/chosen": -136.07327270507812,
      "logps/rejected": -275.01251220703125,
      "loss": 0.2782,
      "rewards/chosen": 0.604078471660614,
      "rewards/margins": 6.338095188140869,
      "rewards/rejected": -5.7340168952941895,
      "step": 2172
    },
    {
      "epoch": 0.57,
      "grad_norm": 32.569190979003906,
      "kl": 0.0,
      "learning_rate": 2.1565035331065165e-07,
      "logps/chosen": -200.0724639892578,
      "logps/rejected": -308.7223205566406,
      "loss": 0.2741,
      "rewards/chosen": 0.7700318098068237,
      "rewards/margins": 5.290283679962158,
      "rewards/rejected": -4.520251750946045,
      "step": 2173
    },
    {
      "epoch": 0.57,
      "grad_norm": 29.394424438476562,
      "kl": 0.0,
      "learning_rate": 2.1551949751373984e-07,
      "logps/chosen": -261.6470031738281,
      "logps/rejected": -275.6170654296875,
      "loss": 0.3277,
      "rewards/chosen": -1.3419867753982544,
      "rewards/margins": 2.240203857421875,
      "rewards/rejected": -3.58219051361084,
      "step": 2174
    },
    {
      "epoch": 0.57,
      "grad_norm": 28.1448974609375,
      "kl": 0.0,
      "learning_rate": 2.1538864171682804e-07,
      "logps/chosen": -219.25546264648438,
      "logps/rejected": -211.60226440429688,
      "loss": 0.2081,
      "rewards/chosen": 1.6857521533966064,
      "rewards/margins": 4.913581371307373,
      "rewards/rejected": -3.2278292179107666,
      "step": 2175
    },
    {
      "epoch": 0.57,
      "grad_norm": 32.16535568237305,
      "kl": 0.0,
      "learning_rate": 2.1525778591991626e-07,
      "logps/chosen": -209.88577270507812,
      "logps/rejected": -337.0541687011719,
      "loss": 0.274,
      "rewards/chosen": 0.017229488119482994,
      "rewards/margins": 3.5651438236236572,
      "rewards/rejected": -3.5479142665863037,
      "step": 2176
    },
    {
      "epoch": 0.57,
      "grad_norm": 31.019987106323242,
      "kl": 0.0,
      "learning_rate": 2.1512693012300446e-07,
      "logps/chosen": -179.34695434570312,
      "logps/rejected": -221.03477478027344,
      "loss": 0.3261,
      "rewards/chosen": 1.2182724475860596,
      "rewards/margins": 5.014984607696533,
      "rewards/rejected": -3.7967121601104736,
      "step": 2177
    },
    {
      "epoch": 0.57,
      "grad_norm": 28.094982147216797,
      "kl": 0.0,
      "learning_rate": 2.1499607432609263e-07,
      "logps/chosen": -180.26943969726562,
      "logps/rejected": -301.4582214355469,
      "loss": 0.2488,
      "rewards/chosen": 1.3258321285247803,
      "rewards/margins": 5.93510627746582,
      "rewards/rejected": -4.609274387359619,
      "step": 2178
    },
    {
      "epoch": 0.57,
      "grad_norm": 30.352922439575195,
      "kl": 0.0,
      "learning_rate": 2.1486521852918082e-07,
      "logps/chosen": -268.167724609375,
      "logps/rejected": -183.42550659179688,
      "loss": 0.2504,
      "rewards/chosen": 1.3395893573760986,
      "rewards/margins": 4.956480026245117,
      "rewards/rejected": -3.6168904304504395,
      "step": 2179
    },
    {
      "epoch": 0.57,
      "grad_norm": 36.48234558105469,
      "kl": 0.0,
      "learning_rate": 2.1473436273226902e-07,
      "logps/chosen": -190.11546325683594,
      "logps/rejected": -230.5013427734375,
      "loss": 0.2733,
      "rewards/chosen": 0.42728573083877563,
      "rewards/margins": 3.968291759490967,
      "rewards/rejected": -3.541006088256836,
      "step": 2180
    },
    {
      "epoch": 0.57,
      "grad_norm": 31.570514678955078,
      "kl": 0.0,
      "learning_rate": 2.1460350693535722e-07,
      "logps/chosen": -185.64596557617188,
      "logps/rejected": -236.3667755126953,
      "loss": 0.2253,
      "rewards/chosen": 2.1138195991516113,
      "rewards/margins": 5.250305652618408,
      "rewards/rejected": -3.136486053466797,
      "step": 2181
    },
    {
      "epoch": 0.57,
      "grad_norm": 38.71629333496094,
      "kl": 0.0,
      "learning_rate": 2.144726511384454e-07,
      "logps/chosen": -242.28211975097656,
      "logps/rejected": -348.03997802734375,
      "loss": 0.3644,
      "rewards/chosen": 0.37175941467285156,
      "rewards/margins": 7.486160755157471,
      "rewards/rejected": -7.114401340484619,
      "step": 2182
    },
    {
      "epoch": 0.57,
      "grad_norm": 43.70787811279297,
      "kl": 0.0,
      "learning_rate": 2.1434179534153364e-07,
      "logps/chosen": -220.95281982421875,
      "logps/rejected": -279.92919921875,
      "loss": 0.2103,
      "rewards/chosen": 1.4428505897521973,
      "rewards/margins": 4.366604804992676,
      "rewards/rejected": -2.9237544536590576,
      "step": 2183
    },
    {
      "epoch": 0.57,
      "grad_norm": 37.34977340698242,
      "kl": 0.0,
      "learning_rate": 2.1421093954462183e-07,
      "logps/chosen": -203.84178161621094,
      "logps/rejected": -306.4324035644531,
      "loss": 0.2319,
      "rewards/chosen": 1.7929118871688843,
      "rewards/margins": 4.468847274780273,
      "rewards/rejected": -2.6759352684020996,
      "step": 2184
    },
    {
      "epoch": 0.57,
      "grad_norm": 37.7314453125,
      "kl": 0.0,
      "learning_rate": 2.1408008374771003e-07,
      "logps/chosen": -202.56298828125,
      "logps/rejected": -220.33523559570312,
      "loss": 0.2206,
      "rewards/chosen": 0.2738282084465027,
      "rewards/margins": 4.876184463500977,
      "rewards/rejected": -4.602356433868408,
      "step": 2185
    },
    {
      "epoch": 0.57,
      "grad_norm": 34.27370834350586,
      "kl": 0.0,
      "learning_rate": 2.139492279507982e-07,
      "logps/chosen": -282.90570068359375,
      "logps/rejected": -265.1341857910156,
      "loss": 0.1865,
      "rewards/chosen": 1.703295111656189,
      "rewards/margins": 6.923501491546631,
      "rewards/rejected": -5.220206260681152,
      "step": 2186
    },
    {
      "epoch": 0.57,
      "grad_norm": 52.453433990478516,
      "kl": 0.0,
      "learning_rate": 2.138183721538864e-07,
      "logps/chosen": -167.38584899902344,
      "logps/rejected": -235.28759765625,
      "loss": 0.3206,
      "rewards/chosen": -0.025245457887649536,
      "rewards/margins": 3.191519021987915,
      "rewards/rejected": -3.216764450073242,
      "step": 2187
    },
    {
      "epoch": 0.57,
      "grad_norm": 28.034582138061523,
      "kl": 0.0,
      "learning_rate": 2.136875163569746e-07,
      "logps/chosen": -178.97332763671875,
      "logps/rejected": -241.8062744140625,
      "loss": 0.3562,
      "rewards/chosen": 0.7335941791534424,
      "rewards/margins": 5.105332374572754,
      "rewards/rejected": -4.371738433837891,
      "step": 2188
    },
    {
      "epoch": 0.57,
      "grad_norm": 75.6514892578125,
      "kl": 0.0,
      "learning_rate": 2.135566605600628e-07,
      "logps/chosen": -157.3307647705078,
      "logps/rejected": -202.5512237548828,
      "loss": 0.3532,
      "rewards/chosen": 0.16009896993637085,
      "rewards/margins": 2.7601494789123535,
      "rewards/rejected": -2.600050449371338,
      "step": 2189
    },
    {
      "epoch": 0.57,
      "grad_norm": 26.94536590576172,
      "kl": 0.0,
      "learning_rate": 2.13425804763151e-07,
      "logps/chosen": -181.12144470214844,
      "logps/rejected": -358.65509033203125,
      "loss": 0.1763,
      "rewards/chosen": 1.8456650972366333,
      "rewards/margins": 6.388722896575928,
      "rewards/rejected": -4.543057918548584,
      "step": 2190
    },
    {
      "epoch": 0.57,
      "grad_norm": 25.052820205688477,
      "kl": 0.0,
      "learning_rate": 2.132949489662392e-07,
      "logps/chosen": -138.30601501464844,
      "logps/rejected": -200.51475524902344,
      "loss": 0.2248,
      "rewards/chosen": 1.732397437095642,
      "rewards/margins": 4.560910224914551,
      "rewards/rejected": -2.828512668609619,
      "step": 2191
    },
    {
      "epoch": 0.57,
      "grad_norm": 32.18205261230469,
      "kl": 0.0,
      "learning_rate": 2.131640931693274e-07,
      "logps/chosen": -245.55789184570312,
      "logps/rejected": -193.5570526123047,
      "loss": 0.3338,
      "rewards/chosen": 0.15273427963256836,
      "rewards/margins": 2.8544297218322754,
      "rewards/rejected": -2.701695442199707,
      "step": 2192
    },
    {
      "epoch": 0.57,
      "grad_norm": 22.67527961730957,
      "kl": 0.0,
      "learning_rate": 2.1303323737241557e-07,
      "logps/chosen": -179.4833984375,
      "logps/rejected": -201.1123046875,
      "loss": 0.1737,
      "rewards/chosen": 1.5341203212738037,
      "rewards/margins": 4.5851240158081055,
      "rewards/rejected": -3.0510034561157227,
      "step": 2193
    },
    {
      "epoch": 0.57,
      "grad_norm": 40.20901870727539,
      "kl": 0.0,
      "learning_rate": 2.1290238157550377e-07,
      "logps/chosen": -282.07647705078125,
      "logps/rejected": -189.77468872070312,
      "loss": 0.279,
      "rewards/chosen": 0.2572477459907532,
      "rewards/margins": 2.514106273651123,
      "rewards/rejected": -2.2568585872650146,
      "step": 2194
    },
    {
      "epoch": 0.57,
      "grad_norm": 36.39990997314453,
      "kl": 0.0,
      "learning_rate": 2.12771525778592e-07,
      "logps/chosen": -169.90203857421875,
      "logps/rejected": -300.26422119140625,
      "loss": 0.2349,
      "rewards/chosen": 0.17301256954669952,
      "rewards/margins": 3.594301462173462,
      "rewards/rejected": -3.4212889671325684,
      "step": 2195
    },
    {
      "epoch": 0.57,
      "grad_norm": 31.77029037475586,
      "kl": 0.0,
      "learning_rate": 2.1264066998168018e-07,
      "logps/chosen": -203.70204162597656,
      "logps/rejected": -237.15872192382812,
      "loss": 0.3168,
      "rewards/chosen": 1.2523151636123657,
      "rewards/margins": 4.519637584686279,
      "rewards/rejected": -3.267322301864624,
      "step": 2196
    },
    {
      "epoch": 0.57,
      "grad_norm": 30.283166885375977,
      "kl": 0.0,
      "learning_rate": 2.1250981418476838e-07,
      "logps/chosen": -234.54457092285156,
      "logps/rejected": -201.38150024414062,
      "loss": 0.2492,
      "rewards/chosen": 0.1242559552192688,
      "rewards/margins": 3.404097318649292,
      "rewards/rejected": -3.279841423034668,
      "step": 2197
    },
    {
      "epoch": 0.58,
      "grad_norm": 28.874860763549805,
      "kl": 0.0,
      "learning_rate": 2.1237895838785658e-07,
      "logps/chosen": -146.4224853515625,
      "logps/rejected": -255.65232849121094,
      "loss": 0.2861,
      "rewards/chosen": 0.9600822925567627,
      "rewards/margins": 4.900790214538574,
      "rewards/rejected": -3.9407076835632324,
      "step": 2198
    },
    {
      "epoch": 0.58,
      "grad_norm": 26.083675384521484,
      "kl": 0.0,
      "learning_rate": 2.1224810259094477e-07,
      "logps/chosen": -209.79185485839844,
      "logps/rejected": -204.64401245117188,
      "loss": 0.2222,
      "rewards/chosen": 1.7219187021255493,
      "rewards/margins": 3.7728919982910156,
      "rewards/rejected": -2.050973415374756,
      "step": 2199
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.4384765625,
      "kl": 0.0,
      "learning_rate": 2.1211724679403297e-07,
      "logps/chosen": -203.7540740966797,
      "logps/rejected": -280.116943359375,
      "loss": 0.2361,
      "rewards/chosen": -0.42424073815345764,
      "rewards/margins": 3.4924893379211426,
      "rewards/rejected": -3.9167301654815674,
      "step": 2200
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.34817123413086,
      "kl": 0.0,
      "learning_rate": 2.1198639099712114e-07,
      "logps/chosen": -266.0620422363281,
      "logps/rejected": -350.2867736816406,
      "loss": 0.2504,
      "rewards/chosen": 0.5245301127433777,
      "rewards/margins": 4.969491004943848,
      "rewards/rejected": -4.444961071014404,
      "step": 2201
    },
    {
      "epoch": 0.58,
      "grad_norm": 38.560787200927734,
      "kl": 0.0,
      "learning_rate": 2.1185553520020936e-07,
      "logps/chosen": -145.8126678466797,
      "logps/rejected": -275.6151428222656,
      "loss": 0.3191,
      "rewards/chosen": 0.6639471054077148,
      "rewards/margins": 3.947199583053589,
      "rewards/rejected": -3.283252477645874,
      "step": 2202
    },
    {
      "epoch": 0.58,
      "grad_norm": 23.77583885192871,
      "kl": 0.0,
      "learning_rate": 2.1172467940329756e-07,
      "logps/chosen": -166.97357177734375,
      "logps/rejected": -223.1553955078125,
      "loss": 0.1399,
      "rewards/chosen": 1.8763564825057983,
      "rewards/margins": 6.32912540435791,
      "rewards/rejected": -4.452768802642822,
      "step": 2203
    },
    {
      "epoch": 0.58,
      "grad_norm": 36.36481475830078,
      "kl": 0.0,
      "learning_rate": 2.1159382360638575e-07,
      "logps/chosen": -203.16331481933594,
      "logps/rejected": -351.1559753417969,
      "loss": 0.2323,
      "rewards/chosen": -0.09685803204774857,
      "rewards/margins": 5.4982428550720215,
      "rewards/rejected": -5.5951008796691895,
      "step": 2204
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.4608268737793,
      "kl": 0.0,
      "learning_rate": 2.1146296780947395e-07,
      "logps/chosen": -220.57008361816406,
      "logps/rejected": -289.4773864746094,
      "loss": 0.2413,
      "rewards/chosen": 1.1138266324996948,
      "rewards/margins": 4.565395355224609,
      "rewards/rejected": -3.451568603515625,
      "step": 2205
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.523155212402344,
      "kl": 0.0,
      "learning_rate": 2.1133211201256215e-07,
      "logps/chosen": -156.62518310546875,
      "logps/rejected": -215.77374267578125,
      "loss": 0.2555,
      "rewards/chosen": 1.4320130348205566,
      "rewards/margins": 3.965759754180908,
      "rewards/rejected": -2.5337467193603516,
      "step": 2206
    },
    {
      "epoch": 0.58,
      "grad_norm": 42.77104187011719,
      "kl": 0.0,
      "learning_rate": 2.1120125621565034e-07,
      "logps/chosen": -197.8491668701172,
      "logps/rejected": -234.12548828125,
      "loss": 0.2988,
      "rewards/chosen": 0.7398046851158142,
      "rewards/margins": 2.2821764945983887,
      "rewards/rejected": -1.5423718690872192,
      "step": 2207
    },
    {
      "epoch": 0.58,
      "grad_norm": 37.442527770996094,
      "kl": 0.0,
      "learning_rate": 2.1107040041873856e-07,
      "logps/chosen": -237.7860870361328,
      "logps/rejected": -215.8834228515625,
      "loss": 0.2311,
      "rewards/chosen": 1.64774751663208,
      "rewards/margins": 5.702475070953369,
      "rewards/rejected": -4.054727554321289,
      "step": 2208
    },
    {
      "epoch": 0.58,
      "grad_norm": 30.224994659423828,
      "kl": 0.0,
      "learning_rate": 2.1093954462182673e-07,
      "logps/chosen": -165.15650939941406,
      "logps/rejected": -310.42462158203125,
      "loss": 0.2603,
      "rewards/chosen": 1.1862459182739258,
      "rewards/margins": 5.600722312927246,
      "rewards/rejected": -4.41447639465332,
      "step": 2209
    },
    {
      "epoch": 0.58,
      "grad_norm": 38.65983963012695,
      "kl": 0.0,
      "learning_rate": 2.1080868882491493e-07,
      "logps/chosen": -196.13844299316406,
      "logps/rejected": -230.19146728515625,
      "loss": 0.2714,
      "rewards/chosen": 2.045654773712158,
      "rewards/margins": 4.847431182861328,
      "rewards/rejected": -2.801776647567749,
      "step": 2210
    },
    {
      "epoch": 0.58,
      "grad_norm": 33.665855407714844,
      "kl": 0.0,
      "learning_rate": 2.1067783302800313e-07,
      "logps/chosen": -227.00308227539062,
      "logps/rejected": -192.12783813476562,
      "loss": 0.2827,
      "rewards/chosen": 1.3021548986434937,
      "rewards/margins": 4.660031318664551,
      "rewards/rejected": -3.3578763008117676,
      "step": 2211
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.888275146484375,
      "kl": 0.0,
      "learning_rate": 2.1054697723109132e-07,
      "logps/chosen": -205.8463592529297,
      "logps/rejected": -176.98809814453125,
      "loss": 0.2552,
      "rewards/chosen": 0.521612286567688,
      "rewards/margins": 2.937107563018799,
      "rewards/rejected": -2.4154953956604004,
      "step": 2212
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.640281677246094,
      "kl": 0.0,
      "learning_rate": 2.1041612143417952e-07,
      "logps/chosen": -234.514892578125,
      "logps/rejected": -269.9208679199219,
      "loss": 0.2382,
      "rewards/chosen": 1.1150953769683838,
      "rewards/margins": 4.122707366943359,
      "rewards/rejected": -3.0076119899749756,
      "step": 2213
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.11532211303711,
      "kl": 0.0,
      "learning_rate": 2.1028526563726774e-07,
      "logps/chosen": -192.67202758789062,
      "logps/rejected": -212.2769775390625,
      "loss": 0.2851,
      "rewards/chosen": -0.08396945893764496,
      "rewards/margins": 3.1026346683502197,
      "rewards/rejected": -3.1866040229797363,
      "step": 2214
    },
    {
      "epoch": 0.58,
      "grad_norm": 42.883541107177734,
      "kl": 0.0,
      "learning_rate": 2.1015440984035594e-07,
      "logps/chosen": -241.93801879882812,
      "logps/rejected": -286.9732971191406,
      "loss": 0.2737,
      "rewards/chosen": 1.6893227100372314,
      "rewards/margins": 5.431354522705078,
      "rewards/rejected": -3.7420315742492676,
      "step": 2215
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.18726348876953,
      "kl": 0.0,
      "learning_rate": 2.1002355404344413e-07,
      "logps/chosen": -153.22897338867188,
      "logps/rejected": -150.74411010742188,
      "loss": 0.3609,
      "rewards/chosen": 1.0502197742462158,
      "rewards/margins": 2.500394821166992,
      "rewards/rejected": -1.4501750469207764,
      "step": 2216
    },
    {
      "epoch": 0.58,
      "grad_norm": 38.212318420410156,
      "kl": 0.0,
      "learning_rate": 2.098926982465323e-07,
      "logps/chosen": -224.86199951171875,
      "logps/rejected": -308.0025329589844,
      "loss": 0.1965,
      "rewards/chosen": 0.44962361454963684,
      "rewards/margins": 4.539242744445801,
      "rewards/rejected": -4.089619159698486,
      "step": 2217
    },
    {
      "epoch": 0.58,
      "grad_norm": 32.787559509277344,
      "kl": 0.0,
      "learning_rate": 2.097618424496205e-07,
      "logps/chosen": -277.6907043457031,
      "logps/rejected": -224.82205200195312,
      "loss": 0.2155,
      "rewards/chosen": 0.506578803062439,
      "rewards/margins": 3.952627182006836,
      "rewards/rejected": -3.4460482597351074,
      "step": 2218
    },
    {
      "epoch": 0.58,
      "grad_norm": 31.50032615661621,
      "kl": 0.0,
      "learning_rate": 2.096309866527087e-07,
      "logps/chosen": -219.48257446289062,
      "logps/rejected": -213.10658264160156,
      "loss": 0.3219,
      "rewards/chosen": 0.9474495649337769,
      "rewards/margins": 2.4071855545043945,
      "rewards/rejected": -1.4597361087799072,
      "step": 2219
    },
    {
      "epoch": 0.58,
      "grad_norm": 36.32145309448242,
      "kl": 0.0,
      "learning_rate": 2.095001308557969e-07,
      "logps/chosen": -225.48365783691406,
      "logps/rejected": -273.248291015625,
      "loss": 0.225,
      "rewards/chosen": 0.33812087774276733,
      "rewards/margins": 3.649228811264038,
      "rewards/rejected": -3.311107873916626,
      "step": 2220
    },
    {
      "epoch": 0.58,
      "grad_norm": 38.5352668762207,
      "kl": 0.0,
      "learning_rate": 2.0936927505888511e-07,
      "logps/chosen": -219.2425537109375,
      "logps/rejected": -274.7395935058594,
      "loss": 0.2791,
      "rewards/chosen": 1.2792561054229736,
      "rewards/margins": 5.127982139587402,
      "rewards/rejected": -3.8487257957458496,
      "step": 2221
    },
    {
      "epoch": 0.58,
      "grad_norm": 41.40312576293945,
      "kl": 0.0,
      "learning_rate": 2.092384192619733e-07,
      "logps/chosen": -229.9542694091797,
      "logps/rejected": -304.9787902832031,
      "loss": 0.2319,
      "rewards/chosen": 0.703386664390564,
      "rewards/margins": 4.285565376281738,
      "rewards/rejected": -3.582178831100464,
      "step": 2222
    },
    {
      "epoch": 0.58,
      "grad_norm": 34.01308059692383,
      "kl": 0.0,
      "learning_rate": 2.091075634650615e-07,
      "logps/chosen": -156.47169494628906,
      "logps/rejected": -347.48370361328125,
      "loss": 0.2302,
      "rewards/chosen": 1.8110793828964233,
      "rewards/margins": 3.668882369995117,
      "rewards/rejected": -1.8578031063079834,
      "step": 2223
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.65562057495117,
      "kl": 0.0,
      "learning_rate": 2.0897670766814968e-07,
      "logps/chosen": -155.8760528564453,
      "logps/rejected": -218.9156036376953,
      "loss": 0.2815,
      "rewards/chosen": 0.4598958194255829,
      "rewards/margins": 3.565948963165283,
      "rewards/rejected": -3.106053113937378,
      "step": 2224
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.85246658325195,
      "kl": 0.0,
      "learning_rate": 2.0884585187123787e-07,
      "logps/chosen": -196.15066528320312,
      "logps/rejected": -293.7716064453125,
      "loss": 0.2994,
      "rewards/chosen": 0.30416417121887207,
      "rewards/margins": 3.709951162338257,
      "rewards/rejected": -3.4057869911193848,
      "step": 2225
    },
    {
      "epoch": 0.58,
      "grad_norm": 31.45633316040039,
      "kl": 0.0,
      "learning_rate": 2.0871499607432607e-07,
      "logps/chosen": -161.6018829345703,
      "logps/rejected": -231.9564971923828,
      "loss": 0.204,
      "rewards/chosen": 1.5219181776046753,
      "rewards/margins": 5.138897895812988,
      "rewards/rejected": -3.6169795989990234,
      "step": 2226
    },
    {
      "epoch": 0.58,
      "grad_norm": 27.005146026611328,
      "kl": 0.0,
      "learning_rate": 2.085841402774143e-07,
      "logps/chosen": -170.5829620361328,
      "logps/rejected": -275.69659423828125,
      "loss": 0.2103,
      "rewards/chosen": 2.6731762886047363,
      "rewards/margins": 7.106757640838623,
      "rewards/rejected": -4.433581352233887,
      "step": 2227
    },
    {
      "epoch": 0.58,
      "grad_norm": 26.06833839416504,
      "kl": 0.0,
      "learning_rate": 2.0845328448050249e-07,
      "logps/chosen": -215.9841766357422,
      "logps/rejected": -279.5541076660156,
      "loss": 0.1581,
      "rewards/chosen": 1.9819355010986328,
      "rewards/margins": 6.945269584655762,
      "rewards/rejected": -4.963334083557129,
      "step": 2228
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.79971694946289,
      "kl": 0.0,
      "learning_rate": 2.0832242868359068e-07,
      "logps/chosen": -252.36244201660156,
      "logps/rejected": -285.28302001953125,
      "loss": 0.2275,
      "rewards/chosen": 1.6426401138305664,
      "rewards/margins": 4.779829502105713,
      "rewards/rejected": -3.1371893882751465,
      "step": 2229
    },
    {
      "epoch": 0.58,
      "grad_norm": 50.33610534667969,
      "kl": 0.0,
      "learning_rate": 2.0819157288667888e-07,
      "logps/chosen": -287.94281005859375,
      "logps/rejected": -286.266845703125,
      "loss": 0.2573,
      "rewards/chosen": -0.2679663300514221,
      "rewards/margins": 2.9511818885803223,
      "rewards/rejected": -3.2191481590270996,
      "step": 2230
    },
    {
      "epoch": 0.58,
      "grad_norm": 27.209035873413086,
      "kl": 0.0,
      "learning_rate": 2.0806071708976707e-07,
      "logps/chosen": -179.55047607421875,
      "logps/rejected": -257.8847961425781,
      "loss": 0.1428,
      "rewards/chosen": 2.861096143722534,
      "rewards/margins": 6.1882853507995605,
      "rewards/rejected": -3.3271892070770264,
      "step": 2231
    },
    {
      "epoch": 0.58,
      "grad_norm": 26.88276481628418,
      "kl": 0.0,
      "learning_rate": 2.0792986129285524e-07,
      "logps/chosen": -169.83990478515625,
      "logps/rejected": -234.80967712402344,
      "loss": 0.2285,
      "rewards/chosen": 1.3872361183166504,
      "rewards/margins": 7.181230545043945,
      "rewards/rejected": -5.793994426727295,
      "step": 2232
    },
    {
      "epoch": 0.58,
      "grad_norm": 35.116703033447266,
      "kl": 0.0,
      "learning_rate": 2.0779900549594344e-07,
      "logps/chosen": -237.92538452148438,
      "logps/rejected": -313.69866943359375,
      "loss": 0.2332,
      "rewards/chosen": -0.5640676021575928,
      "rewards/margins": 2.326479911804199,
      "rewards/rejected": -2.890547513961792,
      "step": 2233
    },
    {
      "epoch": 0.58,
      "grad_norm": 32.42366409301758,
      "kl": 0.0,
      "learning_rate": 2.0766814969903166e-07,
      "logps/chosen": -123.78561401367188,
      "logps/rejected": -258.0790710449219,
      "loss": 0.1687,
      "rewards/chosen": 0.20091545581817627,
      "rewards/margins": 4.391180038452148,
      "rewards/rejected": -4.190264701843262,
      "step": 2234
    },
    {
      "epoch": 0.58,
      "grad_norm": 38.087501525878906,
      "kl": 0.0,
      "learning_rate": 2.0753729390211986e-07,
      "logps/chosen": -235.76138305664062,
      "logps/rejected": -244.61502075195312,
      "loss": 0.3038,
      "rewards/chosen": 0.21202373504638672,
      "rewards/margins": 3.9693703651428223,
      "rewards/rejected": -3.7573466300964355,
      "step": 2235
    },
    {
      "epoch": 0.59,
      "grad_norm": 34.855560302734375,
      "kl": 0.0,
      "learning_rate": 2.0740643810520806e-07,
      "logps/chosen": -259.66522216796875,
      "logps/rejected": -245.32571411132812,
      "loss": 0.2637,
      "rewards/chosen": 0.4846804141998291,
      "rewards/margins": 4.76972770690918,
      "rewards/rejected": -4.2850470542907715,
      "step": 2236
    },
    {
      "epoch": 0.59,
      "grad_norm": 40.834163665771484,
      "kl": 0.0,
      "learning_rate": 2.0727558230829625e-07,
      "logps/chosen": -293.44512939453125,
      "logps/rejected": -210.43310546875,
      "loss": 0.3469,
      "rewards/chosen": 2.0567681789398193,
      "rewards/margins": 3.828779935836792,
      "rewards/rejected": -1.7720117568969727,
      "step": 2237
    },
    {
      "epoch": 0.59,
      "grad_norm": 32.50837707519531,
      "kl": 0.0,
      "learning_rate": 2.0714472651138445e-07,
      "logps/chosen": -228.298828125,
      "logps/rejected": -189.87283325195312,
      "loss": 0.2022,
      "rewards/chosen": 2.2769954204559326,
      "rewards/margins": 5.409682273864746,
      "rewards/rejected": -3.1326866149902344,
      "step": 2238
    },
    {
      "epoch": 0.59,
      "grad_norm": 34.75202178955078,
      "kl": 0.0,
      "learning_rate": 2.0701387071447264e-07,
      "logps/chosen": -229.86451721191406,
      "logps/rejected": -253.40927124023438,
      "loss": 0.3198,
      "rewards/chosen": 0.4948378801345825,
      "rewards/margins": 3.6758928298950195,
      "rewards/rejected": -3.1810548305511475,
      "step": 2239
    },
    {
      "epoch": 0.59,
      "grad_norm": 36.177547454833984,
      "kl": 0.0,
      "learning_rate": 2.0688301491756084e-07,
      "logps/chosen": -188.50875854492188,
      "logps/rejected": -309.8052978515625,
      "loss": 0.282,
      "rewards/chosen": 0.8006499409675598,
      "rewards/margins": 4.804440975189209,
      "rewards/rejected": -4.003790855407715,
      "step": 2240
    },
    {
      "epoch": 0.59,
      "grad_norm": 27.196758270263672,
      "kl": 0.0,
      "learning_rate": 2.0675215912064904e-07,
      "logps/chosen": -162.847900390625,
      "logps/rejected": -280.6136474609375,
      "loss": 0.3561,
      "rewards/chosen": 0.6857805252075195,
      "rewards/margins": 5.029175758361816,
      "rewards/rejected": -4.343395233154297,
      "step": 2241
    },
    {
      "epoch": 0.59,
      "grad_norm": 40.71586608886719,
      "kl": 0.0,
      "learning_rate": 2.0662130332373723e-07,
      "logps/chosen": -257.04296875,
      "logps/rejected": -187.94483947753906,
      "loss": 0.2425,
      "rewards/chosen": 1.0367754697799683,
      "rewards/margins": 4.446749687194824,
      "rewards/rejected": -3.4099740982055664,
      "step": 2242
    },
    {
      "epoch": 0.59,
      "grad_norm": 32.89750289916992,
      "kl": 0.0,
      "learning_rate": 2.0649044752682543e-07,
      "logps/chosen": -270.7828063964844,
      "logps/rejected": -268.9897155761719,
      "loss": 0.1506,
      "rewards/chosen": 2.786309242248535,
      "rewards/margins": 7.044668674468994,
      "rewards/rejected": -4.258359432220459,
      "step": 2243
    },
    {
      "epoch": 0.59,
      "grad_norm": 36.8209228515625,
      "kl": 0.0,
      "learning_rate": 2.0635959172991362e-07,
      "logps/chosen": -205.2930908203125,
      "logps/rejected": -207.3105010986328,
      "loss": 0.2875,
      "rewards/chosen": 1.5179622173309326,
      "rewards/margins": 3.5570497512817383,
      "rewards/rejected": -2.0390875339508057,
      "step": 2244
    },
    {
      "epoch": 0.59,
      "grad_norm": 36.65214157104492,
      "kl": 0.0,
      "learning_rate": 2.0622873593300182e-07,
      "logps/chosen": -158.63861083984375,
      "logps/rejected": -217.1534881591797,
      "loss": 0.3151,
      "rewards/chosen": 0.019611716270446777,
      "rewards/margins": 2.943315029144287,
      "rewards/rejected": -2.92370343208313,
      "step": 2245
    },
    {
      "epoch": 0.59,
      "grad_norm": 40.02116394042969,
      "kl": 0.0,
      "learning_rate": 2.0609788013609004e-07,
      "logps/chosen": -249.88970947265625,
      "logps/rejected": -151.15518188476562,
      "loss": 0.3755,
      "rewards/chosen": 0.8100465536117554,
      "rewards/margins": 2.0503296852111816,
      "rewards/rejected": -1.2402830123901367,
      "step": 2246
    },
    {
      "epoch": 0.59,
      "grad_norm": 35.16876983642578,
      "kl": 0.0,
      "learning_rate": 2.059670243391782e-07,
      "logps/chosen": -226.23460388183594,
      "logps/rejected": -245.07582092285156,
      "loss": 0.2016,
      "rewards/chosen": 1.5717494487762451,
      "rewards/margins": 5.3078742027282715,
      "rewards/rejected": -3.7361247539520264,
      "step": 2247
    },
    {
      "epoch": 0.59,
      "grad_norm": 30.50265121459961,
      "kl": 0.0,
      "learning_rate": 2.058361685422664e-07,
      "logps/chosen": -155.54959106445312,
      "logps/rejected": -251.41653442382812,
      "loss": 0.2158,
      "rewards/chosen": 1.5379118919372559,
      "rewards/margins": 5.863900184631348,
      "rewards/rejected": -4.325988292694092,
      "step": 2248
    },
    {
      "epoch": 0.59,
      "grad_norm": 45.763301849365234,
      "kl": 0.0,
      "learning_rate": 2.057053127453546e-07,
      "logps/chosen": -204.8267059326172,
      "logps/rejected": -216.93017578125,
      "loss": 0.2251,
      "rewards/chosen": 1.2222058773040771,
      "rewards/margins": 4.166966438293457,
      "rewards/rejected": -2.944760322570801,
      "step": 2249
    },
    {
      "epoch": 0.59,
      "grad_norm": 35.689453125,
      "kl": 0.0,
      "learning_rate": 2.055744569484428e-07,
      "logps/chosen": -167.92544555664062,
      "logps/rejected": -189.067626953125,
      "loss": 0.2259,
      "rewards/chosen": 0.4054737687110901,
      "rewards/margins": 4.531358242034912,
      "rewards/rejected": -4.125884532928467,
      "step": 2250
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.52212905883789,
      "kl": 0.0,
      "learning_rate": 2.05443601151531e-07,
      "logps/chosen": -227.6565704345703,
      "logps/rejected": -263.6648864746094,
      "loss": 0.1781,
      "rewards/chosen": 1.7398179769515991,
      "rewards/margins": 4.553890228271484,
      "rewards/rejected": -2.8140721321105957,
      "step": 2251
    },
    {
      "epoch": 0.59,
      "grad_norm": 26.91353988647461,
      "kl": 0.0,
      "learning_rate": 2.053127453546192e-07,
      "logps/chosen": -145.8267059326172,
      "logps/rejected": -270.70635986328125,
      "loss": 0.2029,
      "rewards/chosen": 1.10588800907135,
      "rewards/margins": 4.696566104888916,
      "rewards/rejected": -3.5906779766082764,
      "step": 2252
    },
    {
      "epoch": 0.59,
      "grad_norm": 33.017311096191406,
      "kl": 0.0,
      "learning_rate": 2.0518188955770742e-07,
      "logps/chosen": -204.77932739257812,
      "logps/rejected": -308.2210388183594,
      "loss": 0.2999,
      "rewards/chosen": 1.018227219581604,
      "rewards/margins": 4.7906084060668945,
      "rewards/rejected": -3.77238130569458,
      "step": 2253
    },
    {
      "epoch": 0.59,
      "grad_norm": 28.2861385345459,
      "kl": 0.0,
      "learning_rate": 2.050510337607956e-07,
      "logps/chosen": -233.28012084960938,
      "logps/rejected": -170.07479858398438,
      "loss": 0.4263,
      "rewards/chosen": -0.36268797516822815,
      "rewards/margins": 2.6240553855895996,
      "rewards/rejected": -2.986743450164795,
      "step": 2254
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.228620529174805,
      "kl": 0.0,
      "learning_rate": 2.0492017796388378e-07,
      "logps/chosen": -257.2540588378906,
      "logps/rejected": -267.53515625,
      "loss": 0.1912,
      "rewards/chosen": 0.64061039686203,
      "rewards/margins": 3.424734592437744,
      "rewards/rejected": -2.7841241359710693,
      "step": 2255
    },
    {
      "epoch": 0.59,
      "grad_norm": 35.21196365356445,
      "kl": 0.0,
      "learning_rate": 2.0478932216697198e-07,
      "logps/chosen": -213.58702087402344,
      "logps/rejected": -329.76788330078125,
      "loss": 0.2916,
      "rewards/chosen": 0.6111934185028076,
      "rewards/margins": 4.495480060577393,
      "rewards/rejected": -3.884286642074585,
      "step": 2256
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.973970413208008,
      "kl": 0.0,
      "learning_rate": 2.0465846637006017e-07,
      "logps/chosen": -239.40626525878906,
      "logps/rejected": -191.99514770507812,
      "loss": 0.2833,
      "rewards/chosen": -0.32032090425491333,
      "rewards/margins": 3.3377461433410645,
      "rewards/rejected": -3.658066987991333,
      "step": 2257
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.78082275390625,
      "kl": 0.0,
      "learning_rate": 2.0452761057314837e-07,
      "logps/chosen": -224.99295043945312,
      "logps/rejected": -208.2244873046875,
      "loss": 0.4009,
      "rewards/chosen": -0.342149019241333,
      "rewards/margins": 2.974942922592163,
      "rewards/rejected": -3.317091941833496,
      "step": 2258
    },
    {
      "epoch": 0.59,
      "grad_norm": 40.125362396240234,
      "kl": 0.0,
      "learning_rate": 2.043967547762366e-07,
      "logps/chosen": -142.40603637695312,
      "logps/rejected": -232.1035614013672,
      "loss": 0.3002,
      "rewards/chosen": 0.41182827949523926,
      "rewards/margins": 5.9469099044799805,
      "rewards/rejected": -5.53508186340332,
      "step": 2259
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.44727325439453,
      "kl": 0.0,
      "learning_rate": 2.042658989793248e-07,
      "logps/chosen": -238.10951232910156,
      "logps/rejected": -203.90293884277344,
      "loss": 0.2593,
      "rewards/chosen": 0.28709545731544495,
      "rewards/margins": 2.862297296524048,
      "rewards/rejected": -2.5752017498016357,
      "step": 2260
    },
    {
      "epoch": 0.59,
      "grad_norm": 40.94384002685547,
      "kl": 0.0,
      "learning_rate": 2.0413504318241298e-07,
      "logps/chosen": -169.60580444335938,
      "logps/rejected": -287.79888916015625,
      "loss": 0.4053,
      "rewards/chosen": -0.45726338028907776,
      "rewards/margins": 2.6300411224365234,
      "rewards/rejected": -3.0873045921325684,
      "step": 2261
    },
    {
      "epoch": 0.59,
      "grad_norm": 29.562114715576172,
      "kl": 0.0,
      "learning_rate": 2.0400418738550118e-07,
      "logps/chosen": -239.27877807617188,
      "logps/rejected": -378.2849426269531,
      "loss": 0.2454,
      "rewards/chosen": 0.4851676821708679,
      "rewards/margins": 4.73510217666626,
      "rewards/rejected": -4.249934673309326,
      "step": 2262
    },
    {
      "epoch": 0.59,
      "grad_norm": 47.51166915893555,
      "kl": 0.0,
      "learning_rate": 2.0387333158858935e-07,
      "logps/chosen": -213.38778686523438,
      "logps/rejected": -257.33331298828125,
      "loss": 0.3026,
      "rewards/chosen": 0.39499297738075256,
      "rewards/margins": 3.191692590713501,
      "rewards/rejected": -2.7966995239257812,
      "step": 2263
    },
    {
      "epoch": 0.59,
      "grad_norm": 32.65146255493164,
      "kl": 0.0,
      "learning_rate": 2.0374247579167755e-07,
      "logps/chosen": -215.3774871826172,
      "logps/rejected": -231.47328186035156,
      "loss": 0.3043,
      "rewards/chosen": 0.690306544303894,
      "rewards/margins": 2.42452335357666,
      "rewards/rejected": -1.7342166900634766,
      "step": 2264
    },
    {
      "epoch": 0.59,
      "grad_norm": 30.549049377441406,
      "kl": 0.0,
      "learning_rate": 2.0361161999476574e-07,
      "logps/chosen": -220.37583923339844,
      "logps/rejected": -260.5372009277344,
      "loss": 0.1832,
      "rewards/chosen": -0.4377664029598236,
      "rewards/margins": 4.060056686401367,
      "rewards/rejected": -4.497823238372803,
      "step": 2265
    },
    {
      "epoch": 0.59,
      "grad_norm": 29.487773895263672,
      "kl": 0.0,
      "learning_rate": 2.0348076419785396e-07,
      "logps/chosen": -219.22384643554688,
      "logps/rejected": -278.33453369140625,
      "loss": 0.2964,
      "rewards/chosen": 1.3252283334732056,
      "rewards/margins": 5.023862361907959,
      "rewards/rejected": -3.698634147644043,
      "step": 2266
    },
    {
      "epoch": 0.59,
      "grad_norm": 31.48894500732422,
      "kl": 0.0,
      "learning_rate": 2.0334990840094216e-07,
      "logps/chosen": -219.73208618164062,
      "logps/rejected": -219.90045166015625,
      "loss": 0.1944,
      "rewards/chosen": 1.8473029136657715,
      "rewards/margins": 6.5499396324157715,
      "rewards/rejected": -4.70263671875,
      "step": 2267
    },
    {
      "epoch": 0.59,
      "grad_norm": 23.981107711791992,
      "kl": 0.0,
      "learning_rate": 2.0321905260403036e-07,
      "logps/chosen": -133.35617065429688,
      "logps/rejected": -190.30067443847656,
      "loss": 0.33,
      "rewards/chosen": -0.8549145460128784,
      "rewards/margins": 3.315229892730713,
      "rewards/rejected": -4.170144557952881,
      "step": 2268
    },
    {
      "epoch": 0.59,
      "grad_norm": 38.23117446899414,
      "kl": 0.0,
      "learning_rate": 2.0308819680711855e-07,
      "logps/chosen": -198.3517303466797,
      "logps/rejected": -327.96453857421875,
      "loss": 0.2805,
      "rewards/chosen": 0.14711612462997437,
      "rewards/margins": 4.454745769500732,
      "rewards/rejected": -4.307629585266113,
      "step": 2269
    },
    {
      "epoch": 0.59,
      "grad_norm": 34.60736846923828,
      "kl": 0.0,
      "learning_rate": 2.0295734101020675e-07,
      "logps/chosen": -182.74720764160156,
      "logps/rejected": -207.50497436523438,
      "loss": 0.2981,
      "rewards/chosen": 0.09414029121398926,
      "rewards/margins": 3.436382293701172,
      "rewards/rejected": -3.3422420024871826,
      "step": 2270
    },
    {
      "epoch": 0.59,
      "grad_norm": 42.404842376708984,
      "kl": 0.0,
      "learning_rate": 2.0282648521329492e-07,
      "logps/chosen": -246.59584045410156,
      "logps/rejected": -236.692626953125,
      "loss": 0.2859,
      "rewards/chosen": 0.38779035210609436,
      "rewards/margins": 3.6365997791290283,
      "rewards/rejected": -3.248809337615967,
      "step": 2271
    },
    {
      "epoch": 0.59,
      "grad_norm": 32.34096145629883,
      "kl": 0.0,
      "learning_rate": 2.0269562941638314e-07,
      "logps/chosen": -184.75596618652344,
      "logps/rejected": -225.25875854492188,
      "loss": 0.2885,
      "rewards/chosen": 0.9209032654762268,
      "rewards/margins": 3.5063042640686035,
      "rewards/rejected": -2.5854010581970215,
      "step": 2272
    },
    {
      "epoch": 0.59,
      "grad_norm": 20.64876365661621,
      "kl": 0.0,
      "learning_rate": 2.0256477361947134e-07,
      "logps/chosen": -146.1011505126953,
      "logps/rejected": -244.64822387695312,
      "loss": 0.1834,
      "rewards/chosen": 2.905881643295288,
      "rewards/margins": 6.3737945556640625,
      "rewards/rejected": -3.4679126739501953,
      "step": 2273
    },
    {
      "epoch": 0.6,
      "grad_norm": 33.63536071777344,
      "kl": 0.0,
      "learning_rate": 2.0243391782255953e-07,
      "logps/chosen": -235.8003692626953,
      "logps/rejected": -336.51385498046875,
      "loss": 0.2576,
      "rewards/chosen": 0.5231467485427856,
      "rewards/margins": 7.412076950073242,
      "rewards/rejected": -6.888930320739746,
      "step": 2274
    },
    {
      "epoch": 0.6,
      "grad_norm": 34.075950622558594,
      "kl": 0.0,
      "learning_rate": 2.0230306202564773e-07,
      "logps/chosen": -211.6470489501953,
      "logps/rejected": -185.71539306640625,
      "loss": 0.3364,
      "rewards/chosen": 1.6632918119430542,
      "rewards/margins": 5.770527362823486,
      "rewards/rejected": -4.107235431671143,
      "step": 2275
    },
    {
      "epoch": 0.6,
      "grad_norm": 25.605819702148438,
      "kl": 0.0,
      "learning_rate": 2.0217220622873593e-07,
      "logps/chosen": -247.4192352294922,
      "logps/rejected": -231.5600128173828,
      "loss": 0.2677,
      "rewards/chosen": -0.2584896385669708,
      "rewards/margins": 4.03928804397583,
      "rewards/rejected": -4.2977776527404785,
      "step": 2276
    },
    {
      "epoch": 0.6,
      "grad_norm": 42.00531768798828,
      "kl": 0.0,
      "learning_rate": 2.0204135043182412e-07,
      "logps/chosen": -300.18609619140625,
      "logps/rejected": -263.7876281738281,
      "loss": 0.2941,
      "rewards/chosen": 1.2072327136993408,
      "rewards/margins": 2.713994026184082,
      "rewards/rejected": -1.5067613124847412,
      "step": 2277
    },
    {
      "epoch": 0.6,
      "grad_norm": 31.236480712890625,
      "kl": 0.0,
      "learning_rate": 2.019104946349123e-07,
      "logps/chosen": -231.35574340820312,
      "logps/rejected": -279.8404541015625,
      "loss": 0.2957,
      "rewards/chosen": 0.4912111163139343,
      "rewards/margins": 6.064578056335449,
      "rewards/rejected": -5.573367118835449,
      "step": 2278
    },
    {
      "epoch": 0.6,
      "grad_norm": 30.456981658935547,
      "kl": 0.0,
      "learning_rate": 2.0177963883800051e-07,
      "logps/chosen": -208.600830078125,
      "logps/rejected": -197.56190490722656,
      "loss": 0.1894,
      "rewards/chosen": 1.867599368095398,
      "rewards/margins": 5.389187812805176,
      "rewards/rejected": -3.5215883255004883,
      "step": 2279
    },
    {
      "epoch": 0.6,
      "grad_norm": 36.0406494140625,
      "kl": 0.0,
      "learning_rate": 2.016487830410887e-07,
      "logps/chosen": -174.555908203125,
      "logps/rejected": -345.35638427734375,
      "loss": 0.2134,
      "rewards/chosen": -0.6918312907218933,
      "rewards/margins": 5.157019138336182,
      "rewards/rejected": -5.848850250244141,
      "step": 2280
    },
    {
      "epoch": 0.6,
      "grad_norm": 52.155052185058594,
      "kl": 0.0,
      "learning_rate": 2.015179272441769e-07,
      "logps/chosen": -164.63284301757812,
      "logps/rejected": -272.5175476074219,
      "loss": 0.279,
      "rewards/chosen": 0.6520876884460449,
      "rewards/margins": 5.025417327880859,
      "rewards/rejected": -4.3733296394348145,
      "step": 2281
    },
    {
      "epoch": 0.6,
      "grad_norm": 46.86626052856445,
      "kl": 0.0,
      "learning_rate": 2.013870714472651e-07,
      "logps/chosen": -207.02198791503906,
      "logps/rejected": -338.878662109375,
      "loss": 0.2458,
      "rewards/chosen": 1.6476094722747803,
      "rewards/margins": 5.5855560302734375,
      "rewards/rejected": -3.937946319580078,
      "step": 2282
    },
    {
      "epoch": 0.6,
      "grad_norm": 35.65882110595703,
      "kl": 0.0,
      "learning_rate": 2.012562156503533e-07,
      "logps/chosen": -238.85012817382812,
      "logps/rejected": -276.264404296875,
      "loss": 0.3135,
      "rewards/chosen": 0.41609618067741394,
      "rewards/margins": 4.50074577331543,
      "rewards/rejected": -4.084649562835693,
      "step": 2283
    },
    {
      "epoch": 0.6,
      "grad_norm": 40.120540618896484,
      "kl": 0.0,
      "learning_rate": 2.011253598534415e-07,
      "logps/chosen": -206.02651977539062,
      "logps/rejected": -254.01666259765625,
      "loss": 0.2503,
      "rewards/chosen": 1.6650097370147705,
      "rewards/margins": 4.307703971862793,
      "rewards/rejected": -2.6426944732666016,
      "step": 2284
    },
    {
      "epoch": 0.6,
      "grad_norm": 28.992889404296875,
      "kl": 0.0,
      "learning_rate": 2.0099450405652972e-07,
      "logps/chosen": -196.9144287109375,
      "logps/rejected": -263.31854248046875,
      "loss": 0.291,
      "rewards/chosen": 1.4108424186706543,
      "rewards/margins": 3.198729991912842,
      "rewards/rejected": -1.787887454032898,
      "step": 2285
    },
    {
      "epoch": 0.6,
      "grad_norm": 29.407325744628906,
      "kl": 0.0,
      "learning_rate": 2.0086364825961789e-07,
      "logps/chosen": -188.19052124023438,
      "logps/rejected": -263.819580078125,
      "loss": 0.3178,
      "rewards/chosen": 1.2455238103866577,
      "rewards/margins": 5.1608991622924805,
      "rewards/rejected": -3.915375232696533,
      "step": 2286
    },
    {
      "epoch": 0.6,
      "grad_norm": 29.475479125976562,
      "kl": 0.0,
      "learning_rate": 2.0073279246270608e-07,
      "logps/chosen": -224.15408325195312,
      "logps/rejected": -250.49986267089844,
      "loss": 0.2832,
      "rewards/chosen": 1.654449462890625,
      "rewards/margins": 3.692563056945801,
      "rewards/rejected": -2.038113594055176,
      "step": 2287
    },
    {
      "epoch": 0.6,
      "grad_norm": 26.579605102539062,
      "kl": 0.0,
      "learning_rate": 2.0060193666579428e-07,
      "logps/chosen": -244.87388610839844,
      "logps/rejected": -297.46636962890625,
      "loss": 0.2238,
      "rewards/chosen": 1.6434001922607422,
      "rewards/margins": 6.1101579666137695,
      "rewards/rejected": -4.466757774353027,
      "step": 2288
    },
    {
      "epoch": 0.6,
      "grad_norm": 33.45051574707031,
      "kl": 0.0,
      "learning_rate": 2.0047108086888247e-07,
      "logps/chosen": -131.8313751220703,
      "logps/rejected": -243.05029296875,
      "loss": 0.2313,
      "rewards/chosen": -0.7130619287490845,
      "rewards/margins": 2.145439624786377,
      "rewards/rejected": -2.858501434326172,
      "step": 2289
    },
    {
      "epoch": 0.6,
      "grad_norm": 38.82516098022461,
      "kl": 0.0,
      "learning_rate": 2.0034022507197067e-07,
      "logps/chosen": -207.3368377685547,
      "logps/rejected": -229.88768005371094,
      "loss": 0.2461,
      "rewards/chosen": 1.289283037185669,
      "rewards/margins": 5.446189880371094,
      "rewards/rejected": -4.156907081604004,
      "step": 2290
    },
    {
      "epoch": 0.6,
      "grad_norm": 34.82200622558594,
      "kl": 0.0,
      "learning_rate": 2.002093692750589e-07,
      "logps/chosen": -95.67045593261719,
      "logps/rejected": -228.51051330566406,
      "loss": 0.2927,
      "rewards/chosen": 0.8008228540420532,
      "rewards/margins": 4.647127151489258,
      "rewards/rejected": -3.846304178237915,
      "step": 2291
    },
    {
      "epoch": 0.6,
      "grad_norm": 39.77212905883789,
      "kl": 0.0,
      "learning_rate": 2.000785134781471e-07,
      "logps/chosen": -177.02243041992188,
      "logps/rejected": -266.8466491699219,
      "loss": 0.3203,
      "rewards/chosen": 0.5170644521713257,
      "rewards/margins": 3.983919143676758,
      "rewards/rejected": -3.4668545722961426,
      "step": 2292
    },
    {
      "epoch": 0.6,
      "grad_norm": 31.12018394470215,
      "kl": 0.0,
      "learning_rate": 1.9994765768123529e-07,
      "logps/chosen": -187.1675567626953,
      "logps/rejected": -226.5849151611328,
      "loss": 0.2171,
      "rewards/chosen": 1.33591628074646,
      "rewards/margins": 5.126616477966309,
      "rewards/rejected": -3.7907004356384277,
      "step": 2293
    },
    {
      "epoch": 0.6,
      "grad_norm": 64.28606414794922,
      "kl": 0.0,
      "learning_rate": 1.9981680188432345e-07,
      "logps/chosen": -211.73049926757812,
      "logps/rejected": -318.67529296875,
      "loss": 0.256,
      "rewards/chosen": 0.35816943645477295,
      "rewards/margins": 4.0642313957214355,
      "rewards/rejected": -3.706062078475952,
      "step": 2294
    },
    {
      "epoch": 0.6,
      "grad_norm": 38.446563720703125,
      "kl": 0.0,
      "learning_rate": 1.9968594608741165e-07,
      "logps/chosen": -170.37831115722656,
      "logps/rejected": -272.9222412109375,
      "loss": 0.2997,
      "rewards/chosen": -0.15325644612312317,
      "rewards/margins": 1.2989284992218018,
      "rewards/rejected": -1.4521849155426025,
      "step": 2295
    },
    {
      "epoch": 0.6,
      "grad_norm": 40.77598571777344,
      "kl": 0.0,
      "learning_rate": 1.9955509029049985e-07,
      "logps/chosen": -204.7364501953125,
      "logps/rejected": -204.11572265625,
      "loss": 0.2454,
      "rewards/chosen": 1.5838898420333862,
      "rewards/margins": 3.9927096366882324,
      "rewards/rejected": -2.4088196754455566,
      "step": 2296
    },
    {
      "epoch": 0.6,
      "grad_norm": 34.579124450683594,
      "kl": 0.0,
      "learning_rate": 1.9942423449358804e-07,
      "logps/chosen": -203.11920166015625,
      "logps/rejected": -258.780029296875,
      "loss": 0.3125,
      "rewards/chosen": 1.4499505758285522,
      "rewards/margins": 5.605946063995361,
      "rewards/rejected": -4.1559953689575195,
      "step": 2297
    },
    {
      "epoch": 0.6,
      "grad_norm": 31.835281372070312,
      "kl": 0.0,
      "learning_rate": 1.9929337869667627e-07,
      "logps/chosen": -191.6736297607422,
      "logps/rejected": -239.4393768310547,
      "loss": 0.2598,
      "rewards/chosen": 1.3314719200134277,
      "rewards/margins": 5.094294548034668,
      "rewards/rejected": -3.762822389602661,
      "step": 2298
    },
    {
      "epoch": 0.6,
      "grad_norm": 32.93226623535156,
      "kl": 0.0,
      "learning_rate": 1.9916252289976446e-07,
      "logps/chosen": -257.39154052734375,
      "logps/rejected": -298.38360595703125,
      "loss": 0.2827,
      "rewards/chosen": 2.54595685005188,
      "rewards/margins": 6.505262851715088,
      "rewards/rejected": -3.959306001663208,
      "step": 2299
    },
    {
      "epoch": 0.6,
      "grad_norm": 35.539306640625,
      "kl": 0.0,
      "learning_rate": 1.9903166710285266e-07,
      "logps/chosen": -237.4660186767578,
      "logps/rejected": -189.1026153564453,
      "loss": 0.2667,
      "rewards/chosen": 1.5426915884017944,
      "rewards/margins": 4.236473083496094,
      "rewards/rejected": -2.6937813758850098,
      "step": 2300
    },
    {
      "epoch": 0.6,
      "grad_norm": 26.92068099975586,
      "kl": 0.0,
      "learning_rate": 1.9890081130594085e-07,
      "logps/chosen": -242.7715301513672,
      "logps/rejected": -251.5889892578125,
      "loss": 0.2824,
      "rewards/chosen": -0.13105101883411407,
      "rewards/margins": 3.447944164276123,
      "rewards/rejected": -3.5789952278137207,
      "step": 2301
    },
    {
      "epoch": 0.6,
      "grad_norm": 29.411447525024414,
      "kl": 0.0,
      "learning_rate": 1.9876995550902902e-07,
      "logps/chosen": -185.21484375,
      "logps/rejected": -173.87359619140625,
      "loss": 0.2309,
      "rewards/chosen": 0.9786520004272461,
      "rewards/margins": 3.331249237060547,
      "rewards/rejected": -2.352597236633301,
      "step": 2302
    },
    {
      "epoch": 0.6,
      "grad_norm": 37.42856979370117,
      "kl": 0.0,
      "learning_rate": 1.9863909971211722e-07,
      "logps/chosen": -266.0807800292969,
      "logps/rejected": -315.8362731933594,
      "loss": 0.1693,
      "rewards/chosen": 0.9435846209526062,
      "rewards/margins": 5.589478969573975,
      "rewards/rejected": -4.645894527435303,
      "step": 2303
    },
    {
      "epoch": 0.6,
      "grad_norm": 32.103275299072266,
      "kl": 0.0,
      "learning_rate": 1.9850824391520544e-07,
      "logps/chosen": -232.0700225830078,
      "logps/rejected": -273.20611572265625,
      "loss": 0.2743,
      "rewards/chosen": -1.274770736694336,
      "rewards/margins": 3.538522720336914,
      "rewards/rejected": -4.81329345703125,
      "step": 2304
    },
    {
      "epoch": 0.6,
      "grad_norm": 41.43397521972656,
      "kl": 0.0,
      "learning_rate": 1.9837738811829364e-07,
      "logps/chosen": -148.38497924804688,
      "logps/rejected": -239.9488067626953,
      "loss": 0.3287,
      "rewards/chosen": 0.009937132708728313,
      "rewards/margins": 2.5099294185638428,
      "rewards/rejected": -2.4999923706054688,
      "step": 2305
    },
    {
      "epoch": 0.6,
      "grad_norm": 36.7087287902832,
      "kl": 0.0,
      "learning_rate": 1.9824653232138183e-07,
      "logps/chosen": -225.000732421875,
      "logps/rejected": -190.12754821777344,
      "loss": 0.3282,
      "rewards/chosen": -0.3766823410987854,
      "rewards/margins": 1.5658700466156006,
      "rewards/rejected": -1.9425524473190308,
      "step": 2306
    },
    {
      "epoch": 0.6,
      "grad_norm": 31.22111701965332,
      "kl": 0.0,
      "learning_rate": 1.9811567652447003e-07,
      "logps/chosen": -167.56187438964844,
      "logps/rejected": -208.11669921875,
      "loss": 0.3685,
      "rewards/chosen": 0.5481584668159485,
      "rewards/margins": 1.6451292037963867,
      "rewards/rejected": -1.096970796585083,
      "step": 2307
    },
    {
      "epoch": 0.6,
      "grad_norm": 49.5782585144043,
      "kl": 0.0,
      "learning_rate": 1.9798482072755823e-07,
      "logps/chosen": -189.76531982421875,
      "logps/rejected": -211.35142517089844,
      "loss": 0.3392,
      "rewards/chosen": 1.0844168663024902,
      "rewards/margins": 3.5620598793029785,
      "rewards/rejected": -2.4776430130004883,
      "step": 2308
    },
    {
      "epoch": 0.6,
      "grad_norm": 38.009700775146484,
      "kl": 0.0,
      "learning_rate": 1.978539649306464e-07,
      "logps/chosen": -301.82708740234375,
      "logps/rejected": -276.3048400878906,
      "loss": 0.2941,
      "rewards/chosen": 0.7866557240486145,
      "rewards/margins": 4.300216197967529,
      "rewards/rejected": -3.5135605335235596,
      "step": 2309
    },
    {
      "epoch": 0.6,
      "grad_norm": 38.04002380371094,
      "kl": 0.0,
      "learning_rate": 1.9772310913373462e-07,
      "logps/chosen": -222.57528686523438,
      "logps/rejected": -237.14932250976562,
      "loss": 0.272,
      "rewards/chosen": 0.6795270442962646,
      "rewards/margins": 4.367680072784424,
      "rewards/rejected": -3.688153028488159,
      "step": 2310
    },
    {
      "epoch": 0.6,
      "grad_norm": 34.9122428894043,
      "kl": 0.0,
      "learning_rate": 1.9759225333682281e-07,
      "logps/chosen": -175.81056213378906,
      "logps/rejected": -248.84698486328125,
      "loss": 0.1641,
      "rewards/chosen": 1.5451974868774414,
      "rewards/margins": 5.348091125488281,
      "rewards/rejected": -3.802893877029419,
      "step": 2311
    },
    {
      "epoch": 0.61,
      "grad_norm": 42.78825378417969,
      "kl": 0.0,
      "learning_rate": 1.97461397539911e-07,
      "logps/chosen": -278.53704833984375,
      "logps/rejected": -235.3387451171875,
      "loss": 0.2333,
      "rewards/chosen": 1.9742026329040527,
      "rewards/margins": 4.3761515617370605,
      "rewards/rejected": -2.401948928833008,
      "step": 2312
    },
    {
      "epoch": 0.61,
      "grad_norm": 33.957889556884766,
      "kl": 0.0,
      "learning_rate": 1.973305417429992e-07,
      "logps/chosen": -300.9579162597656,
      "logps/rejected": -196.88919067382812,
      "loss": 0.2687,
      "rewards/chosen": 0.5445224642753601,
      "rewards/margins": 4.595375061035156,
      "rewards/rejected": -4.0508527755737305,
      "step": 2313
    },
    {
      "epoch": 0.61,
      "grad_norm": 31.26093864440918,
      "kl": 0.0,
      "learning_rate": 1.971996859460874e-07,
      "logps/chosen": -226.60665893554688,
      "logps/rejected": -192.66175842285156,
      "loss": 0.2809,
      "rewards/chosen": 1.3789070844650269,
      "rewards/margins": 4.590993881225586,
      "rewards/rejected": -3.2120869159698486,
      "step": 2314
    },
    {
      "epoch": 0.61,
      "grad_norm": 33.976322174072266,
      "kl": 0.0,
      "learning_rate": 1.970688301491756e-07,
      "logps/chosen": -286.7447509765625,
      "logps/rejected": -288.9273681640625,
      "loss": 0.3294,
      "rewards/chosen": 0.5929345488548279,
      "rewards/margins": 3.9306373596191406,
      "rewards/rejected": -3.337702751159668,
      "step": 2315
    },
    {
      "epoch": 0.61,
      "grad_norm": 31.508800506591797,
      "kl": 0.0,
      "learning_rate": 1.969379743522638e-07,
      "logps/chosen": -205.0119171142578,
      "logps/rejected": -276.708984375,
      "loss": 0.1806,
      "rewards/chosen": 1.152121901512146,
      "rewards/margins": 4.042749404907227,
      "rewards/rejected": -2.89062762260437,
      "step": 2316
    },
    {
      "epoch": 0.61,
      "grad_norm": 32.762855529785156,
      "kl": 0.0,
      "learning_rate": 1.96807118555352e-07,
      "logps/chosen": -133.92018127441406,
      "logps/rejected": -246.65771484375,
      "loss": 0.2742,
      "rewards/chosen": 0.08014148473739624,
      "rewards/margins": 2.6316933631896973,
      "rewards/rejected": -2.5515518188476562,
      "step": 2317
    },
    {
      "epoch": 0.61,
      "grad_norm": 26.48649787902832,
      "kl": 0.0,
      "learning_rate": 1.966762627584402e-07,
      "logps/chosen": -232.65948486328125,
      "logps/rejected": -216.8899383544922,
      "loss": 0.3232,
      "rewards/chosen": 0.629813551902771,
      "rewards/margins": 3.5861639976501465,
      "rewards/rejected": -2.956350564956665,
      "step": 2318
    },
    {
      "epoch": 0.61,
      "grad_norm": 39.5434684753418,
      "kl": 0.0,
      "learning_rate": 1.9654540696152838e-07,
      "logps/chosen": -326.10894775390625,
      "logps/rejected": -340.63677978515625,
      "loss": 0.2668,
      "rewards/chosen": 1.9403249025344849,
      "rewards/margins": 4.059950828552246,
      "rewards/rejected": -2.1196258068084717,
      "step": 2319
    },
    {
      "epoch": 0.61,
      "grad_norm": 29.929338455200195,
      "kl": 0.0,
      "learning_rate": 1.9641455116461658e-07,
      "logps/chosen": -156.9414520263672,
      "logps/rejected": -169.06515502929688,
      "loss": 0.193,
      "rewards/chosen": -0.8299055695533752,
      "rewards/margins": 2.2383029460906982,
      "rewards/rejected": -3.0682084560394287,
      "step": 2320
    },
    {
      "epoch": 0.61,
      "grad_norm": 36.10740661621094,
      "kl": 0.0,
      "learning_rate": 1.9628369536770478e-07,
      "logps/chosen": -198.9758758544922,
      "logps/rejected": -279.46466064453125,
      "loss": 0.2819,
      "rewards/chosen": 1.3619959354400635,
      "rewards/margins": 5.403970718383789,
      "rewards/rejected": -4.0419745445251465,
      "step": 2321
    },
    {
      "epoch": 0.61,
      "grad_norm": 39.26609420776367,
      "kl": 0.0,
      "learning_rate": 1.9615283957079297e-07,
      "logps/chosen": -182.7283477783203,
      "logps/rejected": -257.07550048828125,
      "loss": 0.3292,
      "rewards/chosen": 0.010081393644213676,
      "rewards/margins": 3.4077489376068115,
      "rewards/rejected": -3.397667646408081,
      "step": 2322
    },
    {
      "epoch": 0.61,
      "grad_norm": 30.399250030517578,
      "kl": 0.0,
      "learning_rate": 1.960219837738812e-07,
      "logps/chosen": -184.43582153320312,
      "logps/rejected": -172.64540100097656,
      "loss": 0.2158,
      "rewards/chosen": 1.5585291385650635,
      "rewards/margins": 6.423967361450195,
      "rewards/rejected": -4.865437984466553,
      "step": 2323
    },
    {
      "epoch": 0.61,
      "grad_norm": 42.39558029174805,
      "kl": 0.0,
      "learning_rate": 1.958911279769694e-07,
      "logps/chosen": -204.50466918945312,
      "logps/rejected": -185.65188598632812,
      "loss": 0.2282,
      "rewards/chosen": 0.8809568881988525,
      "rewards/margins": 5.840044975280762,
      "rewards/rejected": -4.95908784866333,
      "step": 2324
    },
    {
      "epoch": 0.61,
      "grad_norm": 28.4328670501709,
      "kl": 0.0,
      "learning_rate": 1.9576027218005756e-07,
      "logps/chosen": -290.41998291015625,
      "logps/rejected": -194.86659240722656,
      "loss": 0.1874,
      "rewards/chosen": 1.8371068239212036,
      "rewards/margins": 5.469941139221191,
      "rewards/rejected": -3.6328341960906982,
      "step": 2325
    },
    {
      "epoch": 0.61,
      "grad_norm": 33.32115936279297,
      "kl": 0.0,
      "learning_rate": 1.9562941638314576e-07,
      "logps/chosen": -214.08786010742188,
      "logps/rejected": -268.7986755371094,
      "loss": 0.2692,
      "rewards/chosen": 1.712082862854004,
      "rewards/margins": 5.668825149536133,
      "rewards/rejected": -3.95674204826355,
      "step": 2326
    },
    {
      "epoch": 0.61,
      "grad_norm": 37.174095153808594,
      "kl": 0.0,
      "learning_rate": 1.9549856058623395e-07,
      "logps/chosen": -190.068603515625,
      "logps/rejected": -241.15679931640625,
      "loss": 0.3069,
      "rewards/chosen": 1.4767121076583862,
      "rewards/margins": 3.502028465270996,
      "rewards/rejected": -2.0253164768218994,
      "step": 2327
    },
    {
      "epoch": 0.61,
      "grad_norm": 40.89213562011719,
      "kl": 0.0,
      "learning_rate": 1.9536770478932215e-07,
      "logps/chosen": -181.3578338623047,
      "logps/rejected": -280.31915283203125,
      "loss": 0.3081,
      "rewards/chosen": 1.517583966255188,
      "rewards/margins": 3.3675737380981445,
      "rewards/rejected": -1.849989652633667,
      "step": 2328
    },
    {
      "epoch": 0.61,
      "grad_norm": 32.44668960571289,
      "kl": 0.0,
      "learning_rate": 1.9523684899241037e-07,
      "logps/chosen": -149.7053985595703,
      "logps/rejected": -227.4987335205078,
      "loss": 0.2448,
      "rewards/chosen": -0.6433849334716797,
      "rewards/margins": 3.7083725929260254,
      "rewards/rejected": -4.351757526397705,
      "step": 2329
    },
    {
      "epoch": 0.61,
      "grad_norm": 23.76544189453125,
      "kl": 0.0,
      "learning_rate": 1.9510599319549857e-07,
      "logps/chosen": -140.5263671875,
      "logps/rejected": -253.57730102539062,
      "loss": 0.2735,
      "rewards/chosen": -0.26378029584884644,
      "rewards/margins": 4.153634548187256,
      "rewards/rejected": -4.417414665222168,
      "step": 2330
    },
    {
      "epoch": 0.61,
      "grad_norm": 28.82638168334961,
      "kl": 0.0,
      "learning_rate": 1.9497513739858676e-07,
      "logps/chosen": -227.6795654296875,
      "logps/rejected": -303.05035400390625,
      "loss": 0.2781,
      "rewards/chosen": 0.5589046478271484,
      "rewards/margins": 5.917707443237305,
      "rewards/rejected": -5.358802795410156,
      "step": 2331
    },
    {
      "epoch": 0.61,
      "grad_norm": 29.29484748840332,
      "kl": 0.0,
      "learning_rate": 1.9484428160167496e-07,
      "logps/chosen": -186.6299285888672,
      "logps/rejected": -297.8755798339844,
      "loss": 0.2199,
      "rewards/chosen": 1.4472681283950806,
      "rewards/margins": 4.518843650817871,
      "rewards/rejected": -3.07157564163208,
      "step": 2332
    },
    {
      "epoch": 0.61,
      "grad_norm": 40.52328872680664,
      "kl": 0.0,
      "learning_rate": 1.9471342580476313e-07,
      "logps/chosen": -143.46986389160156,
      "logps/rejected": -132.3037872314453,
      "loss": 0.4259,
      "rewards/chosen": -0.21857896447181702,
      "rewards/margins": 2.3188986778259277,
      "rewards/rejected": -2.537477731704712,
      "step": 2333
    },
    {
      "epoch": 0.61,
      "grad_norm": 27.894329071044922,
      "kl": 0.0,
      "learning_rate": 1.9458257000785133e-07,
      "logps/chosen": -209.2640380859375,
      "logps/rejected": -304.1653137207031,
      "loss": 0.1426,
      "rewards/chosen": 2.2198264598846436,
      "rewards/margins": 6.847384452819824,
      "rewards/rejected": -4.62755823135376,
      "step": 2334
    },
    {
      "epoch": 0.61,
      "grad_norm": 38.58939743041992,
      "kl": 0.0,
      "learning_rate": 1.9445171421093952e-07,
      "logps/chosen": -235.97927856445312,
      "logps/rejected": -215.62770080566406,
      "loss": 0.3428,
      "rewards/chosen": -0.37354040145874023,
      "rewards/margins": 2.582803964614868,
      "rewards/rejected": -2.9563443660736084,
      "step": 2335
    },
    {
      "epoch": 0.61,
      "grad_norm": 55.73698806762695,
      "kl": 0.0,
      "learning_rate": 1.9432085841402774e-07,
      "logps/chosen": -202.73712158203125,
      "logps/rejected": -323.6961669921875,
      "loss": 0.333,
      "rewards/chosen": 0.4086439907550812,
      "rewards/margins": 3.40960431098938,
      "rewards/rejected": -3.000960350036621,
      "step": 2336
    },
    {
      "epoch": 0.61,
      "grad_norm": 38.34394836425781,
      "kl": 0.0,
      "learning_rate": 1.9419000261711594e-07,
      "logps/chosen": -262.01995849609375,
      "logps/rejected": -272.8934020996094,
      "loss": 0.3725,
      "rewards/chosen": 0.051617540419101715,
      "rewards/margins": 2.554993152618408,
      "rewards/rejected": -2.50337553024292,
      "step": 2337
    },
    {
      "epoch": 0.61,
      "grad_norm": 40.03268814086914,
      "kl": 0.0,
      "learning_rate": 1.9405914682020414e-07,
      "logps/chosen": -312.5582275390625,
      "logps/rejected": -213.3430938720703,
      "loss": 0.25,
      "rewards/chosen": 2.550088405609131,
      "rewards/margins": 6.379768371582031,
      "rewards/rejected": -3.8296799659729004,
      "step": 2338
    },
    {
      "epoch": 0.61,
      "grad_norm": 30.763914108276367,
      "kl": 0.0,
      "learning_rate": 1.9392829102329233e-07,
      "logps/chosen": -256.4914245605469,
      "logps/rejected": -216.35769653320312,
      "loss": 0.3354,
      "rewards/chosen": 0.5054255127906799,
      "rewards/margins": 4.105841636657715,
      "rewards/rejected": -3.6004161834716797,
      "step": 2339
    },
    {
      "epoch": 0.61,
      "grad_norm": 21.9720458984375,
      "kl": 0.0,
      "learning_rate": 1.937974352263805e-07,
      "logps/chosen": -256.197509765625,
      "logps/rejected": -244.96725463867188,
      "loss": 0.3228,
      "rewards/chosen": 0.6852515339851379,
      "rewards/margins": 5.482264518737793,
      "rewards/rejected": -4.797012805938721,
      "step": 2340
    },
    {
      "epoch": 0.61,
      "grad_norm": 29.677480697631836,
      "kl": 0.0,
      "learning_rate": 1.936665794294687e-07,
      "logps/chosen": -220.20875549316406,
      "logps/rejected": -222.16822814941406,
      "loss": 0.3136,
      "rewards/chosen": -1.0607540607452393,
      "rewards/margins": 1.4132604598999023,
      "rewards/rejected": -2.4740145206451416,
      "step": 2341
    },
    {
      "epoch": 0.61,
      "grad_norm": 39.20149612426758,
      "kl": 0.0,
      "learning_rate": 1.9353572363255692e-07,
      "logps/chosen": -238.39891052246094,
      "logps/rejected": -147.9998779296875,
      "loss": 0.1857,
      "rewards/chosen": 2.199713945388794,
      "rewards/margins": 4.197799205780029,
      "rewards/rejected": -1.9980851411819458,
      "step": 2342
    },
    {
      "epoch": 0.61,
      "grad_norm": 42.02245330810547,
      "kl": 0.0,
      "learning_rate": 1.9340486783564512e-07,
      "logps/chosen": -139.3223419189453,
      "logps/rejected": -246.05770874023438,
      "loss": 0.3031,
      "rewards/chosen": 0.6252093315124512,
      "rewards/margins": 3.0687246322631836,
      "rewards/rejected": -2.4435153007507324,
      "step": 2343
    },
    {
      "epoch": 0.61,
      "grad_norm": 31.7598876953125,
      "kl": 0.0,
      "learning_rate": 1.932740120387333e-07,
      "logps/chosen": -222.23658752441406,
      "logps/rejected": -234.6144256591797,
      "loss": 0.2148,
      "rewards/chosen": 2.6373138427734375,
      "rewards/margins": 5.807960510253906,
      "rewards/rejected": -3.1706466674804688,
      "step": 2344
    },
    {
      "epoch": 0.61,
      "grad_norm": 42.98470687866211,
      "kl": 0.0,
      "learning_rate": 1.931431562418215e-07,
      "logps/chosen": -241.89788818359375,
      "logps/rejected": -238.154296875,
      "loss": 0.32,
      "rewards/chosen": 1.266499638557434,
      "rewards/margins": 4.807114601135254,
      "rewards/rejected": -3.5406150817871094,
      "step": 2345
    },
    {
      "epoch": 0.61,
      "grad_norm": 43.5479850769043,
      "kl": 0.0,
      "learning_rate": 1.930123004449097e-07,
      "logps/chosen": -292.238525390625,
      "logps/rejected": -243.96377563476562,
      "loss": 0.2991,
      "rewards/chosen": 0.4111224114894867,
      "rewards/margins": 4.473801612854004,
      "rewards/rejected": -4.062679290771484,
      "step": 2346
    },
    {
      "epoch": 0.61,
      "grad_norm": 49.00059127807617,
      "kl": 0.0,
      "learning_rate": 1.928814446479979e-07,
      "logps/chosen": -239.2394256591797,
      "logps/rejected": -258.6951904296875,
      "loss": 0.4388,
      "rewards/chosen": -0.20010191202163696,
      "rewards/margins": 1.1232011318206787,
      "rewards/rejected": -1.323302984237671,
      "step": 2347
    },
    {
      "epoch": 0.61,
      "grad_norm": 39.21022033691406,
      "kl": 0.0,
      "learning_rate": 1.9275058885108607e-07,
      "logps/chosen": -255.04443359375,
      "logps/rejected": -244.91729736328125,
      "loss": 0.2715,
      "rewards/chosen": -0.001400096109136939,
      "rewards/margins": 3.116028070449829,
      "rewards/rejected": -3.1174280643463135,
      "step": 2348
    },
    {
      "epoch": 0.61,
      "grad_norm": 34.833255767822266,
      "kl": 0.0,
      "learning_rate": 1.926197330541743e-07,
      "logps/chosen": -194.5554962158203,
      "logps/rejected": -260.3280334472656,
      "loss": 0.2837,
      "rewards/chosen": 0.05182616040110588,
      "rewards/margins": 2.837132215499878,
      "rewards/rejected": -2.785305976867676,
      "step": 2349
    },
    {
      "epoch": 0.62,
      "grad_norm": 31.625768661499023,
      "kl": 0.0,
      "learning_rate": 1.924888772572625e-07,
      "logps/chosen": -214.2799072265625,
      "logps/rejected": -145.29727172851562,
      "loss": 0.2255,
      "rewards/chosen": 2.1377511024475098,
      "rewards/margins": 4.877354621887207,
      "rewards/rejected": -2.7396037578582764,
      "step": 2350
    },
    {
      "epoch": 0.62,
      "grad_norm": 32.17449188232422,
      "kl": 0.0,
      "learning_rate": 1.9235802146035069e-07,
      "logps/chosen": -182.0697479248047,
      "logps/rejected": -301.1997375488281,
      "loss": 0.2758,
      "rewards/chosen": 1.7245681285858154,
      "rewards/margins": 5.422351837158203,
      "rewards/rejected": -3.6977834701538086,
      "step": 2351
    },
    {
      "epoch": 0.62,
      "grad_norm": 36.291282653808594,
      "kl": 0.0,
      "learning_rate": 1.9222716566343888e-07,
      "logps/chosen": -220.62863159179688,
      "logps/rejected": -197.6534423828125,
      "loss": 0.3495,
      "rewards/chosen": -0.28744202852249146,
      "rewards/margins": 2.591951608657837,
      "rewards/rejected": -2.8793935775756836,
      "step": 2352
    },
    {
      "epoch": 0.62,
      "grad_norm": 43.120182037353516,
      "kl": 0.0,
      "learning_rate": 1.9209630986652708e-07,
      "logps/chosen": -189.58387756347656,
      "logps/rejected": -183.0449981689453,
      "loss": 0.274,
      "rewards/chosen": 0.5014609694480896,
      "rewards/margins": 2.78602933883667,
      "rewards/rejected": -2.2845683097839355,
      "step": 2353
    },
    {
      "epoch": 0.62,
      "grad_norm": 31.61993980407715,
      "kl": 0.0,
      "learning_rate": 1.9196545406961527e-07,
      "logps/chosen": -245.66635131835938,
      "logps/rejected": -229.27239990234375,
      "loss": 0.3288,
      "rewards/chosen": -0.1258794069290161,
      "rewards/margins": 2.4228243827819824,
      "rewards/rejected": -2.548703670501709,
      "step": 2354
    },
    {
      "epoch": 0.62,
      "grad_norm": 29.71607208251953,
      "kl": 0.0,
      "learning_rate": 1.918345982727035e-07,
      "logps/chosen": -240.70870971679688,
      "logps/rejected": -260.2348937988281,
      "loss": 0.1747,
      "rewards/chosen": 2.110532522201538,
      "rewards/margins": 5.318500518798828,
      "rewards/rejected": -3.207967758178711,
      "step": 2355
    },
    {
      "epoch": 0.62,
      "grad_norm": 37.52629470825195,
      "kl": 0.0,
      "learning_rate": 1.9170374247579167e-07,
      "logps/chosen": -156.80447387695312,
      "logps/rejected": -227.19284057617188,
      "loss": 0.2688,
      "rewards/chosen": 2.4135518074035645,
      "rewards/margins": 4.432103157043457,
      "rewards/rejected": -2.0185515880584717,
      "step": 2356
    },
    {
      "epoch": 0.62,
      "grad_norm": 39.853370666503906,
      "kl": 0.0,
      "learning_rate": 1.9157288667887986e-07,
      "logps/chosen": -221.11697387695312,
      "logps/rejected": -348.988037109375,
      "loss": 0.3974,
      "rewards/chosen": 0.023872777819633484,
      "rewards/margins": 6.337624549865723,
      "rewards/rejected": -6.313751697540283,
      "step": 2357
    },
    {
      "epoch": 0.62,
      "grad_norm": 29.161619186401367,
      "kl": 0.0,
      "learning_rate": 1.9144203088196806e-07,
      "logps/chosen": -187.28756713867188,
      "logps/rejected": -215.73773193359375,
      "loss": 0.2739,
      "rewards/chosen": 0.8985105156898499,
      "rewards/margins": 3.8181779384613037,
      "rewards/rejected": -2.9196674823760986,
      "step": 2358
    },
    {
      "epoch": 0.62,
      "grad_norm": 32.86288833618164,
      "kl": 0.0,
      "learning_rate": 1.9131117508505625e-07,
      "logps/chosen": -171.5975341796875,
      "logps/rejected": -362.61248779296875,
      "loss": 0.2672,
      "rewards/chosen": 0.9393047094345093,
      "rewards/margins": 5.1951680183410645,
      "rewards/rejected": -4.255863189697266,
      "step": 2359
    },
    {
      "epoch": 0.62,
      "grad_norm": 29.304479598999023,
      "kl": 0.0,
      "learning_rate": 1.9118031928814445e-07,
      "logps/chosen": -253.9071044921875,
      "logps/rejected": -265.0699157714844,
      "loss": 0.2201,
      "rewards/chosen": 2.477958917617798,
      "rewards/margins": 6.703524589538574,
      "rewards/rejected": -4.225565433502197,
      "step": 2360
    },
    {
      "epoch": 0.62,
      "grad_norm": 34.67782974243164,
      "kl": 0.0,
      "learning_rate": 1.9104946349123267e-07,
      "logps/chosen": -157.28024291992188,
      "logps/rejected": -303.87591552734375,
      "loss": 0.1655,
      "rewards/chosen": 1.717661738395691,
      "rewards/margins": 5.473419666290283,
      "rewards/rejected": -3.7557578086853027,
      "step": 2361
    },
    {
      "epoch": 0.62,
      "grad_norm": 33.45250701904297,
      "kl": 0.0,
      "learning_rate": 1.9091860769432087e-07,
      "logps/chosen": -164.96530151367188,
      "logps/rejected": -325.362060546875,
      "loss": 0.2493,
      "rewards/chosen": -1.0385844707489014,
      "rewards/margins": 4.012018203735352,
      "rewards/rejected": -5.050602912902832,
      "step": 2362
    },
    {
      "epoch": 0.62,
      "grad_norm": 40.05767059326172,
      "kl": 0.0,
      "learning_rate": 1.9078775189740904e-07,
      "logps/chosen": -182.37466430664062,
      "logps/rejected": -162.07620239257812,
      "loss": 0.1577,
      "rewards/chosen": 1.3069759607315063,
      "rewards/margins": 4.243577480316162,
      "rewards/rejected": -2.9366016387939453,
      "step": 2363
    },
    {
      "epoch": 0.62,
      "grad_norm": 41.3237419128418,
      "kl": 0.0,
      "learning_rate": 1.9065689610049723e-07,
      "logps/chosen": -214.30227661132812,
      "logps/rejected": -252.76242065429688,
      "loss": 0.284,
      "rewards/chosen": -0.3962777853012085,
      "rewards/margins": 2.5071358680725098,
      "rewards/rejected": -2.9034135341644287,
      "step": 2364
    },
    {
      "epoch": 0.62,
      "grad_norm": 35.928016662597656,
      "kl": 0.0,
      "learning_rate": 1.9052604030358543e-07,
      "logps/chosen": -230.25433349609375,
      "logps/rejected": -209.69033813476562,
      "loss": 0.2953,
      "rewards/chosen": 0.2936524748802185,
      "rewards/margins": 3.8067970275878906,
      "rewards/rejected": -3.5131444931030273,
      "step": 2365
    },
    {
      "epoch": 0.62,
      "grad_norm": 35.94745635986328,
      "kl": 0.0,
      "learning_rate": 1.9039518450667363e-07,
      "logps/chosen": -222.4689178466797,
      "logps/rejected": -242.89163208007812,
      "loss": 0.3081,
      "rewards/chosen": -0.4748765826225281,
      "rewards/margins": 2.3422157764434814,
      "rewards/rejected": -2.8170924186706543,
      "step": 2366
    },
    {
      "epoch": 0.62,
      "grad_norm": 55.70354461669922,
      "kl": 0.0,
      "learning_rate": 1.9026432870976182e-07,
      "logps/chosen": -207.79037475585938,
      "logps/rejected": -231.1091766357422,
      "loss": 0.2639,
      "rewards/chosen": 1.4316012859344482,
      "rewards/margins": 4.312504768371582,
      "rewards/rejected": -2.880903482437134,
      "step": 2367
    },
    {
      "epoch": 0.62,
      "grad_norm": 31.606653213500977,
      "kl": 0.0,
      "learning_rate": 1.9013347291285005e-07,
      "logps/chosen": -212.349609375,
      "logps/rejected": -259.9991760253906,
      "loss": 0.3397,
      "rewards/chosen": 0.9540568590164185,
      "rewards/margins": 6.031365394592285,
      "rewards/rejected": -5.077308654785156,
      "step": 2368
    },
    {
      "epoch": 0.62,
      "grad_norm": 36.44898223876953,
      "kl": 0.0,
      "learning_rate": 1.9000261711593824e-07,
      "logps/chosen": -194.91915893554688,
      "logps/rejected": -281.0176696777344,
      "loss": 0.2525,
      "rewards/chosen": 0.3901931047439575,
      "rewards/margins": 4.855154037475586,
      "rewards/rejected": -4.464961051940918,
      "step": 2369
    },
    {
      "epoch": 0.62,
      "grad_norm": 30.430397033691406,
      "kl": 0.0,
      "learning_rate": 1.8987176131902644e-07,
      "logps/chosen": -239.862548828125,
      "logps/rejected": -260.174072265625,
      "loss": 0.1879,
      "rewards/chosen": 0.952376663684845,
      "rewards/margins": 5.6407036781311035,
      "rewards/rejected": -4.688326835632324,
      "step": 2370
    },
    {
      "epoch": 0.62,
      "grad_norm": 25.95115852355957,
      "kl": 0.0,
      "learning_rate": 1.897409055221146e-07,
      "logps/chosen": -217.9676513671875,
      "logps/rejected": -193.67478942871094,
      "loss": 0.2545,
      "rewards/chosen": 0.6721733212471008,
      "rewards/margins": 4.830235958099365,
      "rewards/rejected": -4.15806245803833,
      "step": 2371
    },
    {
      "epoch": 0.62,
      "grad_norm": 31.143552780151367,
      "kl": 0.0,
      "learning_rate": 1.896100497252028e-07,
      "logps/chosen": -183.70237731933594,
      "logps/rejected": -233.95692443847656,
      "loss": 0.3316,
      "rewards/chosen": 0.9070913195610046,
      "rewards/margins": 4.197821140289307,
      "rewards/rejected": -3.2907299995422363,
      "step": 2372
    },
    {
      "epoch": 0.62,
      "grad_norm": 28.717832565307617,
      "kl": 0.0,
      "learning_rate": 1.89479193928291e-07,
      "logps/chosen": -247.36865234375,
      "logps/rejected": -263.091552734375,
      "loss": 0.2654,
      "rewards/chosen": -0.25779542326927185,
      "rewards/margins": 3.495422124862671,
      "rewards/rejected": -3.7532174587249756,
      "step": 2373
    },
    {
      "epoch": 0.62,
      "grad_norm": 46.38393783569336,
      "kl": 0.0,
      "learning_rate": 1.8934833813137922e-07,
      "logps/chosen": -251.53179931640625,
      "logps/rejected": -275.49102783203125,
      "loss": 0.2089,
      "rewards/chosen": 3.581019878387451,
      "rewards/margins": 6.682999610900879,
      "rewards/rejected": -3.1019797325134277,
      "step": 2374
    },
    {
      "epoch": 0.62,
      "grad_norm": 32.04011535644531,
      "kl": 0.0,
      "learning_rate": 1.8921748233446742e-07,
      "logps/chosen": -226.80967712402344,
      "logps/rejected": -185.3697967529297,
      "loss": 0.1354,
      "rewards/chosen": 1.043463110923767,
      "rewards/margins": 4.468041896820068,
      "rewards/rejected": -3.424578905105591,
      "step": 2375
    },
    {
      "epoch": 0.62,
      "grad_norm": 33.520713806152344,
      "kl": 0.0,
      "learning_rate": 1.8908662653755561e-07,
      "logps/chosen": -224.3209686279297,
      "logps/rejected": -259.0567321777344,
      "loss": 0.2562,
      "rewards/chosen": 2.0344178676605225,
      "rewards/margins": 4.964014053344727,
      "rewards/rejected": -2.929595947265625,
      "step": 2376
    },
    {
      "epoch": 0.62,
      "grad_norm": 32.75332260131836,
      "kl": 0.0,
      "learning_rate": 1.889557707406438e-07,
      "logps/chosen": -229.57943725585938,
      "logps/rejected": -235.8605499267578,
      "loss": 0.3635,
      "rewards/chosen": 0.24739819765090942,
      "rewards/margins": 4.021427631378174,
      "rewards/rejected": -3.77402925491333,
      "step": 2377
    },
    {
      "epoch": 0.62,
      "grad_norm": 33.14479064941406,
      "kl": 0.0,
      "learning_rate": 1.88824914943732e-07,
      "logps/chosen": -258.2164306640625,
      "logps/rejected": -145.81407165527344,
      "loss": 0.3409,
      "rewards/chosen": -0.3117099404335022,
      "rewards/margins": 2.2764155864715576,
      "rewards/rejected": -2.588125467300415,
      "step": 2378
    },
    {
      "epoch": 0.62,
      "grad_norm": 28.57996368408203,
      "kl": 0.0,
      "learning_rate": 1.8869405914682018e-07,
      "logps/chosen": -163.08482360839844,
      "logps/rejected": -152.67636108398438,
      "loss": 0.2951,
      "rewards/chosen": 1.1140191555023193,
      "rewards/margins": 2.7893271446228027,
      "rewards/rejected": -1.6753078699111938,
      "step": 2379
    },
    {
      "epoch": 0.62,
      "grad_norm": 26.788040161132812,
      "kl": 0.0,
      "learning_rate": 1.8856320334990837e-07,
      "logps/chosen": -130.69322204589844,
      "logps/rejected": -254.1563720703125,
      "loss": 0.2663,
      "rewards/chosen": 1.4976850748062134,
      "rewards/margins": 4.847748756408691,
      "rewards/rejected": -3.3500638008117676,
      "step": 2380
    },
    {
      "epoch": 0.62,
      "grad_norm": 36.504066467285156,
      "kl": 0.0,
      "learning_rate": 1.884323475529966e-07,
      "logps/chosen": -198.76889038085938,
      "logps/rejected": -234.64707946777344,
      "loss": 0.401,
      "rewards/chosen": -0.3146289885044098,
      "rewards/margins": 2.4038166999816895,
      "rewards/rejected": -2.7184457778930664,
      "step": 2381
    },
    {
      "epoch": 0.62,
      "grad_norm": 30.19637680053711,
      "kl": 0.0,
      "learning_rate": 1.883014917560848e-07,
      "logps/chosen": -167.89947509765625,
      "logps/rejected": -297.1753234863281,
      "loss": 0.2258,
      "rewards/chosen": 2.146338701248169,
      "rewards/margins": 7.123197555541992,
      "rewards/rejected": -4.976859092712402,
      "step": 2382
    },
    {
      "epoch": 0.62,
      "grad_norm": 46.778839111328125,
      "kl": 0.0,
      "learning_rate": 1.8817063595917299e-07,
      "logps/chosen": -275.8388366699219,
      "logps/rejected": -222.3790283203125,
      "loss": 0.4312,
      "rewards/chosen": -0.8422994613647461,
      "rewards/margins": 1.4927501678466797,
      "rewards/rejected": -2.335049629211426,
      "step": 2383
    },
    {
      "epoch": 0.62,
      "grad_norm": 38.08612823486328,
      "kl": 0.0,
      "learning_rate": 1.8803978016226118e-07,
      "logps/chosen": -189.39913940429688,
      "logps/rejected": -221.18472290039062,
      "loss": 0.2462,
      "rewards/chosen": 0.9700742959976196,
      "rewards/margins": 3.71815824508667,
      "rewards/rejected": -2.74808406829834,
      "step": 2384
    },
    {
      "epoch": 0.62,
      "grad_norm": 31.00204849243164,
      "kl": 0.0,
      "learning_rate": 1.8790892436534938e-07,
      "logps/chosen": -201.84481811523438,
      "logps/rejected": -230.4281005859375,
      "loss": 0.2608,
      "rewards/chosen": 0.6287637948989868,
      "rewards/margins": 3.8814697265625,
      "rewards/rejected": -3.2527058124542236,
      "step": 2385
    },
    {
      "epoch": 0.62,
      "grad_norm": 29.59605598449707,
      "kl": 0.0,
      "learning_rate": 1.8777806856843757e-07,
      "logps/chosen": -182.8221435546875,
      "logps/rejected": -246.32264709472656,
      "loss": 0.2239,
      "rewards/chosen": 0.729421854019165,
      "rewards/margins": 4.723428726196289,
      "rewards/rejected": -3.994007110595703,
      "step": 2386
    },
    {
      "epoch": 0.62,
      "grad_norm": 38.360904693603516,
      "kl": 0.0,
      "learning_rate": 1.8764721277152577e-07,
      "logps/chosen": -209.9322509765625,
      "logps/rejected": -353.68414306640625,
      "loss": 0.2591,
      "rewards/chosen": 1.7438795566558838,
      "rewards/margins": 4.995150566101074,
      "rewards/rejected": -3.2512707710266113,
      "step": 2387
    },
    {
      "epoch": 0.62,
      "grad_norm": 36.978267669677734,
      "kl": 0.0,
      "learning_rate": 1.8751635697461397e-07,
      "logps/chosen": -184.61764526367188,
      "logps/rejected": -162.25514221191406,
      "loss": 0.2072,
      "rewards/chosen": 0.8075557351112366,
      "rewards/margins": 3.699028253555298,
      "rewards/rejected": -2.891472578048706,
      "step": 2388
    },
    {
      "epoch": 0.63,
      "grad_norm": 36.411163330078125,
      "kl": 0.0,
      "learning_rate": 1.8738550117770216e-07,
      "logps/chosen": -247.070068359375,
      "logps/rejected": -230.36900329589844,
      "loss": 0.2015,
      "rewards/chosen": -0.4802358150482178,
      "rewards/margins": 3.238145589828491,
      "rewards/rejected": -3.718381404876709,
      "step": 2389
    },
    {
      "epoch": 0.63,
      "grad_norm": 30.9532470703125,
      "kl": 0.0,
      "learning_rate": 1.8725464538079036e-07,
      "logps/chosen": -161.55776977539062,
      "logps/rejected": -247.14659118652344,
      "loss": 0.2947,
      "rewards/chosen": -0.4317225217819214,
      "rewards/margins": 2.7858991622924805,
      "rewards/rejected": -3.2176215648651123,
      "step": 2390
    },
    {
      "epoch": 0.63,
      "grad_norm": 33.81348419189453,
      "kl": 0.0,
      "learning_rate": 1.8712378958387856e-07,
      "logps/chosen": -246.77601623535156,
      "logps/rejected": -257.72235107421875,
      "loss": 0.2854,
      "rewards/chosen": 0.6340773105621338,
      "rewards/margins": 4.2819366455078125,
      "rewards/rejected": -3.647859573364258,
      "step": 2391
    },
    {
      "epoch": 0.63,
      "grad_norm": 33.270442962646484,
      "kl": 0.0,
      "learning_rate": 1.8699293378696675e-07,
      "logps/chosen": -202.92318725585938,
      "logps/rejected": -237.6372528076172,
      "loss": 0.3072,
      "rewards/chosen": 1.4428868293762207,
      "rewards/margins": 4.175738334655762,
      "rewards/rejected": -2.732851505279541,
      "step": 2392
    },
    {
      "epoch": 0.63,
      "grad_norm": 35.90946578979492,
      "kl": 0.0,
      "learning_rate": 1.8686207799005497e-07,
      "logps/chosen": -205.4209442138672,
      "logps/rejected": -257.3418273925781,
      "loss": 0.313,
      "rewards/chosen": -0.1432461142539978,
      "rewards/margins": 4.7647905349731445,
      "rewards/rejected": -4.908036708831787,
      "step": 2393
    },
    {
      "epoch": 0.63,
      "grad_norm": 33.007659912109375,
      "kl": 0.0,
      "learning_rate": 1.8673122219314314e-07,
      "logps/chosen": -165.77957153320312,
      "logps/rejected": -215.4703369140625,
      "loss": 0.2844,
      "rewards/chosen": 0.5170528888702393,
      "rewards/margins": 4.497333526611328,
      "rewards/rejected": -3.980280637741089,
      "step": 2394
    },
    {
      "epoch": 0.63,
      "grad_norm": 33.04104232788086,
      "kl": 0.0,
      "learning_rate": 1.8660036639623134e-07,
      "logps/chosen": -170.15045166015625,
      "logps/rejected": -236.4322967529297,
      "loss": 0.2423,
      "rewards/chosen": 1.5568426847457886,
      "rewards/margins": 3.240248680114746,
      "rewards/rejected": -1.683405876159668,
      "step": 2395
    },
    {
      "epoch": 0.63,
      "grad_norm": 40.87158203125,
      "kl": 0.0,
      "learning_rate": 1.8646951059931954e-07,
      "logps/chosen": -224.96380615234375,
      "logps/rejected": -285.54058837890625,
      "loss": 0.2544,
      "rewards/chosen": 1.9070959091186523,
      "rewards/margins": 6.817129135131836,
      "rewards/rejected": -4.910033226013184,
      "step": 2396
    },
    {
      "epoch": 0.63,
      "grad_norm": 39.35676956176758,
      "kl": 0.0,
      "learning_rate": 1.8633865480240773e-07,
      "logps/chosen": -186.65476989746094,
      "logps/rejected": -186.18894958496094,
      "loss": 0.3623,
      "rewards/chosen": -0.24426405131816864,
      "rewards/margins": 2.902918815612793,
      "rewards/rejected": -3.1471829414367676,
      "step": 2397
    },
    {
      "epoch": 0.63,
      "grad_norm": 34.04732131958008,
      "kl": 0.0,
      "learning_rate": 1.8620779900549593e-07,
      "logps/chosen": -216.03396606445312,
      "logps/rejected": -287.70501708984375,
      "loss": 0.2585,
      "rewards/chosen": 1.3595733642578125,
      "rewards/margins": 5.254693984985352,
      "rewards/rejected": -3.895120620727539,
      "step": 2398
    },
    {
      "epoch": 0.63,
      "grad_norm": 36.18494415283203,
      "kl": 0.0,
      "learning_rate": 1.8607694320858412e-07,
      "logps/chosen": -215.787841796875,
      "logps/rejected": -389.7153015136719,
      "loss": 0.1888,
      "rewards/chosen": 1.4879820346832275,
      "rewards/margins": 4.753214359283447,
      "rewards/rejected": -3.2652323246002197,
      "step": 2399
    },
    {
      "epoch": 0.63,
      "grad_norm": 49.94510269165039,
      "kl": 0.0,
      "learning_rate": 1.8594608741167235e-07,
      "logps/chosen": -261.67913818359375,
      "logps/rejected": -219.42575073242188,
      "loss": 0.2057,
      "rewards/chosen": 0.8924288153648376,
      "rewards/margins": 4.58126974105835,
      "rewards/rejected": -3.6888411045074463,
      "step": 2400
    },
    {
      "epoch": 0.63,
      "grad_norm": 34.9915771484375,
      "kl": 0.0,
      "learning_rate": 1.8581523161476054e-07,
      "logps/chosen": -157.85699462890625,
      "logps/rejected": -282.4374084472656,
      "loss": 0.2325,
      "rewards/chosen": 2.0740764141082764,
      "rewards/margins": 4.795737266540527,
      "rewards/rejected": -2.72166109085083,
      "step": 2401
    },
    {
      "epoch": 0.63,
      "grad_norm": 36.97850036621094,
      "kl": 0.0,
      "learning_rate": 1.856843758178487e-07,
      "logps/chosen": -219.75802612304688,
      "logps/rejected": -340.8087158203125,
      "loss": 0.2289,
      "rewards/chosen": 2.2684707641601562,
      "rewards/margins": 7.455198287963867,
      "rewards/rejected": -5.186727523803711,
      "step": 2402
    },
    {
      "epoch": 0.63,
      "grad_norm": 35.18425750732422,
      "kl": 0.0,
      "learning_rate": 1.855535200209369e-07,
      "logps/chosen": -159.93328857421875,
      "logps/rejected": -339.95556640625,
      "loss": 0.2479,
      "rewards/chosen": 1.4932546615600586,
      "rewards/margins": 3.469346523284912,
      "rewards/rejected": -1.976091980934143,
      "step": 2403
    },
    {
      "epoch": 0.63,
      "grad_norm": 38.96127700805664,
      "kl": 0.0,
      "learning_rate": 1.854226642240251e-07,
      "logps/chosen": -216.40310668945312,
      "logps/rejected": -195.5192108154297,
      "loss": 0.2884,
      "rewards/chosen": -0.16616499423980713,
      "rewards/margins": 3.938969135284424,
      "rewards/rejected": -4.105134010314941,
      "step": 2404
    },
    {
      "epoch": 0.63,
      "grad_norm": 38.0742073059082,
      "kl": 0.0,
      "learning_rate": 1.852918084271133e-07,
      "logps/chosen": -173.3180694580078,
      "logps/rejected": -259.1162414550781,
      "loss": 0.2673,
      "rewards/chosen": 1.2408561706542969,
      "rewards/margins": 4.919090270996094,
      "rewards/rejected": -3.678234338760376,
      "step": 2405
    },
    {
      "epoch": 0.63,
      "grad_norm": 30.632402420043945,
      "kl": 0.0,
      "learning_rate": 1.8516095263020152e-07,
      "logps/chosen": -143.20333862304688,
      "logps/rejected": -234.39422607421875,
      "loss": 0.2918,
      "rewards/chosen": 1.0636051893234253,
      "rewards/margins": 3.586050033569336,
      "rewards/rejected": -2.522444725036621,
      "step": 2406
    },
    {
      "epoch": 0.63,
      "grad_norm": 38.62371063232422,
      "kl": 0.0,
      "learning_rate": 1.8503009683328972e-07,
      "logps/chosen": -229.66151428222656,
      "logps/rejected": -292.41717529296875,
      "loss": 0.3,
      "rewards/chosen": 0.04666091501712799,
      "rewards/margins": 4.005421161651611,
      "rewards/rejected": -3.9587600231170654,
      "step": 2407
    },
    {
      "epoch": 0.63,
      "grad_norm": 41.32472610473633,
      "kl": 0.0,
      "learning_rate": 1.8489924103637792e-07,
      "logps/chosen": -249.702880859375,
      "logps/rejected": -257.2342529296875,
      "loss": 0.3374,
      "rewards/chosen": 1.1685850620269775,
      "rewards/margins": 2.980180501937866,
      "rewards/rejected": -1.8115954399108887,
      "step": 2408
    },
    {
      "epoch": 0.63,
      "grad_norm": 27.124591827392578,
      "kl": 0.0,
      "learning_rate": 1.847683852394661e-07,
      "logps/chosen": -172.0544891357422,
      "logps/rejected": -246.92007446289062,
      "loss": 0.2379,
      "rewards/chosen": 0.1499977856874466,
      "rewards/margins": 4.02953577041626,
      "rewards/rejected": -3.87953782081604,
      "step": 2409
    },
    {
      "epoch": 0.63,
      "grad_norm": 41.78574752807617,
      "kl": 0.0,
      "learning_rate": 1.8463752944255428e-07,
      "logps/chosen": -175.81947326660156,
      "logps/rejected": -271.74114990234375,
      "loss": 0.2285,
      "rewards/chosen": 1.0703699588775635,
      "rewards/margins": 4.834184646606445,
      "rewards/rejected": -3.7638144493103027,
      "step": 2410
    },
    {
      "epoch": 0.63,
      "grad_norm": 39.91765594482422,
      "kl": 0.0,
      "learning_rate": 1.8450667364564248e-07,
      "logps/chosen": -276.7518005371094,
      "logps/rejected": -347.8869323730469,
      "loss": 0.3181,
      "rewards/chosen": -0.08891824632883072,
      "rewards/margins": 5.355866432189941,
      "rewards/rejected": -5.444784641265869,
      "step": 2411
    },
    {
      "epoch": 0.63,
      "grad_norm": 40.989402770996094,
      "kl": 0.0,
      "learning_rate": 1.8437581784873067e-07,
      "logps/chosen": -221.4810791015625,
      "logps/rejected": -204.2392578125,
      "loss": 0.2914,
      "rewards/chosen": 1.4958677291870117,
      "rewards/margins": 3.875377893447876,
      "rewards/rejected": -2.3795101642608643,
      "step": 2412
    },
    {
      "epoch": 0.63,
      "grad_norm": 40.61765670776367,
      "kl": 0.0,
      "learning_rate": 1.842449620518189e-07,
      "logps/chosen": -157.64474487304688,
      "logps/rejected": -282.7855529785156,
      "loss": 0.3276,
      "rewards/chosen": 0.4709343910217285,
      "rewards/margins": 3.0344228744506836,
      "rewards/rejected": -2.563488483428955,
      "step": 2413
    },
    {
      "epoch": 0.63,
      "grad_norm": 32.019100189208984,
      "kl": 0.0,
      "learning_rate": 1.841141062549071e-07,
      "logps/chosen": -152.17665100097656,
      "logps/rejected": -321.3105163574219,
      "loss": 0.2842,
      "rewards/chosen": 0.6752780079841614,
      "rewards/margins": 5.736945629119873,
      "rewards/rejected": -5.061667442321777,
      "step": 2414
    },
    {
      "epoch": 0.63,
      "grad_norm": 26.227691650390625,
      "kl": 0.0,
      "learning_rate": 1.839832504579953e-07,
      "logps/chosen": -166.89146423339844,
      "logps/rejected": -273.63555908203125,
      "loss": 0.2115,
      "rewards/chosen": 0.6281822919845581,
      "rewards/margins": 2.977193832397461,
      "rewards/rejected": -2.3490116596221924,
      "step": 2415
    },
    {
      "epoch": 0.63,
      "grad_norm": 28.663330078125,
      "kl": 0.0,
      "learning_rate": 1.8385239466108348e-07,
      "logps/chosen": -117.30267333984375,
      "logps/rejected": -310.23828125,
      "loss": 0.2346,
      "rewards/chosen": 0.3416542410850525,
      "rewards/margins": 4.763482570648193,
      "rewards/rejected": -4.421828269958496,
      "step": 2416
    },
    {
      "epoch": 0.63,
      "grad_norm": 28.140478134155273,
      "kl": 0.0,
      "learning_rate": 1.8372153886417168e-07,
      "logps/chosen": -248.0314178466797,
      "logps/rejected": -195.76657104492188,
      "loss": 0.2903,
      "rewards/chosen": 1.8965684175491333,
      "rewards/margins": 5.802505970001221,
      "rewards/rejected": -3.905937671661377,
      "step": 2417
    },
    {
      "epoch": 0.63,
      "grad_norm": 40.49986267089844,
      "kl": 0.0,
      "learning_rate": 1.8359068306725985e-07,
      "logps/chosen": -138.3631134033203,
      "logps/rejected": -231.3643341064453,
      "loss": 0.3318,
      "rewards/chosen": -0.4930739998817444,
      "rewards/margins": 2.0822386741638184,
      "rewards/rejected": -2.575312614440918,
      "step": 2418
    },
    {
      "epoch": 0.63,
      "grad_norm": 28.37915802001953,
      "kl": 0.0,
      "learning_rate": 1.8345982727034807e-07,
      "logps/chosen": -122.53243255615234,
      "logps/rejected": -233.7292938232422,
      "loss": 0.2877,
      "rewards/chosen": -0.30030661821365356,
      "rewards/margins": 4.429220676422119,
      "rewards/rejected": -4.729527473449707,
      "step": 2419
    },
    {
      "epoch": 0.63,
      "grad_norm": 27.743270874023438,
      "kl": 0.0,
      "learning_rate": 1.8332897147343627e-07,
      "logps/chosen": -193.67543029785156,
      "logps/rejected": -227.80419921875,
      "loss": 0.2714,
      "rewards/chosen": 0.15854603052139282,
      "rewards/margins": 3.921008348464966,
      "rewards/rejected": -3.7624623775482178,
      "step": 2420
    },
    {
      "epoch": 0.63,
      "grad_norm": 48.460201263427734,
      "kl": 0.0,
      "learning_rate": 1.8319811567652446e-07,
      "logps/chosen": -185.3589324951172,
      "logps/rejected": -280.7593078613281,
      "loss": 0.3378,
      "rewards/chosen": 0.38898900151252747,
      "rewards/margins": 3.3332223892211914,
      "rewards/rejected": -2.9442334175109863,
      "step": 2421
    },
    {
      "epoch": 0.63,
      "grad_norm": 39.76223373413086,
      "kl": 0.0,
      "learning_rate": 1.8306725987961266e-07,
      "logps/chosen": -206.49517822265625,
      "logps/rejected": -231.34706115722656,
      "loss": 0.1478,
      "rewards/chosen": 2.267397880554199,
      "rewards/margins": 6.553323745727539,
      "rewards/rejected": -4.28592586517334,
      "step": 2422
    },
    {
      "epoch": 0.63,
      "grad_norm": 35.084251403808594,
      "kl": 0.0,
      "learning_rate": 1.8293640408270086e-07,
      "logps/chosen": -233.82870483398438,
      "logps/rejected": -215.7780303955078,
      "loss": 0.2572,
      "rewards/chosen": 0.8703588247299194,
      "rewards/margins": 4.745369911193848,
      "rewards/rejected": -3.8750109672546387,
      "step": 2423
    },
    {
      "epoch": 0.63,
      "grad_norm": 26.757802963256836,
      "kl": 0.0,
      "learning_rate": 1.8280554828578905e-07,
      "logps/chosen": -208.91114807128906,
      "logps/rejected": -294.6512451171875,
      "loss": 0.3139,
      "rewards/chosen": 0.3779858350753784,
      "rewards/margins": 3.4969449043273926,
      "rewards/rejected": -3.1189589500427246,
      "step": 2424
    },
    {
      "epoch": 0.63,
      "grad_norm": 35.949974060058594,
      "kl": 0.0,
      "learning_rate": 1.8267469248887722e-07,
      "logps/chosen": -210.23870849609375,
      "logps/rejected": -213.9252471923828,
      "loss": 0.3204,
      "rewards/chosen": -0.17984256148338318,
      "rewards/margins": 2.2258670330047607,
      "rewards/rejected": -2.4057095050811768,
      "step": 2425
    },
    {
      "epoch": 0.63,
      "grad_norm": 26.635404586791992,
      "kl": 0.0,
      "learning_rate": 1.8254383669196544e-07,
      "logps/chosen": -216.50100708007812,
      "logps/rejected": -290.6695556640625,
      "loss": 0.2065,
      "rewards/chosen": 2.7978515625,
      "rewards/margins": 6.623014450073242,
      "rewards/rejected": -3.825162887573242,
      "step": 2426
    },
    {
      "epoch": 0.64,
      "grad_norm": 37.63100051879883,
      "kl": 0.0,
      "learning_rate": 1.8241298089505364e-07,
      "logps/chosen": -206.00582885742188,
      "logps/rejected": -234.72286987304688,
      "loss": 0.3039,
      "rewards/chosen": 0.5800890922546387,
      "rewards/margins": 2.977942705154419,
      "rewards/rejected": -2.3978536128997803,
      "step": 2427
    },
    {
      "epoch": 0.64,
      "grad_norm": 42.57049560546875,
      "kl": 0.0,
      "learning_rate": 1.8228212509814184e-07,
      "logps/chosen": -288.19244384765625,
      "logps/rejected": -245.7040252685547,
      "loss": 0.4022,
      "rewards/chosen": -0.9926659464836121,
      "rewards/margins": 1.974844217300415,
      "rewards/rejected": -2.967510223388672,
      "step": 2428
    },
    {
      "epoch": 0.64,
      "grad_norm": 29.48101234436035,
      "kl": 0.0,
      "learning_rate": 1.8215126930123003e-07,
      "logps/chosen": -152.51112365722656,
      "logps/rejected": -317.3865966796875,
      "loss": 0.2226,
      "rewards/chosen": 0.774302065372467,
      "rewards/margins": 4.343541622161865,
      "rewards/rejected": -3.569239616394043,
      "step": 2429
    },
    {
      "epoch": 0.64,
      "grad_norm": 32.72949981689453,
      "kl": 0.0,
      "learning_rate": 1.8202041350431823e-07,
      "logps/chosen": -181.32928466796875,
      "logps/rejected": -258.7019958496094,
      "loss": 0.2056,
      "rewards/chosen": 0.38289085030555725,
      "rewards/margins": 5.136065483093262,
      "rewards/rejected": -4.753174781799316,
      "step": 2430
    },
    {
      "epoch": 0.64,
      "grad_norm": 38.77192687988281,
      "kl": 0.0,
      "learning_rate": 1.8188955770740643e-07,
      "logps/chosen": -197.99661254882812,
      "logps/rejected": -297.813232421875,
      "loss": 0.2575,
      "rewards/chosen": 0.8634415864944458,
      "rewards/margins": 3.4837145805358887,
      "rewards/rejected": -2.6202731132507324,
      "step": 2431
    },
    {
      "epoch": 0.64,
      "grad_norm": 37.21419906616211,
      "kl": 0.0,
      "learning_rate": 1.8175870191049465e-07,
      "logps/chosen": -219.07395935058594,
      "logps/rejected": -278.3205871582031,
      "loss": 0.1869,
      "rewards/chosen": 0.7768957018852234,
      "rewards/margins": 5.239342212677002,
      "rewards/rejected": -4.462446689605713,
      "step": 2432
    },
    {
      "epoch": 0.64,
      "grad_norm": 34.17312240600586,
      "kl": 0.0,
      "learning_rate": 1.8162784611358282e-07,
      "logps/chosen": -174.0838623046875,
      "logps/rejected": -198.52992248535156,
      "loss": 0.3237,
      "rewards/chosen": 0.5632308721542358,
      "rewards/margins": 4.041640758514404,
      "rewards/rejected": -3.478409767150879,
      "step": 2433
    },
    {
      "epoch": 0.64,
      "grad_norm": 34.17102813720703,
      "kl": 0.0,
      "learning_rate": 1.8149699031667101e-07,
      "logps/chosen": -182.22317504882812,
      "logps/rejected": -242.95675659179688,
      "loss": 0.3105,
      "rewards/chosen": 0.08409518748521805,
      "rewards/margins": 3.777181625366211,
      "rewards/rejected": -3.6930863857269287,
      "step": 2434
    },
    {
      "epoch": 0.64,
      "grad_norm": 37.4450798034668,
      "kl": 0.0,
      "learning_rate": 1.813661345197592e-07,
      "logps/chosen": -234.27056884765625,
      "logps/rejected": -240.46636962890625,
      "loss": 0.1962,
      "rewards/chosen": 1.776934027671814,
      "rewards/margins": 5.846645355224609,
      "rewards/rejected": -4.069711208343506,
      "step": 2435
    },
    {
      "epoch": 0.64,
      "grad_norm": 30.845748901367188,
      "kl": 0.0,
      "learning_rate": 1.812352787228474e-07,
      "logps/chosen": -252.45223999023438,
      "logps/rejected": -358.770751953125,
      "loss": 0.1712,
      "rewards/chosen": -0.10393345355987549,
      "rewards/margins": 5.5656962394714355,
      "rewards/rejected": -5.6696295738220215,
      "step": 2436
    },
    {
      "epoch": 0.64,
      "grad_norm": 41.58126449584961,
      "kl": 0.0,
      "learning_rate": 1.811044229259356e-07,
      "logps/chosen": -227.92178344726562,
      "logps/rejected": -236.83309936523438,
      "loss": 0.3241,
      "rewards/chosen": 1.2366797924041748,
      "rewards/margins": 3.234030246734619,
      "rewards/rejected": -1.9973505735397339,
      "step": 2437
    },
    {
      "epoch": 0.64,
      "grad_norm": 34.29671859741211,
      "kl": 0.0,
      "learning_rate": 1.8097356712902382e-07,
      "logps/chosen": -194.61155700683594,
      "logps/rejected": -224.98800659179688,
      "loss": 0.283,
      "rewards/chosen": 0.47941941022872925,
      "rewards/margins": 2.887995958328247,
      "rewards/rejected": -2.408576488494873,
      "step": 2438
    },
    {
      "epoch": 0.64,
      "grad_norm": 28.590755462646484,
      "kl": 0.0,
      "learning_rate": 1.8084271133211202e-07,
      "logps/chosen": -145.79449462890625,
      "logps/rejected": -300.8880615234375,
      "loss": 0.1904,
      "rewards/chosen": 0.6898126006126404,
      "rewards/margins": 5.064650058746338,
      "rewards/rejected": -4.374837398529053,
      "step": 2439
    },
    {
      "epoch": 0.64,
      "grad_norm": 21.487871170043945,
      "kl": 0.0,
      "learning_rate": 1.8071185553520022e-07,
      "logps/chosen": -158.74505615234375,
      "logps/rejected": -275.7796630859375,
      "loss": 0.2573,
      "rewards/chosen": 1.552902102470398,
      "rewards/margins": 5.477572917938232,
      "rewards/rejected": -3.924670696258545,
      "step": 2440
    },
    {
      "epoch": 0.64,
      "grad_norm": 28.447002410888672,
      "kl": 0.0,
      "learning_rate": 1.8058099973828839e-07,
      "logps/chosen": -211.07456970214844,
      "logps/rejected": -278.2138671875,
      "loss": 0.258,
      "rewards/chosen": -0.24363432824611664,
      "rewards/margins": 3.8275575637817383,
      "rewards/rejected": -4.071191787719727,
      "step": 2441
    },
    {
      "epoch": 0.64,
      "grad_norm": 34.62162780761719,
      "kl": 0.0,
      "learning_rate": 1.8045014394137658e-07,
      "logps/chosen": -261.4964294433594,
      "logps/rejected": -281.09320068359375,
      "loss": 0.3759,
      "rewards/chosen": 0.2970547080039978,
      "rewards/margins": 2.0743181705474854,
      "rewards/rejected": -1.7772635221481323,
      "step": 2442
    },
    {
      "epoch": 0.64,
      "grad_norm": 37.94488525390625,
      "kl": 0.0,
      "learning_rate": 1.8031928814446478e-07,
      "logps/chosen": -223.45570373535156,
      "logps/rejected": -238.4624481201172,
      "loss": 0.2795,
      "rewards/chosen": 0.44729822874069214,
      "rewards/margins": 4.174413204193115,
      "rewards/rejected": -3.7271151542663574,
      "step": 2443
    },
    {
      "epoch": 0.64,
      "grad_norm": 42.214988708496094,
      "kl": 0.0,
      "learning_rate": 1.8018843234755297e-07,
      "logps/chosen": -194.6533203125,
      "logps/rejected": -261.42962646484375,
      "loss": 0.2167,
      "rewards/chosen": 0.04782336950302124,
      "rewards/margins": 3.3527185916900635,
      "rewards/rejected": -3.3048951625823975,
      "step": 2444
    },
    {
      "epoch": 0.64,
      "grad_norm": 34.41872024536133,
      "kl": 0.0,
      "learning_rate": 1.800575765506412e-07,
      "logps/chosen": -208.5053253173828,
      "logps/rejected": -261.6292724609375,
      "loss": 0.2264,
      "rewards/chosen": 1.414298176765442,
      "rewards/margins": 6.264079570770264,
      "rewards/rejected": -4.849781513214111,
      "step": 2445
    },
    {
      "epoch": 0.64,
      "grad_norm": 55.12093734741211,
      "kl": 0.0,
      "learning_rate": 1.799267207537294e-07,
      "logps/chosen": -213.4249725341797,
      "logps/rejected": -304.13330078125,
      "loss": 0.3324,
      "rewards/chosen": 0.5824081301689148,
      "rewards/margins": 4.4119977951049805,
      "rewards/rejected": -3.82958984375,
      "step": 2446
    },
    {
      "epoch": 0.64,
      "grad_norm": 42.6986198425293,
      "kl": 0.0,
      "learning_rate": 1.797958649568176e-07,
      "logps/chosen": -176.6316680908203,
      "logps/rejected": -293.6900634765625,
      "loss": 0.3693,
      "rewards/chosen": -0.32846707105636597,
      "rewards/margins": 4.005442142486572,
      "rewards/rejected": -4.333909034729004,
      "step": 2447
    },
    {
      "epoch": 0.64,
      "grad_norm": 40.18217468261719,
      "kl": 0.0,
      "learning_rate": 1.7966500915990579e-07,
      "logps/chosen": -183.23350524902344,
      "logps/rejected": -217.79002380371094,
      "loss": 0.2887,
      "rewards/chosen": 1.0397711992263794,
      "rewards/margins": 2.5541157722473145,
      "rewards/rejected": -1.5143444538116455,
      "step": 2448
    },
    {
      "epoch": 0.64,
      "grad_norm": 33.37748336791992,
      "kl": 0.0,
      "learning_rate": 1.7953415336299396e-07,
      "logps/chosen": -202.826416015625,
      "logps/rejected": -217.74087524414062,
      "loss": 0.2367,
      "rewards/chosen": 0.40020081400871277,
      "rewards/margins": 4.570520401000977,
      "rewards/rejected": -4.170319557189941,
      "step": 2449
    },
    {
      "epoch": 0.64,
      "grad_norm": 29.754947662353516,
      "kl": 0.0,
      "learning_rate": 1.7940329756608215e-07,
      "logps/chosen": -222.77371215820312,
      "logps/rejected": -223.1396026611328,
      "loss": 0.2676,
      "rewards/chosen": 1.1596344709396362,
      "rewards/margins": 4.159872055053711,
      "rewards/rejected": -3.000237464904785,
      "step": 2450
    },
    {
      "epoch": 0.64,
      "grad_norm": 39.329864501953125,
      "kl": 0.0,
      "learning_rate": 1.7927244176917037e-07,
      "logps/chosen": -255.2774658203125,
      "logps/rejected": -217.47598266601562,
      "loss": 0.2728,
      "rewards/chosen": 0.08437132835388184,
      "rewards/margins": 3.42891001701355,
      "rewards/rejected": -3.344538688659668,
      "step": 2451
    },
    {
      "epoch": 0.64,
      "grad_norm": 35.733943939208984,
      "kl": 0.0,
      "learning_rate": 1.7914158597225857e-07,
      "logps/chosen": -193.04127502441406,
      "logps/rejected": -307.47894287109375,
      "loss": 0.2573,
      "rewards/chosen": 1.3631714582443237,
      "rewards/margins": 4.002033710479736,
      "rewards/rejected": -2.638862371444702,
      "step": 2452
    },
    {
      "epoch": 0.64,
      "grad_norm": 28.73012924194336,
      "kl": 0.0,
      "learning_rate": 1.7901073017534677e-07,
      "logps/chosen": -175.82322692871094,
      "logps/rejected": -294.29327392578125,
      "loss": 0.2468,
      "rewards/chosen": 1.1468980312347412,
      "rewards/margins": 4.692342281341553,
      "rewards/rejected": -3.5454442501068115,
      "step": 2453
    },
    {
      "epoch": 0.64,
      "grad_norm": 35.15346145629883,
      "kl": 0.0,
      "learning_rate": 1.7887987437843496e-07,
      "logps/chosen": -269.71636962890625,
      "logps/rejected": -274.54364013671875,
      "loss": 0.4064,
      "rewards/chosen": 0.32359778881073,
      "rewards/margins": 2.4833879470825195,
      "rewards/rejected": -2.1597900390625,
      "step": 2454
    },
    {
      "epoch": 0.64,
      "grad_norm": 35.98922348022461,
      "kl": 0.0,
      "learning_rate": 1.7874901858152316e-07,
      "logps/chosen": -180.52163696289062,
      "logps/rejected": -253.1143798828125,
      "loss": 0.298,
      "rewards/chosen": 1.0234401226043701,
      "rewards/margins": 4.086539268493652,
      "rewards/rejected": -3.063098907470703,
      "step": 2455
    },
    {
      "epoch": 0.64,
      "grad_norm": 28.62803077697754,
      "kl": 0.0,
      "learning_rate": 1.7861816278461133e-07,
      "logps/chosen": -206.5957489013672,
      "logps/rejected": -294.3093566894531,
      "loss": 0.2218,
      "rewards/chosen": 0.06527487188577652,
      "rewards/margins": 4.730937480926514,
      "rewards/rejected": -4.66566276550293,
      "step": 2456
    },
    {
      "epoch": 0.64,
      "grad_norm": 48.36724853515625,
      "kl": 0.0,
      "learning_rate": 1.7848730698769955e-07,
      "logps/chosen": -129.61965942382812,
      "logps/rejected": -219.6337890625,
      "loss": 0.2765,
      "rewards/chosen": 0.6784979104995728,
      "rewards/margins": 3.8180155754089355,
      "rewards/rejected": -3.1395177841186523,
      "step": 2457
    },
    {
      "epoch": 0.64,
      "grad_norm": 31.1170654296875,
      "kl": 0.0,
      "learning_rate": 1.7835645119078775e-07,
      "logps/chosen": -196.87185668945312,
      "logps/rejected": -322.5999450683594,
      "loss": 0.2401,
      "rewards/chosen": 1.9326436519622803,
      "rewards/margins": 6.90676212310791,
      "rewards/rejected": -4.974118709564209,
      "step": 2458
    },
    {
      "epoch": 0.64,
      "grad_norm": 24.22307586669922,
      "kl": 0.0,
      "learning_rate": 1.7822559539387594e-07,
      "logps/chosen": -134.24618530273438,
      "logps/rejected": -140.83575439453125,
      "loss": 0.1632,
      "rewards/chosen": 2.179842948913574,
      "rewards/margins": 5.916825294494629,
      "rewards/rejected": -3.7369823455810547,
      "step": 2459
    },
    {
      "epoch": 0.64,
      "grad_norm": 38.252418518066406,
      "kl": 0.0,
      "learning_rate": 1.7809473959696414e-07,
      "logps/chosen": -129.58192443847656,
      "logps/rejected": -221.4130859375,
      "loss": 0.2781,
      "rewards/chosen": 0.37688159942626953,
      "rewards/margins": 2.2952418327331543,
      "rewards/rejected": -1.9183603525161743,
      "step": 2460
    },
    {
      "epoch": 0.64,
      "grad_norm": 32.61762237548828,
      "kl": 0.0,
      "learning_rate": 1.7796388380005233e-07,
      "logps/chosen": -251.0649871826172,
      "logps/rejected": -177.43023681640625,
      "loss": 0.2294,
      "rewards/chosen": 0.15406060218811035,
      "rewards/margins": 4.70060920715332,
      "rewards/rejected": -4.546548843383789,
      "step": 2461
    },
    {
      "epoch": 0.64,
      "grad_norm": 32.96717071533203,
      "kl": 0.0,
      "learning_rate": 1.7783302800314053e-07,
      "logps/chosen": -162.8365478515625,
      "logps/rejected": -236.65098571777344,
      "loss": 0.2856,
      "rewards/chosen": 1.1021099090576172,
      "rewards/margins": 4.090110778808594,
      "rewards/rejected": -2.9880011081695557,
      "step": 2462
    },
    {
      "epoch": 0.64,
      "grad_norm": 27.149503707885742,
      "kl": 0.0,
      "learning_rate": 1.7770217220622873e-07,
      "logps/chosen": -171.068359375,
      "logps/rejected": -306.48956298828125,
      "loss": 0.2813,
      "rewards/chosen": 1.013263463973999,
      "rewards/margins": 4.866916656494141,
      "rewards/rejected": -3.8536529541015625,
      "step": 2463
    },
    {
      "epoch": 0.64,
      "grad_norm": 44.923675537109375,
      "kl": 0.0,
      "learning_rate": 1.7757131640931692e-07,
      "logps/chosen": -206.68359375,
      "logps/rejected": -244.8902587890625,
      "loss": 0.2559,
      "rewards/chosen": 0.11413121223449707,
      "rewards/margins": 4.262955665588379,
      "rewards/rejected": -4.148824214935303,
      "step": 2464
    },
    {
      "epoch": 0.65,
      "grad_norm": 41.188018798828125,
      "kl": 0.0,
      "learning_rate": 1.7744046061240512e-07,
      "logps/chosen": -297.6365051269531,
      "logps/rejected": -292.2042236328125,
      "loss": 0.2326,
      "rewards/chosen": 1.7737928628921509,
      "rewards/margins": 5.583432197570801,
      "rewards/rejected": -3.8096394538879395,
      "step": 2465
    },
    {
      "epoch": 0.65,
      "grad_norm": 26.103349685668945,
      "kl": 0.0,
      "learning_rate": 1.7730960481549332e-07,
      "logps/chosen": -168.4176483154297,
      "logps/rejected": -240.8828125,
      "loss": 0.144,
      "rewards/chosen": 2.443394184112549,
      "rewards/margins": 5.812577247619629,
      "rewards/rejected": -3.369182825088501,
      "step": 2466
    },
    {
      "epoch": 0.65,
      "grad_norm": 33.39318084716797,
      "kl": 0.0,
      "learning_rate": 1.771787490185815e-07,
      "logps/chosen": -160.09738159179688,
      "logps/rejected": -280.89251708984375,
      "loss": 0.2235,
      "rewards/chosen": 1.8487533330917358,
      "rewards/margins": 4.299698829650879,
      "rewards/rejected": -2.4509453773498535,
      "step": 2467
    },
    {
      "epoch": 0.65,
      "grad_norm": 23.734865188598633,
      "kl": 0.0,
      "learning_rate": 1.770478932216697e-07,
      "logps/chosen": -162.56219482421875,
      "logps/rejected": -170.80076599121094,
      "loss": 0.1597,
      "rewards/chosen": -0.08058365434408188,
      "rewards/margins": 4.102053642272949,
      "rewards/rejected": -4.1826372146606445,
      "step": 2468
    },
    {
      "epoch": 0.65,
      "grad_norm": 33.352291107177734,
      "kl": 0.0,
      "learning_rate": 1.769170374247579e-07,
      "logps/chosen": -230.69261169433594,
      "logps/rejected": -318.5282287597656,
      "loss": 0.3542,
      "rewards/chosen": -0.887917160987854,
      "rewards/margins": 4.116541862487793,
      "rewards/rejected": -5.004458904266357,
      "step": 2469
    },
    {
      "epoch": 0.65,
      "grad_norm": 28.36016273498535,
      "kl": 0.0,
      "learning_rate": 1.7678618162784613e-07,
      "logps/chosen": -192.17990112304688,
      "logps/rejected": -252.4993133544922,
      "loss": 0.132,
      "rewards/chosen": 2.809781789779663,
      "rewards/margins": 7.519709587097168,
      "rewards/rejected": -4.709927558898926,
      "step": 2470
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.00200271606445,
      "kl": 0.0,
      "learning_rate": 1.7665532583093432e-07,
      "logps/chosen": -243.95375061035156,
      "logps/rejected": -255.2686309814453,
      "loss": 0.2298,
      "rewards/chosen": 2.4145703315734863,
      "rewards/margins": 6.422689914703369,
      "rewards/rejected": -4.008119583129883,
      "step": 2471
    },
    {
      "epoch": 0.65,
      "grad_norm": 35.79818344116211,
      "kl": 0.0,
      "learning_rate": 1.765244700340225e-07,
      "logps/chosen": -201.85919189453125,
      "logps/rejected": -209.87356567382812,
      "loss": 0.2992,
      "rewards/chosen": 1.0443308353424072,
      "rewards/margins": 4.464914798736572,
      "rewards/rejected": -3.420583963394165,
      "step": 2472
    },
    {
      "epoch": 0.65,
      "grad_norm": 28.738393783569336,
      "kl": 0.0,
      "learning_rate": 1.763936142371107e-07,
      "logps/chosen": -298.4902648925781,
      "logps/rejected": -262.4606628417969,
      "loss": 0.1664,
      "rewards/chosen": 0.7948918342590332,
      "rewards/margins": 4.708080291748047,
      "rewards/rejected": -3.9131884574890137,
      "step": 2473
    },
    {
      "epoch": 0.65,
      "grad_norm": 32.60791015625,
      "kl": 0.0,
      "learning_rate": 1.7626275844019888e-07,
      "logps/chosen": -233.0884552001953,
      "logps/rejected": -306.63031005859375,
      "loss": 0.2184,
      "rewards/chosen": 3.0960965156555176,
      "rewards/margins": 7.467236518859863,
      "rewards/rejected": -4.371140003204346,
      "step": 2474
    },
    {
      "epoch": 0.65,
      "grad_norm": 28.54564094543457,
      "kl": 0.0,
      "learning_rate": 1.7613190264328708e-07,
      "logps/chosen": -171.89976501464844,
      "logps/rejected": -187.40951538085938,
      "loss": 0.2721,
      "rewards/chosen": -0.18746140599250793,
      "rewards/margins": 3.1394734382629395,
      "rewards/rejected": -3.326934814453125,
      "step": 2475
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.77114486694336,
      "kl": 0.0,
      "learning_rate": 1.760010468463753e-07,
      "logps/chosen": -176.38687133789062,
      "logps/rejected": -217.43106079101562,
      "loss": 0.2527,
      "rewards/chosen": 1.2474067211151123,
      "rewards/margins": 4.102912902832031,
      "rewards/rejected": -2.855506420135498,
      "step": 2476
    },
    {
      "epoch": 0.65,
      "grad_norm": 33.361473083496094,
      "kl": 0.0,
      "learning_rate": 1.758701910494635e-07,
      "logps/chosen": -143.1798858642578,
      "logps/rejected": -371.7455749511719,
      "loss": 0.2283,
      "rewards/chosen": -0.4656195044517517,
      "rewards/margins": 5.626425743103027,
      "rewards/rejected": -6.092045307159424,
      "step": 2477
    },
    {
      "epoch": 0.65,
      "grad_norm": 29.22950553894043,
      "kl": 0.0,
      "learning_rate": 1.757393352525517e-07,
      "logps/chosen": -229.2440948486328,
      "logps/rejected": -299.21514892578125,
      "loss": 0.2769,
      "rewards/chosen": 0.769917905330658,
      "rewards/margins": 2.953812599182129,
      "rewards/rejected": -2.183894634246826,
      "step": 2478
    },
    {
      "epoch": 0.65,
      "grad_norm": 44.46550750732422,
      "kl": 0.0,
      "learning_rate": 1.756084794556399e-07,
      "logps/chosen": -235.3984375,
      "logps/rejected": -244.00491333007812,
      "loss": 0.2266,
      "rewards/chosen": 0.822199285030365,
      "rewards/margins": 3.435243606567383,
      "rewards/rejected": -2.613044261932373,
      "step": 2479
    },
    {
      "epoch": 0.65,
      "grad_norm": 30.26697540283203,
      "kl": 0.0,
      "learning_rate": 1.7547762365872806e-07,
      "logps/chosen": -173.37026977539062,
      "logps/rejected": -240.79135131835938,
      "loss": 0.235,
      "rewards/chosen": 1.0367459058761597,
      "rewards/margins": 4.59039831161499,
      "rewards/rejected": -3.55365252494812,
      "step": 2480
    },
    {
      "epoch": 0.65,
      "grad_norm": 36.04526138305664,
      "kl": 0.0,
      "learning_rate": 1.7534676786181626e-07,
      "logps/chosen": -201.38333129882812,
      "logps/rejected": -310.193359375,
      "loss": 0.304,
      "rewards/chosen": -0.028808683156967163,
      "rewards/margins": 3.178330898284912,
      "rewards/rejected": -3.207139492034912,
      "step": 2481
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.90766906738281,
      "kl": 0.0,
      "learning_rate": 1.7521591206490445e-07,
      "logps/chosen": -199.36154174804688,
      "logps/rejected": -197.4955596923828,
      "loss": 0.22,
      "rewards/chosen": 0.07011854648590088,
      "rewards/margins": 3.276404857635498,
      "rewards/rejected": -3.2062861919403076,
      "step": 2482
    },
    {
      "epoch": 0.65,
      "grad_norm": 35.02695846557617,
      "kl": 0.0,
      "learning_rate": 1.7508505626799268e-07,
      "logps/chosen": -215.74803161621094,
      "logps/rejected": -266.92626953125,
      "loss": 0.3153,
      "rewards/chosen": 0.9781593084335327,
      "rewards/margins": 5.768883228302002,
      "rewards/rejected": -4.79072380065918,
      "step": 2483
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.68666076660156,
      "kl": 0.0,
      "learning_rate": 1.7495420047108087e-07,
      "logps/chosen": -166.11175537109375,
      "logps/rejected": -273.3475646972656,
      "loss": 0.2436,
      "rewards/chosen": 1.2479914426803589,
      "rewards/margins": 5.1875176429748535,
      "rewards/rejected": -3.939526081085205,
      "step": 2484
    },
    {
      "epoch": 0.65,
      "grad_norm": 27.92112159729004,
      "kl": 0.0,
      "learning_rate": 1.7482334467416907e-07,
      "logps/chosen": -256.979248046875,
      "logps/rejected": -195.372802734375,
      "loss": 0.2592,
      "rewards/chosen": 1.7819782495498657,
      "rewards/margins": 4.243241786956787,
      "rewards/rejected": -2.461263418197632,
      "step": 2485
    },
    {
      "epoch": 0.65,
      "grad_norm": 38.856407165527344,
      "kl": 0.0,
      "learning_rate": 1.7469248887725726e-07,
      "logps/chosen": -220.26260375976562,
      "logps/rejected": -132.40969848632812,
      "loss": 0.2858,
      "rewards/chosen": 1.6704671382904053,
      "rewards/margins": 3.005314588546753,
      "rewards/rejected": -1.3348474502563477,
      "step": 2486
    },
    {
      "epoch": 0.65,
      "grad_norm": 33.75629425048828,
      "kl": 0.0,
      "learning_rate": 1.7456163308034543e-07,
      "logps/chosen": -224.50924682617188,
      "logps/rejected": -201.2100830078125,
      "loss": 0.2443,
      "rewards/chosen": 0.600041925907135,
      "rewards/margins": 2.671865463256836,
      "rewards/rejected": -2.0718235969543457,
      "step": 2487
    },
    {
      "epoch": 0.65,
      "grad_norm": 32.12745666503906,
      "kl": 0.0,
      "learning_rate": 1.7443077728343363e-07,
      "logps/chosen": -196.9830780029297,
      "logps/rejected": -138.11795043945312,
      "loss": 0.2377,
      "rewards/chosen": 1.765749216079712,
      "rewards/margins": 4.971625328063965,
      "rewards/rejected": -3.205876350402832,
      "step": 2488
    },
    {
      "epoch": 0.65,
      "grad_norm": 36.79436111450195,
      "kl": 0.0,
      "learning_rate": 1.7429992148652185e-07,
      "logps/chosen": -257.759765625,
      "logps/rejected": -156.55078125,
      "loss": 0.2281,
      "rewards/chosen": 1.8479835987091064,
      "rewards/margins": 5.976274490356445,
      "rewards/rejected": -4.128291130065918,
      "step": 2489
    },
    {
      "epoch": 0.65,
      "grad_norm": 38.34791564941406,
      "kl": 0.0,
      "learning_rate": 1.7416906568961005e-07,
      "logps/chosen": -242.24095153808594,
      "logps/rejected": -281.4273681640625,
      "loss": 0.3033,
      "rewards/chosen": -1.0780397653579712,
      "rewards/margins": 3.3613080978393555,
      "rewards/rejected": -4.439347743988037,
      "step": 2490
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.285850524902344,
      "kl": 0.0,
      "learning_rate": 1.7403820989269824e-07,
      "logps/chosen": -167.8081512451172,
      "logps/rejected": -238.1160430908203,
      "loss": 0.2076,
      "rewards/chosen": 2.113269805908203,
      "rewards/margins": 5.478918075561523,
      "rewards/rejected": -3.3656485080718994,
      "step": 2491
    },
    {
      "epoch": 0.65,
      "grad_norm": 36.4212532043457,
      "kl": 0.0,
      "learning_rate": 1.7390735409578644e-07,
      "logps/chosen": -284.6696472167969,
      "logps/rejected": -324.4261779785156,
      "loss": 0.2279,
      "rewards/chosen": -0.020696640014648438,
      "rewards/margins": 3.7279841899871826,
      "rewards/rejected": -3.748680830001831,
      "step": 2492
    },
    {
      "epoch": 0.65,
      "grad_norm": 32.45363998413086,
      "kl": 0.0,
      "learning_rate": 1.7377649829887464e-07,
      "logps/chosen": -295.4963073730469,
      "logps/rejected": -249.78189086914062,
      "loss": 0.2526,
      "rewards/chosen": 2.3719584941864014,
      "rewards/margins": 4.654071807861328,
      "rewards/rejected": -2.282113552093506,
      "step": 2493
    },
    {
      "epoch": 0.65,
      "grad_norm": 29.20089340209961,
      "kl": 0.0,
      "learning_rate": 1.7364564250196283e-07,
      "logps/chosen": -192.12564086914062,
      "logps/rejected": -224.99407958984375,
      "loss": 0.2861,
      "rewards/chosen": 1.1605472564697266,
      "rewards/margins": 4.133330821990967,
      "rewards/rejected": -2.9727835655212402,
      "step": 2494
    },
    {
      "epoch": 0.65,
      "grad_norm": 39.42447280883789,
      "kl": 0.0,
      "learning_rate": 1.73514786705051e-07,
      "logps/chosen": -224.35662841796875,
      "logps/rejected": -237.2237548828125,
      "loss": 0.2572,
      "rewards/chosen": 1.054343581199646,
      "rewards/margins": 3.775313377380371,
      "rewards/rejected": -2.7209696769714355,
      "step": 2495
    },
    {
      "epoch": 0.65,
      "grad_norm": 39.16334915161133,
      "kl": 0.0,
      "learning_rate": 1.7338393090813922e-07,
      "logps/chosen": -214.0321807861328,
      "logps/rejected": -207.50270080566406,
      "loss": 0.232,
      "rewards/chosen": 1.2089825868606567,
      "rewards/margins": 4.443968296051025,
      "rewards/rejected": -3.234985828399658,
      "step": 2496
    },
    {
      "epoch": 0.65,
      "grad_norm": 25.793285369873047,
      "kl": 0.0,
      "learning_rate": 1.7325307511122742e-07,
      "logps/chosen": -179.11578369140625,
      "logps/rejected": -253.93060302734375,
      "loss": 0.2831,
      "rewards/chosen": 1.198627233505249,
      "rewards/margins": 5.378981590270996,
      "rewards/rejected": -4.180354118347168,
      "step": 2497
    },
    {
      "epoch": 0.65,
      "grad_norm": 35.483436584472656,
      "kl": 0.0,
      "learning_rate": 1.7312221931431562e-07,
      "logps/chosen": -181.34971618652344,
      "logps/rejected": -267.29290771484375,
      "loss": 0.3095,
      "rewards/chosen": 0.8412249088287354,
      "rewards/margins": 5.656091690063477,
      "rewards/rejected": -4.814866542816162,
      "step": 2498
    },
    {
      "epoch": 0.65,
      "grad_norm": 26.45248031616211,
      "kl": 0.0,
      "learning_rate": 1.729913635174038e-07,
      "logps/chosen": -268.8468017578125,
      "logps/rejected": -223.29476928710938,
      "loss": 0.2184,
      "rewards/chosen": 1.8355284929275513,
      "rewards/margins": 8.121630668640137,
      "rewards/rejected": -6.286102294921875,
      "step": 2499
    },
    {
      "epoch": 0.65,
      "grad_norm": 27.72164535522461,
      "kl": 0.0,
      "learning_rate": 1.72860507720492e-07,
      "logps/chosen": -250.75514221191406,
      "logps/rejected": -251.33322143554688,
      "loss": 0.1799,
      "rewards/chosen": 2.802277088165283,
      "rewards/margins": 7.107457637786865,
      "rewards/rejected": -4.305180549621582,
      "step": 2500
    },
    {
      "epoch": 0.65,
      "grad_norm": 33.076168060302734,
      "kl": 0.0,
      "learning_rate": 1.727296519235802e-07,
      "logps/chosen": -170.14682006835938,
      "logps/rejected": -306.0361328125,
      "loss": 0.1722,
      "rewards/chosen": 1.6673815250396729,
      "rewards/margins": 5.51771879196167,
      "rewards/rejected": -3.850337266921997,
      "step": 2501
    },
    {
      "epoch": 0.65,
      "grad_norm": 34.463134765625,
      "kl": 0.0,
      "learning_rate": 1.7259879612666843e-07,
      "logps/chosen": -258.2651672363281,
      "logps/rejected": -253.67453002929688,
      "loss": 0.2684,
      "rewards/chosen": 1.771608591079712,
      "rewards/margins": 5.85136604309082,
      "rewards/rejected": -4.0797576904296875,
      "step": 2502
    },
    {
      "epoch": 0.66,
      "grad_norm": 41.39519119262695,
      "kl": 0.0,
      "learning_rate": 1.724679403297566e-07,
      "logps/chosen": -150.8826904296875,
      "logps/rejected": -281.825439453125,
      "loss": 0.3189,
      "rewards/chosen": 0.4526558518409729,
      "rewards/margins": 2.457512855529785,
      "rewards/rejected": -2.004857063293457,
      "step": 2503
    },
    {
      "epoch": 0.66,
      "grad_norm": 33.52042770385742,
      "kl": 0.0,
      "learning_rate": 1.723370845328448e-07,
      "logps/chosen": -270.5859680175781,
      "logps/rejected": -232.13134765625,
      "loss": 0.225,
      "rewards/chosen": 1.8501927852630615,
      "rewards/margins": 5.286325454711914,
      "rewards/rejected": -3.4361324310302734,
      "step": 2504
    },
    {
      "epoch": 0.66,
      "grad_norm": 26.857891082763672,
      "kl": 0.0,
      "learning_rate": 1.72206228735933e-07,
      "logps/chosen": -235.99998474121094,
      "logps/rejected": -252.3543701171875,
      "loss": 0.2047,
      "rewards/chosen": 1.4618678092956543,
      "rewards/margins": 5.876453876495361,
      "rewards/rejected": -4.414586067199707,
      "step": 2505
    },
    {
      "epoch": 0.66,
      "grad_norm": 29.943302154541016,
      "kl": 0.0,
      "learning_rate": 1.7207537293902119e-07,
      "logps/chosen": -158.45936584472656,
      "logps/rejected": -222.40798950195312,
      "loss": 0.2241,
      "rewards/chosen": 1.1429415941238403,
      "rewards/margins": 5.468343734741211,
      "rewards/rejected": -4.32540225982666,
      "step": 2506
    },
    {
      "epoch": 0.66,
      "grad_norm": 32.61026382446289,
      "kl": 0.0,
      "learning_rate": 1.7194451714210938e-07,
      "logps/chosen": -225.8323516845703,
      "logps/rejected": -226.42227172851562,
      "loss": 0.2229,
      "rewards/chosen": 2.261629104614258,
      "rewards/margins": 6.06764030456543,
      "rewards/rejected": -3.8060109615325928,
      "step": 2507
    },
    {
      "epoch": 0.66,
      "grad_norm": 29.26123046875,
      "kl": 0.0,
      "learning_rate": 1.718136613451976e-07,
      "logps/chosen": -226.59225463867188,
      "logps/rejected": -286.61572265625,
      "loss": 0.2764,
      "rewards/chosen": 1.3310816287994385,
      "rewards/margins": 4.778102397918701,
      "rewards/rejected": -3.4470207691192627,
      "step": 2508
    },
    {
      "epoch": 0.66,
      "grad_norm": 37.21846389770508,
      "kl": 0.0,
      "learning_rate": 1.716828055482858e-07,
      "logps/chosen": -153.80140686035156,
      "logps/rejected": -322.50732421875,
      "loss": 0.1747,
      "rewards/chosen": 0.6639166474342346,
      "rewards/margins": 4.689450740814209,
      "rewards/rejected": -4.025534152984619,
      "step": 2509
    },
    {
      "epoch": 0.66,
      "grad_norm": 30.148347854614258,
      "kl": 0.0,
      "learning_rate": 1.7155194975137397e-07,
      "logps/chosen": -226.34432983398438,
      "logps/rejected": -206.22552490234375,
      "loss": 0.3551,
      "rewards/chosen": -0.16970336437225342,
      "rewards/margins": 2.9069418907165527,
      "rewards/rejected": -3.0766453742980957,
      "step": 2510
    },
    {
      "epoch": 0.66,
      "grad_norm": 34.247314453125,
      "kl": 0.0,
      "learning_rate": 1.7142109395446217e-07,
      "logps/chosen": -198.5338897705078,
      "logps/rejected": -207.5816650390625,
      "loss": 0.1954,
      "rewards/chosen": 1.4360510110855103,
      "rewards/margins": 4.661987781524658,
      "rewards/rejected": -3.2259366512298584,
      "step": 2511
    },
    {
      "epoch": 0.66,
      "grad_norm": 22.530338287353516,
      "kl": 0.0,
      "learning_rate": 1.7129023815755036e-07,
      "logps/chosen": -194.3182830810547,
      "logps/rejected": -249.9308624267578,
      "loss": 0.1564,
      "rewards/chosen": 1.923820972442627,
      "rewards/margins": 5.429113864898682,
      "rewards/rejected": -3.5052928924560547,
      "step": 2512
    },
    {
      "epoch": 0.66,
      "grad_norm": 36.972007751464844,
      "kl": 0.0,
      "learning_rate": 1.7115938236063856e-07,
      "logps/chosen": -200.99462890625,
      "logps/rejected": -229.53919982910156,
      "loss": 0.2399,
      "rewards/chosen": 1.6839921474456787,
      "rewards/margins": 4.945981025695801,
      "rewards/rejected": -3.261988878250122,
      "step": 2513
    },
    {
      "epoch": 0.66,
      "grad_norm": 36.51010513305664,
      "kl": 0.0,
      "learning_rate": 1.7102852656372675e-07,
      "logps/chosen": -220.27752685546875,
      "logps/rejected": -185.32997131347656,
      "loss": 0.3269,
      "rewards/chosen": 2.3345282077789307,
      "rewards/margins": 4.075137138366699,
      "rewards/rejected": -1.7406089305877686,
      "step": 2514
    },
    {
      "epoch": 0.66,
      "grad_norm": 45.91500473022461,
      "kl": 0.0,
      "learning_rate": 1.7089767076681498e-07,
      "logps/chosen": -243.1271514892578,
      "logps/rejected": -201.50389099121094,
      "loss": 0.1832,
      "rewards/chosen": 1.141270637512207,
      "rewards/margins": 4.884117126464844,
      "rewards/rejected": -3.742846727371216,
      "step": 2515
    },
    {
      "epoch": 0.66,
      "grad_norm": 31.586618423461914,
      "kl": 0.0,
      "learning_rate": 1.7076681496990317e-07,
      "logps/chosen": -146.20590209960938,
      "logps/rejected": -183.73988342285156,
      "loss": 0.3548,
      "rewards/chosen": 0.2781957685947418,
      "rewards/margins": 2.8900160789489746,
      "rewards/rejected": -2.6118202209472656,
      "step": 2516
    },
    {
      "epoch": 0.66,
      "grad_norm": 31.6408748626709,
      "kl": 0.0,
      "learning_rate": 1.7063595917299137e-07,
      "logps/chosen": -184.8021697998047,
      "logps/rejected": -232.1508026123047,
      "loss": 0.1541,
      "rewards/chosen": 1.8986157178878784,
      "rewards/margins": 5.86688232421875,
      "rewards/rejected": -3.968266725540161,
      "step": 2517
    },
    {
      "epoch": 0.66,
      "grad_norm": 47.6021842956543,
      "kl": 0.0,
      "learning_rate": 1.7050510337607954e-07,
      "logps/chosen": -169.6454315185547,
      "logps/rejected": -179.72286987304688,
      "loss": 0.2734,
      "rewards/chosen": 0.23273229598999023,
      "rewards/margins": 2.5509696006774902,
      "rewards/rejected": -2.3182373046875,
      "step": 2518
    },
    {
      "epoch": 0.66,
      "grad_norm": 37.73974609375,
      "kl": 0.0,
      "learning_rate": 1.7037424757916773e-07,
      "logps/chosen": -152.7825927734375,
      "logps/rejected": -184.64952087402344,
      "loss": 0.2823,
      "rewards/chosen": -0.130534827709198,
      "rewards/margins": 4.2096405029296875,
      "rewards/rejected": -4.340175151824951,
      "step": 2519
    },
    {
      "epoch": 0.66,
      "grad_norm": 31.94325065612793,
      "kl": 0.0,
      "learning_rate": 1.7024339178225593e-07,
      "logps/chosen": -158.05471801757812,
      "logps/rejected": -187.04931640625,
      "loss": 0.2188,
      "rewards/chosen": 1.0949746370315552,
      "rewards/margins": 4.778814315795898,
      "rewards/rejected": -3.6838395595550537,
      "step": 2520
    },
    {
      "epoch": 0.66,
      "grad_norm": 35.16641616821289,
      "kl": 0.0,
      "learning_rate": 1.7011253598534415e-07,
      "logps/chosen": -161.69552612304688,
      "logps/rejected": -265.4871826171875,
      "loss": 0.2512,
      "rewards/chosen": 0.9285306930541992,
      "rewards/margins": 3.704392433166504,
      "rewards/rejected": -2.7758617401123047,
      "step": 2521
    },
    {
      "epoch": 0.66,
      "grad_norm": 38.41954803466797,
      "kl": 0.0,
      "learning_rate": 1.6998168018843235e-07,
      "logps/chosen": -232.1246795654297,
      "logps/rejected": -205.31787109375,
      "loss": 0.2639,
      "rewards/chosen": 0.07170384377241135,
      "rewards/margins": 4.2816596031188965,
      "rewards/rejected": -4.20995569229126,
      "step": 2522
    },
    {
      "epoch": 0.66,
      "grad_norm": 47.72578430175781,
      "kl": 0.0,
      "learning_rate": 1.6985082439152055e-07,
      "logps/chosen": -162.1151123046875,
      "logps/rejected": -244.63079833984375,
      "loss": 0.2459,
      "rewards/chosen": -0.6484707593917847,
      "rewards/margins": 2.6753692626953125,
      "rewards/rejected": -3.3238401412963867,
      "step": 2523
    },
    {
      "epoch": 0.66,
      "grad_norm": 29.99666976928711,
      "kl": 0.0,
      "learning_rate": 1.6971996859460874e-07,
      "logps/chosen": -216.69342041015625,
      "logps/rejected": -247.39427185058594,
      "loss": 0.2399,
      "rewards/chosen": 0.34248360991477966,
      "rewards/margins": 3.2656843662261963,
      "rewards/rejected": -2.923200845718384,
      "step": 2524
    },
    {
      "epoch": 0.66,
      "grad_norm": 36.366554260253906,
      "kl": 0.0,
      "learning_rate": 1.6958911279769694e-07,
      "logps/chosen": -240.5819091796875,
      "logps/rejected": -249.13943481445312,
      "loss": 0.3091,
      "rewards/chosen": 0.6309940218925476,
      "rewards/margins": 4.097060203552246,
      "rewards/rejected": -3.4660661220550537,
      "step": 2525
    },
    {
      "epoch": 0.66,
      "grad_norm": 35.71635055541992,
      "kl": 0.0,
      "learning_rate": 1.694582570007851e-07,
      "logps/chosen": -219.22613525390625,
      "logps/rejected": -270.96051025390625,
      "loss": 0.2344,
      "rewards/chosen": 0.7651382088661194,
      "rewards/margins": 4.163715839385986,
      "rewards/rejected": -3.3985774517059326,
      "step": 2526
    },
    {
      "epoch": 0.66,
      "grad_norm": 26.018346786499023,
      "kl": 0.0,
      "learning_rate": 1.693274012038733e-07,
      "logps/chosen": -124.76805114746094,
      "logps/rejected": -178.9542999267578,
      "loss": 0.2418,
      "rewards/chosen": 0.2460649609565735,
      "rewards/margins": 3.744946241378784,
      "rewards/rejected": -3.4988813400268555,
      "step": 2527
    },
    {
      "epoch": 0.66,
      "grad_norm": 37.38301467895508,
      "kl": 0.0,
      "learning_rate": 1.6919654540696153e-07,
      "logps/chosen": -150.87771606445312,
      "logps/rejected": -201.72645568847656,
      "loss": 0.2214,
      "rewards/chosen": 1.6785235404968262,
      "rewards/margins": 7.901946067810059,
      "rewards/rejected": -6.223422527313232,
      "step": 2528
    },
    {
      "epoch": 0.66,
      "grad_norm": 30.10106086730957,
      "kl": 0.0,
      "learning_rate": 1.6906568961004972e-07,
      "logps/chosen": -193.71878051757812,
      "logps/rejected": -209.6848602294922,
      "loss": 0.2651,
      "rewards/chosen": 0.5997923016548157,
      "rewards/margins": 4.252300262451172,
      "rewards/rejected": -3.652507781982422,
      "step": 2529
    },
    {
      "epoch": 0.66,
      "grad_norm": 35.383975982666016,
      "kl": 0.0,
      "learning_rate": 1.6893483381313792e-07,
      "logps/chosen": -145.73793029785156,
      "logps/rejected": -252.19183349609375,
      "loss": 0.3688,
      "rewards/chosen": 0.0910300612449646,
      "rewards/margins": 4.4787726402282715,
      "rewards/rejected": -4.387742519378662,
      "step": 2530
    },
    {
      "epoch": 0.66,
      "grad_norm": 35.255428314208984,
      "kl": 0.0,
      "learning_rate": 1.6880397801622611e-07,
      "logps/chosen": -172.89564514160156,
      "logps/rejected": -302.4369812011719,
      "loss": 0.2492,
      "rewards/chosen": 1.2666151523590088,
      "rewards/margins": 8.008228302001953,
      "rewards/rejected": -6.741613388061523,
      "step": 2531
    },
    {
      "epoch": 0.66,
      "grad_norm": 45.66690444946289,
      "kl": 0.0,
      "learning_rate": 1.686731222193143e-07,
      "logps/chosen": -236.4287872314453,
      "logps/rejected": -223.3417510986328,
      "loss": 0.3459,
      "rewards/chosen": 0.06413912773132324,
      "rewards/margins": 2.366434335708618,
      "rewards/rejected": -2.302295207977295,
      "step": 2532
    },
    {
      "epoch": 0.66,
      "grad_norm": 29.879467010498047,
      "kl": 0.0,
      "learning_rate": 1.685422664224025e-07,
      "logps/chosen": -199.5425262451172,
      "logps/rejected": -373.2349548339844,
      "loss": 0.203,
      "rewards/chosen": 2.3080244064331055,
      "rewards/margins": 7.140763759613037,
      "rewards/rejected": -4.832739353179932,
      "step": 2533
    },
    {
      "epoch": 0.66,
      "grad_norm": 21.92876625061035,
      "kl": 0.0,
      "learning_rate": 1.684114106254907e-07,
      "logps/chosen": -236.30528259277344,
      "logps/rejected": -233.36373901367188,
      "loss": 0.2341,
      "rewards/chosen": -1.7516714334487915,
      "rewards/margins": 1.928197979927063,
      "rewards/rejected": -3.6798694133758545,
      "step": 2534
    },
    {
      "epoch": 0.66,
      "grad_norm": 35.66194534301758,
      "kl": 0.0,
      "learning_rate": 1.682805548285789e-07,
      "logps/chosen": -211.97149658203125,
      "logps/rejected": -355.55926513671875,
      "loss": 0.2266,
      "rewards/chosen": 1.2822884321212769,
      "rewards/margins": 4.822649955749512,
      "rewards/rejected": -3.5403616428375244,
      "step": 2535
    },
    {
      "epoch": 0.66,
      "grad_norm": 29.88081169128418,
      "kl": 0.0,
      "learning_rate": 1.681496990316671e-07,
      "logps/chosen": -209.38343811035156,
      "logps/rejected": -249.86630249023438,
      "loss": 0.2414,
      "rewards/chosen": 2.7169368267059326,
      "rewards/margins": 7.931248664855957,
      "rewards/rejected": -5.214311599731445,
      "step": 2536
    },
    {
      "epoch": 0.66,
      "grad_norm": 39.38168716430664,
      "kl": 0.0,
      "learning_rate": 1.680188432347553e-07,
      "logps/chosen": -214.57864379882812,
      "logps/rejected": -201.05377197265625,
      "loss": 0.3847,
      "rewards/chosen": 0.13023439049720764,
      "rewards/margins": 3.047044038772583,
      "rewards/rejected": -2.916809558868408,
      "step": 2537
    },
    {
      "epoch": 0.66,
      "grad_norm": 31.947851181030273,
      "kl": 0.0,
      "learning_rate": 1.678879874378435e-07,
      "logps/chosen": -157.2117156982422,
      "logps/rejected": -217.1915283203125,
      "loss": 0.2563,
      "rewards/chosen": 0.8770288228988647,
      "rewards/margins": 5.18677282333374,
      "rewards/rejected": -4.309743881225586,
      "step": 2538
    },
    {
      "epoch": 0.66,
      "grad_norm": 32.08303451538086,
      "kl": 0.0,
      "learning_rate": 1.6775713164093168e-07,
      "logps/chosen": -242.77227783203125,
      "logps/rejected": -256.195556640625,
      "loss": 0.2824,
      "rewards/chosen": 2.168449640274048,
      "rewards/margins": 5.70610237121582,
      "rewards/rejected": -3.5376524925231934,
      "step": 2539
    },
    {
      "epoch": 0.66,
      "grad_norm": 48.098262786865234,
      "kl": 0.0,
      "learning_rate": 1.676262758440199e-07,
      "logps/chosen": -190.1537628173828,
      "logps/rejected": -293.48931884765625,
      "loss": 0.2169,
      "rewards/chosen": 0.08483177423477173,
      "rewards/margins": 3.067207098007202,
      "rewards/rejected": -2.982375383377075,
      "step": 2540
    },
    {
      "epoch": 0.67,
      "grad_norm": 35.13059997558594,
      "kl": 0.0,
      "learning_rate": 1.6749542004710808e-07,
      "logps/chosen": -190.3247833251953,
      "logps/rejected": -254.23439025878906,
      "loss": 0.2753,
      "rewards/chosen": 0.970902144908905,
      "rewards/margins": 4.8270134925842285,
      "rewards/rejected": -3.8561112880706787,
      "step": 2541
    },
    {
      "epoch": 0.67,
      "grad_norm": 27.28483009338379,
      "kl": 0.0,
      "learning_rate": 1.6736456425019627e-07,
      "logps/chosen": -222.0068817138672,
      "logps/rejected": -290.8199462890625,
      "loss": 0.1891,
      "rewards/chosen": 1.633012056350708,
      "rewards/margins": 6.94444465637207,
      "rewards/rejected": -5.311432838439941,
      "step": 2542
    },
    {
      "epoch": 0.67,
      "grad_norm": 39.392120361328125,
      "kl": 0.0,
      "learning_rate": 1.6723370845328447e-07,
      "logps/chosen": -203.91192626953125,
      "logps/rejected": -183.90980529785156,
      "loss": 0.2754,
      "rewards/chosen": 2.2773795127868652,
      "rewards/margins": 5.241265773773193,
      "rewards/rejected": -2.963886260986328,
      "step": 2543
    },
    {
      "epoch": 0.67,
      "grad_norm": 28.039636611938477,
      "kl": 0.0,
      "learning_rate": 1.6710285265637266e-07,
      "logps/chosen": -313.6973571777344,
      "logps/rejected": -302.34881591796875,
      "loss": 0.0985,
      "rewards/chosen": 1.6423184871673584,
      "rewards/margins": 5.527065753936768,
      "rewards/rejected": -3.884747266769409,
      "step": 2544
    },
    {
      "epoch": 0.67,
      "grad_norm": 43.43989944458008,
      "kl": 0.0,
      "learning_rate": 1.6697199685946086e-07,
      "logps/chosen": -250.87657165527344,
      "logps/rejected": -170.1529541015625,
      "loss": 0.2919,
      "rewards/chosen": -0.765656590461731,
      "rewards/margins": 2.5168333053588867,
      "rewards/rejected": -3.282489776611328,
      "step": 2545
    },
    {
      "epoch": 0.67,
      "grad_norm": 26.912771224975586,
      "kl": 0.0,
      "learning_rate": 1.6684114106254906e-07,
      "logps/chosen": -274.5841064453125,
      "logps/rejected": -181.53070068359375,
      "loss": 0.3491,
      "rewards/chosen": 1.5585336685180664,
      "rewards/margins": 3.7967350482940674,
      "rewards/rejected": -2.238201379776001,
      "step": 2546
    },
    {
      "epoch": 0.67,
      "grad_norm": 42.92966842651367,
      "kl": 0.0,
      "learning_rate": 1.6671028526563728e-07,
      "logps/chosen": -223.40972900390625,
      "logps/rejected": -178.333984375,
      "loss": 0.364,
      "rewards/chosen": 0.08250631392002106,
      "rewards/margins": 1.972010850906372,
      "rewards/rejected": -1.8895045518875122,
      "step": 2547
    },
    {
      "epoch": 0.67,
      "grad_norm": 34.5250129699707,
      "kl": 0.0,
      "learning_rate": 1.6657942946872547e-07,
      "logps/chosen": -157.47816467285156,
      "logps/rejected": -239.72552490234375,
      "loss": 0.2466,
      "rewards/chosen": 1.272930383682251,
      "rewards/margins": 4.885026931762695,
      "rewards/rejected": -3.6120967864990234,
      "step": 2548
    },
    {
      "epoch": 0.67,
      "grad_norm": 32.43971633911133,
      "kl": 0.0,
      "learning_rate": 1.6644857367181364e-07,
      "logps/chosen": -226.4892578125,
      "logps/rejected": -253.7250213623047,
      "loss": 0.1761,
      "rewards/chosen": 1.7298102378845215,
      "rewards/margins": 5.25880765914917,
      "rewards/rejected": -3.5289974212646484,
      "step": 2549
    },
    {
      "epoch": 0.67,
      "grad_norm": 24.741518020629883,
      "kl": 0.0,
      "learning_rate": 1.6631771787490184e-07,
      "logps/chosen": -209.94854736328125,
      "logps/rejected": -198.7462615966797,
      "loss": 0.2187,
      "rewards/chosen": 1.6599093675613403,
      "rewards/margins": 4.376770973205566,
      "rewards/rejected": -2.7168617248535156,
      "step": 2550
    },
    {
      "epoch": 0.67,
      "grad_norm": 33.30052185058594,
      "kl": 0.0,
      "learning_rate": 1.6618686207799004e-07,
      "logps/chosen": -180.7981414794922,
      "logps/rejected": -295.49761962890625,
      "loss": 0.2962,
      "rewards/chosen": 0.1354203224182129,
      "rewards/margins": 4.54762601852417,
      "rewards/rejected": -4.412205696105957,
      "step": 2551
    },
    {
      "epoch": 0.67,
      "grad_norm": 27.4842472076416,
      "kl": 0.0,
      "learning_rate": 1.6605600628107823e-07,
      "logps/chosen": -331.968505859375,
      "logps/rejected": -256.1702575683594,
      "loss": 0.2373,
      "rewards/chosen": -2.0421226024627686,
      "rewards/margins": 1.523503065109253,
      "rewards/rejected": -3.5656256675720215,
      "step": 2552
    },
    {
      "epoch": 0.67,
      "grad_norm": 40.02251434326172,
      "kl": 0.0,
      "learning_rate": 1.6592515048416645e-07,
      "logps/chosen": -176.6609344482422,
      "logps/rejected": -343.7615966796875,
      "loss": 0.1817,
      "rewards/chosen": 1.9425089359283447,
      "rewards/margins": 6.32541561126709,
      "rewards/rejected": -4.382906913757324,
      "step": 2553
    },
    {
      "epoch": 0.67,
      "grad_norm": 33.022159576416016,
      "kl": 0.0,
      "learning_rate": 1.6579429468725465e-07,
      "logps/chosen": -184.307861328125,
      "logps/rejected": -344.2357482910156,
      "loss": 0.1872,
      "rewards/chosen": 0.2234984189271927,
      "rewards/margins": 4.028815746307373,
      "rewards/rejected": -3.8053174018859863,
      "step": 2554
    },
    {
      "epoch": 0.67,
      "grad_norm": 39.761287689208984,
      "kl": 0.0,
      "learning_rate": 1.6566343889034285e-07,
      "logps/chosen": -253.70285034179688,
      "logps/rejected": -258.4842834472656,
      "loss": 0.3053,
      "rewards/chosen": -0.629117488861084,
      "rewards/margins": 2.948265552520752,
      "rewards/rejected": -3.577383041381836,
      "step": 2555
    },
    {
      "epoch": 0.67,
      "grad_norm": 35.11906814575195,
      "kl": 0.0,
      "learning_rate": 1.6553258309343104e-07,
      "logps/chosen": -160.90133666992188,
      "logps/rejected": -142.0111846923828,
      "loss": 0.2437,
      "rewards/chosen": 2.033764123916626,
      "rewards/margins": 4.612457275390625,
      "rewards/rejected": -2.578693389892578,
      "step": 2556
    },
    {
      "epoch": 0.67,
      "grad_norm": 28.91179847717285,
      "kl": 0.0,
      "learning_rate": 1.654017272965192e-07,
      "logps/chosen": -211.7546844482422,
      "logps/rejected": -228.7609100341797,
      "loss": 0.2662,
      "rewards/chosen": 1.1779052019119263,
      "rewards/margins": 4.679661750793457,
      "rewards/rejected": -3.5017566680908203,
      "step": 2557
    },
    {
      "epoch": 0.67,
      "grad_norm": 25.67728042602539,
      "kl": 0.0,
      "learning_rate": 1.652708714996074e-07,
      "logps/chosen": -167.1860809326172,
      "logps/rejected": -192.19161987304688,
      "loss": 0.2026,
      "rewards/chosen": 1.4385385513305664,
      "rewards/margins": 5.501163959503174,
      "rewards/rejected": -4.062625408172607,
      "step": 2558
    },
    {
      "epoch": 0.67,
      "grad_norm": 37.68383026123047,
      "kl": 0.0,
      "learning_rate": 1.651400157026956e-07,
      "logps/chosen": -148.10342407226562,
      "logps/rejected": -275.0115966796875,
      "loss": 0.2775,
      "rewards/chosen": 0.9385942220687866,
      "rewards/margins": 5.033987522125244,
      "rewards/rejected": -4.095393180847168,
      "step": 2559
    },
    {
      "epoch": 0.67,
      "grad_norm": 26.639482498168945,
      "kl": 0.0,
      "learning_rate": 1.6500915990578383e-07,
      "logps/chosen": -196.66302490234375,
      "logps/rejected": -277.0069274902344,
      "loss": 0.2187,
      "rewards/chosen": 2.4432859420776367,
      "rewards/margins": 7.975201606750488,
      "rewards/rejected": -5.531915664672852,
      "step": 2560
    },
    {
      "epoch": 0.67,
      "grad_norm": 32.87889099121094,
      "kl": 0.0,
      "learning_rate": 1.6487830410887202e-07,
      "logps/chosen": -273.00091552734375,
      "logps/rejected": -261.990478515625,
      "loss": 0.3054,
      "rewards/chosen": 0.26404502987861633,
      "rewards/margins": 5.634684085845947,
      "rewards/rejected": -5.370638847351074,
      "step": 2561
    },
    {
      "epoch": 0.67,
      "grad_norm": 31.613813400268555,
      "kl": 0.0,
      "learning_rate": 1.6474744831196022e-07,
      "logps/chosen": -134.63540649414062,
      "logps/rejected": -189.412353515625,
      "loss": 0.3389,
      "rewards/chosen": 0.022171571850776672,
      "rewards/margins": 4.27623987197876,
      "rewards/rejected": -4.254068374633789,
      "step": 2562
    },
    {
      "epoch": 0.67,
      "grad_norm": 29.528295516967773,
      "kl": 0.0,
      "learning_rate": 1.6461659251504842e-07,
      "logps/chosen": -224.38973999023438,
      "logps/rejected": -231.9743194580078,
      "loss": 0.1835,
      "rewards/chosen": 1.7728891372680664,
      "rewards/margins": 7.36733341217041,
      "rewards/rejected": -5.594444274902344,
      "step": 2563
    },
    {
      "epoch": 0.67,
      "grad_norm": 38.98188781738281,
      "kl": 0.0,
      "learning_rate": 1.644857367181366e-07,
      "logps/chosen": -207.80917358398438,
      "logps/rejected": -355.8811340332031,
      "loss": 0.2485,
      "rewards/chosen": 1.5398192405700684,
      "rewards/margins": 4.257233142852783,
      "rewards/rejected": -2.717413902282715,
      "step": 2564
    },
    {
      "epoch": 0.67,
      "grad_norm": 33.208133697509766,
      "kl": 0.0,
      "learning_rate": 1.6435488092122478e-07,
      "logps/chosen": -210.67318725585938,
      "logps/rejected": -217.0673370361328,
      "loss": 0.2497,
      "rewards/chosen": 2.5629193782806396,
      "rewards/margins": 5.6290388107299805,
      "rewards/rejected": -3.06611967086792,
      "step": 2565
    },
    {
      "epoch": 0.67,
      "grad_norm": 32.00192642211914,
      "kl": 0.0,
      "learning_rate": 1.64224025124313e-07,
      "logps/chosen": -335.8350524902344,
      "logps/rejected": -221.9557647705078,
      "loss": 0.1103,
      "rewards/chosen": -1.1904784440994263,
      "rewards/margins": 3.179619789123535,
      "rewards/rejected": -4.370098114013672,
      "step": 2566
    },
    {
      "epoch": 0.67,
      "grad_norm": 39.68660354614258,
      "kl": 0.0,
      "learning_rate": 1.640931693274012e-07,
      "logps/chosen": -246.44900512695312,
      "logps/rejected": -313.3238830566406,
      "loss": 0.189,
      "rewards/chosen": 2.0520215034484863,
      "rewards/margins": 6.506097793579102,
      "rewards/rejected": -4.454076290130615,
      "step": 2567
    },
    {
      "epoch": 0.67,
      "grad_norm": 31.22844886779785,
      "kl": 0.0,
      "learning_rate": 1.639623135304894e-07,
      "logps/chosen": -166.51016235351562,
      "logps/rejected": -294.8640441894531,
      "loss": 0.1643,
      "rewards/chosen": 0.9799101948738098,
      "rewards/margins": 5.509171485900879,
      "rewards/rejected": -4.529261112213135,
      "step": 2568
    },
    {
      "epoch": 0.67,
      "grad_norm": 26.876453399658203,
      "kl": 0.0,
      "learning_rate": 1.638314577335776e-07,
      "logps/chosen": -216.63499450683594,
      "logps/rejected": -244.08470153808594,
      "loss": 0.2215,
      "rewards/chosen": -0.8786706924438477,
      "rewards/margins": 2.4429192543029785,
      "rewards/rejected": -3.321589946746826,
      "step": 2569
    },
    {
      "epoch": 0.67,
      "grad_norm": 28.603649139404297,
      "kl": 0.0,
      "learning_rate": 1.637006019366658e-07,
      "logps/chosen": -251.87803649902344,
      "logps/rejected": -275.10626220703125,
      "loss": 0.2048,
      "rewards/chosen": 3.599595785140991,
      "rewards/margins": 9.818099021911621,
      "rewards/rejected": -6.218502998352051,
      "step": 2570
    },
    {
      "epoch": 0.67,
      "grad_norm": 34.09560012817383,
      "kl": 0.0,
      "learning_rate": 1.6356974613975398e-07,
      "logps/chosen": -141.74624633789062,
      "logps/rejected": -207.2333984375,
      "loss": 0.2704,
      "rewards/chosen": -0.39835241436958313,
      "rewards/margins": 3.2094037532806396,
      "rewards/rejected": -3.6077561378479004,
      "step": 2571
    },
    {
      "epoch": 0.67,
      "grad_norm": 31.071380615234375,
      "kl": 0.0,
      "learning_rate": 1.6343889034284218e-07,
      "logps/chosen": -215.2388153076172,
      "logps/rejected": -381.1431579589844,
      "loss": 0.2196,
      "rewards/chosen": 0.882358729839325,
      "rewards/margins": 7.919827938079834,
      "rewards/rejected": -7.037469387054443,
      "step": 2572
    },
    {
      "epoch": 0.67,
      "grad_norm": 36.05377960205078,
      "kl": 0.0,
      "learning_rate": 1.6330803454593038e-07,
      "logps/chosen": -234.13755798339844,
      "logps/rejected": -321.06182861328125,
      "loss": 0.3183,
      "rewards/chosen": 0.21939189732074738,
      "rewards/margins": 3.5455498695373535,
      "rewards/rejected": -3.326158046722412,
      "step": 2573
    },
    {
      "epoch": 0.67,
      "grad_norm": 49.232383728027344,
      "kl": 0.0,
      "learning_rate": 1.6317717874901857e-07,
      "logps/chosen": -191.3748779296875,
      "logps/rejected": -362.3619384765625,
      "loss": 0.3358,
      "rewards/chosen": -0.1097564697265625,
      "rewards/margins": 3.5779199600219727,
      "rewards/rejected": -3.687676429748535,
      "step": 2574
    },
    {
      "epoch": 0.67,
      "grad_norm": 41.3016242980957,
      "kl": 0.0,
      "learning_rate": 1.6304632295210677e-07,
      "logps/chosen": -172.50607299804688,
      "logps/rejected": -286.0248718261719,
      "loss": 0.2457,
      "rewards/chosen": 1.2075005769729614,
      "rewards/margins": 4.418184757232666,
      "rewards/rejected": -3.210684299468994,
      "step": 2575
    },
    {
      "epoch": 0.67,
      "grad_norm": 37.03321838378906,
      "kl": 0.0,
      "learning_rate": 1.6291546715519496e-07,
      "logps/chosen": -270.3417053222656,
      "logps/rejected": -413.8136291503906,
      "loss": 0.23,
      "rewards/chosen": 0.908319890499115,
      "rewards/margins": 5.533775329589844,
      "rewards/rejected": -4.625455379486084,
      "step": 2576
    },
    {
      "epoch": 0.67,
      "grad_norm": 34.594757080078125,
      "kl": 0.0,
      "learning_rate": 1.6278461135828316e-07,
      "logps/chosen": -163.30946350097656,
      "logps/rejected": -162.75482177734375,
      "loss": 0.291,
      "rewards/chosen": 0.06622552871704102,
      "rewards/margins": 2.4234416484832764,
      "rewards/rejected": -2.3572161197662354,
      "step": 2577
    },
    {
      "epoch": 0.67,
      "grad_norm": 48.532493591308594,
      "kl": 0.0,
      "learning_rate": 1.6265375556137136e-07,
      "logps/chosen": -275.835205078125,
      "logps/rejected": -230.15435791015625,
      "loss": 0.1816,
      "rewards/chosen": 2.8731188774108887,
      "rewards/margins": 6.611090660095215,
      "rewards/rejected": -3.737971544265747,
      "step": 2578
    },
    {
      "epoch": 0.67,
      "grad_norm": 33.894386291503906,
      "kl": 0.0,
      "learning_rate": 1.6252289976445958e-07,
      "logps/chosen": -237.61642456054688,
      "logps/rejected": -250.01815795898438,
      "loss": 0.1913,
      "rewards/chosen": 0.8094295859336853,
      "rewards/margins": 5.128325939178467,
      "rewards/rejected": -4.318896293640137,
      "step": 2579
    },
    {
      "epoch": 0.68,
      "grad_norm": 27.747053146362305,
      "kl": 0.0,
      "learning_rate": 1.6239204396754775e-07,
      "logps/chosen": -268.0554504394531,
      "logps/rejected": -167.73800659179688,
      "loss": 0.2013,
      "rewards/chosen": 1.5180522203445435,
      "rewards/margins": 5.381387710571289,
      "rewards/rejected": -3.863335371017456,
      "step": 2580
    },
    {
      "epoch": 0.68,
      "grad_norm": 37.375667572021484,
      "kl": 0.0,
      "learning_rate": 1.6226118817063595e-07,
      "logps/chosen": -260.16192626953125,
      "logps/rejected": -177.43836975097656,
      "loss": 0.2694,
      "rewards/chosen": 1.000507116317749,
      "rewards/margins": 3.943662166595459,
      "rewards/rejected": -2.94315505027771,
      "step": 2581
    },
    {
      "epoch": 0.68,
      "grad_norm": 30.776037216186523,
      "kl": 0.0,
      "learning_rate": 1.6213033237372414e-07,
      "logps/chosen": -226.7903289794922,
      "logps/rejected": -201.4454345703125,
      "loss": 0.1559,
      "rewards/chosen": 3.192276954650879,
      "rewards/margins": 6.419692039489746,
      "rewards/rejected": -3.227415084838867,
      "step": 2582
    },
    {
      "epoch": 0.68,
      "grad_norm": 25.35350227355957,
      "kl": 0.0,
      "learning_rate": 1.6199947657681234e-07,
      "logps/chosen": -237.74191284179688,
      "logps/rejected": -241.92535400390625,
      "loss": 0.2614,
      "rewards/chosen": 1.408836007118225,
      "rewards/margins": 4.510784149169922,
      "rewards/rejected": -3.1019480228424072,
      "step": 2583
    },
    {
      "epoch": 0.68,
      "grad_norm": 38.679447174072266,
      "kl": 0.0,
      "learning_rate": 1.6186862077990053e-07,
      "logps/chosen": -168.99832153320312,
      "logps/rejected": -262.581787109375,
      "loss": 0.2806,
      "rewards/chosen": 2.509021759033203,
      "rewards/margins": 6.563523292541504,
      "rewards/rejected": -4.054501533508301,
      "step": 2584
    },
    {
      "epoch": 0.68,
      "grad_norm": 36.33589553833008,
      "kl": 0.0,
      "learning_rate": 1.6173776498298876e-07,
      "logps/chosen": -243.29920959472656,
      "logps/rejected": -210.0607452392578,
      "loss": 0.2599,
      "rewards/chosen": 3.176084518432617,
      "rewards/margins": 6.009356498718262,
      "rewards/rejected": -2.8332722187042236,
      "step": 2585
    },
    {
      "epoch": 0.68,
      "grad_norm": 34.518402099609375,
      "kl": 0.0,
      "learning_rate": 1.6160690918607695e-07,
      "logps/chosen": -144.8197021484375,
      "logps/rejected": -247.53363037109375,
      "loss": 0.2127,
      "rewards/chosen": 1.2055890560150146,
      "rewards/margins": 5.221860885620117,
      "rewards/rejected": -4.016272068023682,
      "step": 2586
    },
    {
      "epoch": 0.68,
      "grad_norm": 31.37455177307129,
      "kl": 0.0,
      "learning_rate": 1.6147605338916515e-07,
      "logps/chosen": -194.68289184570312,
      "logps/rejected": -296.93902587890625,
      "loss": 0.2881,
      "rewards/chosen": 0.40831291675567627,
      "rewards/margins": 4.4625091552734375,
      "rewards/rejected": -4.054196357727051,
      "step": 2587
    },
    {
      "epoch": 0.68,
      "grad_norm": 36.10368347167969,
      "kl": 0.0,
      "learning_rate": 1.6134519759225332e-07,
      "logps/chosen": -248.6439208984375,
      "logps/rejected": -207.64151000976562,
      "loss": 0.2397,
      "rewards/chosen": 0.784537136554718,
      "rewards/margins": 3.2910349369049072,
      "rewards/rejected": -2.506497859954834,
      "step": 2588
    },
    {
      "epoch": 0.68,
      "grad_norm": 27.241235733032227,
      "kl": 0.0,
      "learning_rate": 1.6121434179534151e-07,
      "logps/chosen": -140.28517150878906,
      "logps/rejected": -276.34014892578125,
      "loss": 0.1638,
      "rewards/chosen": 2.030177354812622,
      "rewards/margins": 5.352236747741699,
      "rewards/rejected": -3.322059392929077,
      "step": 2589
    },
    {
      "epoch": 0.68,
      "grad_norm": 35.7438850402832,
      "kl": 0.0,
      "learning_rate": 1.610834859984297e-07,
      "logps/chosen": -234.2539520263672,
      "logps/rejected": -221.09268188476562,
      "loss": 0.2338,
      "rewards/chosen": 1.7499068975448608,
      "rewards/margins": 4.797221660614014,
      "rewards/rejected": -3.0473148822784424,
      "step": 2590
    },
    {
      "epoch": 0.68,
      "grad_norm": 22.870019912719727,
      "kl": 0.0,
      "learning_rate": 1.609526302015179e-07,
      "logps/chosen": -158.86672973632812,
      "logps/rejected": -288.99896240234375,
      "loss": 0.2452,
      "rewards/chosen": 0.37331318855285645,
      "rewards/margins": 5.675410270690918,
      "rewards/rejected": -5.302096843719482,
      "step": 2591
    },
    {
      "epoch": 0.68,
      "grad_norm": 40.48781204223633,
      "kl": 0.0,
      "learning_rate": 1.6082177440460613e-07,
      "logps/chosen": -237.75875854492188,
      "logps/rejected": -227.9136199951172,
      "loss": 0.1677,
      "rewards/chosen": 1.2197824716567993,
      "rewards/margins": 4.607675075531006,
      "rewards/rejected": -3.387892723083496,
      "step": 2592
    },
    {
      "epoch": 0.68,
      "grad_norm": 25.184553146362305,
      "kl": 0.0,
      "learning_rate": 1.6069091860769432e-07,
      "logps/chosen": -175.31613159179688,
      "logps/rejected": -275.30609130859375,
      "loss": 0.2006,
      "rewards/chosen": 0.3677568733692169,
      "rewards/margins": 5.3581461906433105,
      "rewards/rejected": -4.990389347076416,
      "step": 2593
    },
    {
      "epoch": 0.68,
      "grad_norm": 41.90897750854492,
      "kl": 0.0,
      "learning_rate": 1.6056006281078252e-07,
      "logps/chosen": -166.27017211914062,
      "logps/rejected": -291.6557922363281,
      "loss": 0.2619,
      "rewards/chosen": 1.7553913593292236,
      "rewards/margins": 5.372653961181641,
      "rewards/rejected": -3.617262363433838,
      "step": 2594
    },
    {
      "epoch": 0.68,
      "grad_norm": 23.900545120239258,
      "kl": 0.0,
      "learning_rate": 1.6042920701387072e-07,
      "logps/chosen": -310.2008972167969,
      "logps/rejected": -202.36691284179688,
      "loss": 0.2446,
      "rewards/chosen": -0.24510131776332855,
      "rewards/margins": 4.654469013214111,
      "rewards/rejected": -4.899570465087891,
      "step": 2595
    },
    {
      "epoch": 0.68,
      "grad_norm": 37.62279510498047,
      "kl": 0.0,
      "learning_rate": 1.6029835121695889e-07,
      "logps/chosen": -155.13211059570312,
      "logps/rejected": -285.37933349609375,
      "loss": 0.2217,
      "rewards/chosen": 1.5080240964889526,
      "rewards/margins": 5.727861404418945,
      "rewards/rejected": -4.219837188720703,
      "step": 2596
    },
    {
      "epoch": 0.68,
      "grad_norm": 36.81294250488281,
      "kl": 0.0,
      "learning_rate": 1.6016749542004708e-07,
      "logps/chosen": -228.24301147460938,
      "logps/rejected": -280.6625061035156,
      "loss": 0.1871,
      "rewards/chosen": 1.3690900802612305,
      "rewards/margins": 4.551156520843506,
      "rewards/rejected": -3.1820664405822754,
      "step": 2597
    },
    {
      "epoch": 0.68,
      "grad_norm": 39.81789779663086,
      "kl": 0.0,
      "learning_rate": 1.600366396231353e-07,
      "logps/chosen": -219.61534118652344,
      "logps/rejected": -181.93167114257812,
      "loss": 0.2771,
      "rewards/chosen": 1.7412033081054688,
      "rewards/margins": 5.313830375671387,
      "rewards/rejected": -3.572626829147339,
      "step": 2598
    },
    {
      "epoch": 0.68,
      "grad_norm": 32.26911926269531,
      "kl": 0.0,
      "learning_rate": 1.599057838262235e-07,
      "logps/chosen": -163.3314971923828,
      "logps/rejected": -243.11912536621094,
      "loss": 0.2877,
      "rewards/chosen": -0.1219571903347969,
      "rewards/margins": 3.6402747631073,
      "rewards/rejected": -3.7622320652008057,
      "step": 2599
    },
    {
      "epoch": 0.68,
      "grad_norm": 37.65869140625,
      "kl": 0.0,
      "learning_rate": 1.597749280293117e-07,
      "logps/chosen": -149.3471221923828,
      "logps/rejected": -297.2940673828125,
      "loss": 0.288,
      "rewards/chosen": 0.8982479572296143,
      "rewards/margins": 7.234429359436035,
      "rewards/rejected": -6.336181640625,
      "step": 2600
    },
    {
      "epoch": 0.68,
      "grad_norm": 31.088722229003906,
      "kl": 0.0,
      "learning_rate": 1.596440722323999e-07,
      "logps/chosen": -233.47364807128906,
      "logps/rejected": -185.42413330078125,
      "loss": 0.1748,
      "rewards/chosen": 2.3092989921569824,
      "rewards/margins": 6.058829307556152,
      "rewards/rejected": -3.749530076980591,
      "step": 2601
    },
    {
      "epoch": 0.68,
      "grad_norm": 20.580787658691406,
      "kl": 0.0,
      "learning_rate": 1.595132164354881e-07,
      "logps/chosen": -184.43515014648438,
      "logps/rejected": -319.0971984863281,
      "loss": 0.208,
      "rewards/chosen": 0.845647931098938,
      "rewards/margins": 9.530956268310547,
      "rewards/rejected": -8.685308456420898,
      "step": 2602
    },
    {
      "epoch": 0.68,
      "grad_norm": 36.49890899658203,
      "kl": 0.0,
      "learning_rate": 1.5938236063857626e-07,
      "logps/chosen": -248.2190704345703,
      "logps/rejected": -357.3635559082031,
      "loss": 0.1562,
      "rewards/chosen": 2.4234509468078613,
      "rewards/margins": 6.921143054962158,
      "rewards/rejected": -4.497692108154297,
      "step": 2603
    },
    {
      "epoch": 0.68,
      "grad_norm": 47.470741271972656,
      "kl": 0.0,
      "learning_rate": 1.5925150484166448e-07,
      "logps/chosen": -169.32373046875,
      "logps/rejected": -272.4686584472656,
      "loss": 0.3708,
      "rewards/chosen": -0.13372045755386353,
      "rewards/margins": 4.087759017944336,
      "rewards/rejected": -4.221479415893555,
      "step": 2604
    },
    {
      "epoch": 0.68,
      "grad_norm": 29.60095977783203,
      "kl": 0.0,
      "learning_rate": 1.5912064904475268e-07,
      "logps/chosen": -144.54171752929688,
      "logps/rejected": -233.62033081054688,
      "loss": 0.2275,
      "rewards/chosen": 0.8344153165817261,
      "rewards/margins": 4.808291435241699,
      "rewards/rejected": -3.9738759994506836,
      "step": 2605
    },
    {
      "epoch": 0.68,
      "grad_norm": 20.64869499206543,
      "kl": 0.0,
      "learning_rate": 1.5898979324784087e-07,
      "logps/chosen": -214.33297729492188,
      "logps/rejected": -262.1623229980469,
      "loss": 0.1705,
      "rewards/chosen": 1.7195442914962769,
      "rewards/margins": 5.088170528411865,
      "rewards/rejected": -3.368626117706299,
      "step": 2606
    },
    {
      "epoch": 0.68,
      "grad_norm": 30.235107421875,
      "kl": 0.0,
      "learning_rate": 1.5885893745092907e-07,
      "logps/chosen": -168.021484375,
      "logps/rejected": -231.19189453125,
      "loss": 0.372,
      "rewards/chosen": -0.15469840168952942,
      "rewards/margins": 3.0709803104400635,
      "rewards/rejected": -3.2256786823272705,
      "step": 2607
    },
    {
      "epoch": 0.68,
      "grad_norm": 34.14283752441406,
      "kl": 0.0,
      "learning_rate": 1.5872808165401727e-07,
      "logps/chosen": -210.44888305664062,
      "logps/rejected": -228.640380859375,
      "loss": 0.3293,
      "rewards/chosen": -0.6430841684341431,
      "rewards/margins": 2.8537302017211914,
      "rewards/rejected": -3.496814250946045,
      "step": 2608
    },
    {
      "epoch": 0.68,
      "grad_norm": 40.17918395996094,
      "kl": 0.0,
      "learning_rate": 1.5859722585710546e-07,
      "logps/chosen": -175.11221313476562,
      "logps/rejected": -246.70181274414062,
      "loss": 0.3029,
      "rewards/chosen": 1.0205662250518799,
      "rewards/margins": 5.190764427185059,
      "rewards/rejected": -4.1701979637146,
      "step": 2609
    },
    {
      "epoch": 0.68,
      "grad_norm": 41.95241928100586,
      "kl": 0.0,
      "learning_rate": 1.5846637006019366e-07,
      "logps/chosen": -269.70556640625,
      "logps/rejected": -218.40017700195312,
      "loss": 0.2962,
      "rewards/chosen": -1.2158753871917725,
      "rewards/margins": 1.4317800998687744,
      "rewards/rejected": -2.647655487060547,
      "step": 2610
    },
    {
      "epoch": 0.68,
      "grad_norm": 32.46884536743164,
      "kl": 0.0,
      "learning_rate": 1.5833551426328185e-07,
      "logps/chosen": -240.30992126464844,
      "logps/rejected": -278.72393798828125,
      "loss": 0.2791,
      "rewards/chosen": 1.6434956789016724,
      "rewards/margins": 5.217041492462158,
      "rewards/rejected": -3.5735456943511963,
      "step": 2611
    },
    {
      "epoch": 0.68,
      "grad_norm": 29.526588439941406,
      "kl": 0.0,
      "learning_rate": 1.5820465846637005e-07,
      "logps/chosen": -217.5753631591797,
      "logps/rejected": -239.61083984375,
      "loss": 0.2835,
      "rewards/chosen": 1.0819295644760132,
      "rewards/margins": 4.064085006713867,
      "rewards/rejected": -2.9821553230285645,
      "step": 2612
    },
    {
      "epoch": 0.68,
      "grad_norm": 36.759010314941406,
      "kl": 0.0,
      "learning_rate": 1.5807380266945825e-07,
      "logps/chosen": -196.2762908935547,
      "logps/rejected": -238.6885528564453,
      "loss": 0.1709,
      "rewards/chosen": 0.7026401162147522,
      "rewards/margins": 5.314055442810059,
      "rewards/rejected": -4.611415386199951,
      "step": 2613
    },
    {
      "epoch": 0.68,
      "grad_norm": 32.18585205078125,
      "kl": 0.0,
      "learning_rate": 1.5794294687254644e-07,
      "logps/chosen": -212.0025634765625,
      "logps/rejected": -346.5420227050781,
      "loss": 0.1871,
      "rewards/chosen": 1.7895128726959229,
      "rewards/margins": 6.3749494552612305,
      "rewards/rejected": -4.5854363441467285,
      "step": 2614
    },
    {
      "epoch": 0.68,
      "grad_norm": 29.172779083251953,
      "kl": 0.0,
      "learning_rate": 1.5781209107563464e-07,
      "logps/chosen": -184.76547241210938,
      "logps/rejected": -235.42982482910156,
      "loss": 0.0766,
      "rewards/chosen": 1.806889533996582,
      "rewards/margins": 5.578388690948486,
      "rewards/rejected": -3.7714991569519043,
      "step": 2615
    },
    {
      "epoch": 0.68,
      "grad_norm": 39.95838928222656,
      "kl": 0.0,
      "learning_rate": 1.5768123527872283e-07,
      "logps/chosen": -181.52728271484375,
      "logps/rejected": -280.96826171875,
      "loss": 0.2783,
      "rewards/chosen": 0.32978546619415283,
      "rewards/margins": 6.202417850494385,
      "rewards/rejected": -5.8726325035095215,
      "step": 2616
    },
    {
      "epoch": 0.68,
      "grad_norm": 30.65798568725586,
      "kl": 0.0,
      "learning_rate": 1.5755037948181106e-07,
      "logps/chosen": -198.12869262695312,
      "logps/rejected": -301.61383056640625,
      "loss": 0.1814,
      "rewards/chosen": 1.700964331626892,
      "rewards/margins": 7.295632839202881,
      "rewards/rejected": -5.594668388366699,
      "step": 2617
    },
    {
      "epoch": 0.69,
      "grad_norm": 32.177223205566406,
      "kl": 0.0,
      "learning_rate": 1.5741952368489925e-07,
      "logps/chosen": -161.6401824951172,
      "logps/rejected": -255.524658203125,
      "loss": 0.2605,
      "rewards/chosen": 1.3057931661605835,
      "rewards/margins": 6.553887844085693,
      "rewards/rejected": -5.24809455871582,
      "step": 2618
    },
    {
      "epoch": 0.69,
      "grad_norm": 29.845308303833008,
      "kl": 0.0,
      "learning_rate": 1.5728866788798742e-07,
      "logps/chosen": -239.44155883789062,
      "logps/rejected": -179.19857788085938,
      "loss": 0.2494,
      "rewards/chosen": 0.5105099678039551,
      "rewards/margins": 4.343420028686523,
      "rewards/rejected": -3.8329102993011475,
      "step": 2619
    },
    {
      "epoch": 0.69,
      "grad_norm": 34.138816833496094,
      "kl": 0.0,
      "learning_rate": 1.5715781209107562e-07,
      "logps/chosen": -133.4228515625,
      "logps/rejected": -297.18719482421875,
      "loss": 0.2533,
      "rewards/chosen": 0.6324558258056641,
      "rewards/margins": 5.259095668792725,
      "rewards/rejected": -4.6266398429870605,
      "step": 2620
    },
    {
      "epoch": 0.69,
      "grad_norm": 35.212257385253906,
      "kl": 0.0,
      "learning_rate": 1.5702695629416382e-07,
      "logps/chosen": -200.92498779296875,
      "logps/rejected": -161.45701599121094,
      "loss": 0.276,
      "rewards/chosen": 0.06087028980255127,
      "rewards/margins": 2.053701877593994,
      "rewards/rejected": -1.9928315877914429,
      "step": 2621
    },
    {
      "epoch": 0.69,
      "grad_norm": 53.46183776855469,
      "kl": 0.0,
      "learning_rate": 1.56896100497252e-07,
      "logps/chosen": -210.4583740234375,
      "logps/rejected": -235.5002899169922,
      "loss": 0.2802,
      "rewards/chosen": 2.0013036727905273,
      "rewards/margins": 3.4605746269226074,
      "rewards/rejected": -1.4592708349227905,
      "step": 2622
    },
    {
      "epoch": 0.69,
      "grad_norm": 27.320358276367188,
      "kl": 0.0,
      "learning_rate": 1.5676524470034023e-07,
      "logps/chosen": -238.22268676757812,
      "logps/rejected": -362.32037353515625,
      "loss": 0.1861,
      "rewards/chosen": 0.02560839056968689,
      "rewards/margins": 5.871598720550537,
      "rewards/rejected": -5.845990180969238,
      "step": 2623
    },
    {
      "epoch": 0.69,
      "grad_norm": 44.43696975708008,
      "kl": 0.0,
      "learning_rate": 1.5663438890342843e-07,
      "logps/chosen": -276.4560241699219,
      "logps/rejected": -305.7402648925781,
      "loss": 0.22,
      "rewards/chosen": 2.26029109954834,
      "rewards/margins": 6.064087867736816,
      "rewards/rejected": -3.8037965297698975,
      "step": 2624
    },
    {
      "epoch": 0.69,
      "grad_norm": 23.91146469116211,
      "kl": 0.0,
      "learning_rate": 1.5650353310651663e-07,
      "logps/chosen": -157.71304321289062,
      "logps/rejected": -204.7120361328125,
      "loss": 0.2469,
      "rewards/chosen": -0.3723863363265991,
      "rewards/margins": 3.3025412559509277,
      "rewards/rejected": -3.6749274730682373,
      "step": 2625
    },
    {
      "epoch": 0.69,
      "grad_norm": 26.316261291503906,
      "kl": 0.0,
      "learning_rate": 1.563726773096048e-07,
      "logps/chosen": -206.56991577148438,
      "logps/rejected": -180.08187866210938,
      "loss": 0.3156,
      "rewards/chosen": 0.9054687023162842,
      "rewards/margins": 4.859877586364746,
      "rewards/rejected": -3.954408645629883,
      "step": 2626
    },
    {
      "epoch": 0.69,
      "grad_norm": 37.809608459472656,
      "kl": 0.0,
      "learning_rate": 1.56241821512693e-07,
      "logps/chosen": -208.02249145507812,
      "logps/rejected": -250.53970336914062,
      "loss": 0.3625,
      "rewards/chosen": -0.296889066696167,
      "rewards/margins": 3.3697378635406494,
      "rewards/rejected": -3.6666269302368164,
      "step": 2627
    },
    {
      "epoch": 0.69,
      "grad_norm": 40.240020751953125,
      "kl": 0.0,
      "learning_rate": 1.561109657157812e-07,
      "logps/chosen": -247.34298706054688,
      "logps/rejected": -228.52392578125,
      "loss": 0.355,
      "rewards/chosen": -0.22601445019245148,
      "rewards/margins": 2.2114017009735107,
      "rewards/rejected": -2.4374160766601562,
      "step": 2628
    },
    {
      "epoch": 0.69,
      "grad_norm": 43.382259368896484,
      "kl": 0.0,
      "learning_rate": 1.5598010991886938e-07,
      "logps/chosen": -261.98870849609375,
      "logps/rejected": -233.58889770507812,
      "loss": 0.3239,
      "rewards/chosen": -0.3544365167617798,
      "rewards/margins": 3.8448009490966797,
      "rewards/rejected": -4.19923734664917,
      "step": 2629
    },
    {
      "epoch": 0.69,
      "grad_norm": 28.072172164916992,
      "kl": 0.0,
      "learning_rate": 1.558492541219576e-07,
      "logps/chosen": -210.15908813476562,
      "logps/rejected": -259.5641784667969,
      "loss": 0.1628,
      "rewards/chosen": 1.8099766969680786,
      "rewards/margins": 5.892375469207764,
      "rewards/rejected": -4.082398891448975,
      "step": 2630
    },
    {
      "epoch": 0.69,
      "grad_norm": 33.46635818481445,
      "kl": 0.0,
      "learning_rate": 1.557183983250458e-07,
      "logps/chosen": -228.41885375976562,
      "logps/rejected": -255.07586669921875,
      "loss": 0.3827,
      "rewards/chosen": -0.8277987241744995,
      "rewards/margins": 1.9675222635269165,
      "rewards/rejected": -2.795320987701416,
      "step": 2631
    },
    {
      "epoch": 0.69,
      "grad_norm": 31.32046890258789,
      "kl": 0.0,
      "learning_rate": 1.55587542528134e-07,
      "logps/chosen": -147.96084594726562,
      "logps/rejected": -237.70318603515625,
      "loss": 0.2428,
      "rewards/chosen": 1.3135221004486084,
      "rewards/margins": 4.7350921630859375,
      "rewards/rejected": -3.421570301055908,
      "step": 2632
    },
    {
      "epoch": 0.69,
      "grad_norm": 36.38935852050781,
      "kl": 0.0,
      "learning_rate": 1.554566867312222e-07,
      "logps/chosen": -178.5509490966797,
      "logps/rejected": -218.0067596435547,
      "loss": 0.2722,
      "rewards/chosen": 0.268542617559433,
      "rewards/margins": 3.8187830448150635,
      "rewards/rejected": -3.5502405166625977,
      "step": 2633
    },
    {
      "epoch": 0.69,
      "grad_norm": 33.196720123291016,
      "kl": 0.0,
      "learning_rate": 1.5532583093431036e-07,
      "logps/chosen": -196.33807373046875,
      "logps/rejected": -196.2073516845703,
      "loss": 0.2131,
      "rewards/chosen": 1.0521554946899414,
      "rewards/margins": 7.267307281494141,
      "rewards/rejected": -6.215151786804199,
      "step": 2634
    },
    {
      "epoch": 0.69,
      "grad_norm": 36.032859802246094,
      "kl": 0.0,
      "learning_rate": 1.5519497513739856e-07,
      "logps/chosen": -218.05908203125,
      "logps/rejected": -240.1728515625,
      "loss": 0.275,
      "rewards/chosen": 0.8040695786476135,
      "rewards/margins": 5.664813041687012,
      "rewards/rejected": -4.860743522644043,
      "step": 2635
    },
    {
      "epoch": 0.69,
      "grad_norm": 40.46497344970703,
      "kl": 0.0,
      "learning_rate": 1.5506411934048678e-07,
      "logps/chosen": -168.18862915039062,
      "logps/rejected": -292.72064208984375,
      "loss": 0.3087,
      "rewards/chosen": -0.045253679156303406,
      "rewards/margins": 6.116161346435547,
      "rewards/rejected": -6.161415100097656,
      "step": 2636
    },
    {
      "epoch": 0.69,
      "grad_norm": 28.5998477935791,
      "kl": 0.0,
      "learning_rate": 1.5493326354357498e-07,
      "logps/chosen": -244.7726287841797,
      "logps/rejected": -213.48959350585938,
      "loss": 0.222,
      "rewards/chosen": 1.3389453887939453,
      "rewards/margins": 3.876892328262329,
      "rewards/rejected": -2.537946939468384,
      "step": 2637
    },
    {
      "epoch": 0.69,
      "grad_norm": 26.24286460876465,
      "kl": 0.0,
      "learning_rate": 1.5480240774666318e-07,
      "logps/chosen": -385.7197265625,
      "logps/rejected": -240.03048706054688,
      "loss": 0.1706,
      "rewards/chosen": 0.9546328186988831,
      "rewards/margins": 4.774143218994141,
      "rewards/rejected": -3.8195104598999023,
      "step": 2638
    },
    {
      "epoch": 0.69,
      "grad_norm": 31.864839553833008,
      "kl": 0.0,
      "learning_rate": 1.5467155194975137e-07,
      "logps/chosen": -146.63380432128906,
      "logps/rejected": -181.90550231933594,
      "loss": 0.3093,
      "rewards/chosen": 0.41834282875061035,
      "rewards/margins": 3.2443583011627197,
      "rewards/rejected": -2.8260154724121094,
      "step": 2639
    },
    {
      "epoch": 0.69,
      "grad_norm": 38.77482986450195,
      "kl": 0.0,
      "learning_rate": 1.5454069615283957e-07,
      "logps/chosen": -245.33843994140625,
      "logps/rejected": -288.5142822265625,
      "loss": 0.289,
      "rewards/chosen": 0.6637973785400391,
      "rewards/margins": 3.466867446899414,
      "rewards/rejected": -2.803070068359375,
      "step": 2640
    },
    {
      "epoch": 0.69,
      "grad_norm": 36.03498840332031,
      "kl": 0.0,
      "learning_rate": 1.5440984035592776e-07,
      "logps/chosen": -124.09053802490234,
      "logps/rejected": -223.87887573242188,
      "loss": 0.2097,
      "rewards/chosen": 0.37351658940315247,
      "rewards/margins": 4.57509183883667,
      "rewards/rejected": -4.20157527923584,
      "step": 2641
    },
    {
      "epoch": 0.69,
      "grad_norm": 28.98439598083496,
      "kl": 0.0,
      "learning_rate": 1.5427898455901593e-07,
      "logps/chosen": -198.59564208984375,
      "logps/rejected": -307.36444091796875,
      "loss": 0.1569,
      "rewards/chosen": 1.2171781063079834,
      "rewards/margins": 6.403229713439941,
      "rewards/rejected": -5.186051368713379,
      "step": 2642
    },
    {
      "epoch": 0.69,
      "grad_norm": 34.85798645019531,
      "kl": 0.0,
      "learning_rate": 1.5414812876210416e-07,
      "logps/chosen": -274.24334716796875,
      "logps/rejected": -247.5282745361328,
      "loss": 0.1454,
      "rewards/chosen": -0.12386541068553925,
      "rewards/margins": 3.7028324604034424,
      "rewards/rejected": -3.826697826385498,
      "step": 2643
    },
    {
      "epoch": 0.69,
      "grad_norm": 40.29619216918945,
      "kl": 0.0,
      "learning_rate": 1.5401727296519235e-07,
      "logps/chosen": -299.2466125488281,
      "logps/rejected": -177.40074157714844,
      "loss": 0.2911,
      "rewards/chosen": 0.5023276209831238,
      "rewards/margins": 3.8164069652557373,
      "rewards/rejected": -3.3140792846679688,
      "step": 2644
    },
    {
      "epoch": 0.69,
      "grad_norm": 39.41469192504883,
      "kl": 0.0,
      "learning_rate": 1.5388641716828055e-07,
      "logps/chosen": -159.89877319335938,
      "logps/rejected": -367.87188720703125,
      "loss": 0.3222,
      "rewards/chosen": 0.471483051776886,
      "rewards/margins": 4.570820331573486,
      "rewards/rejected": -4.099337100982666,
      "step": 2645
    },
    {
      "epoch": 0.69,
      "grad_norm": 33.267940521240234,
      "kl": 0.0,
      "learning_rate": 1.5375556137136874e-07,
      "logps/chosen": -288.19708251953125,
      "logps/rejected": -221.0328369140625,
      "loss": 0.2419,
      "rewards/chosen": 1.0454717874526978,
      "rewards/margins": 3.774181842803955,
      "rewards/rejected": -2.728710174560547,
      "step": 2646
    },
    {
      "epoch": 0.69,
      "grad_norm": 38.31817626953125,
      "kl": 0.0,
      "learning_rate": 1.5362470557445694e-07,
      "logps/chosen": -248.33908081054688,
      "logps/rejected": -239.07632446289062,
      "loss": 0.2773,
      "rewards/chosen": 0.7482012510299683,
      "rewards/margins": 5.982780456542969,
      "rewards/rejected": -5.234579086303711,
      "step": 2647
    },
    {
      "epoch": 0.69,
      "grad_norm": 51.11371612548828,
      "kl": 0.0,
      "learning_rate": 1.5349384977754514e-07,
      "logps/chosen": -251.4349822998047,
      "logps/rejected": -238.06494140625,
      "loss": 0.2741,
      "rewards/chosen": -0.5241379737854004,
      "rewards/margins": 3.8754677772521973,
      "rewards/rejected": -4.399605751037598,
      "step": 2648
    },
    {
      "epoch": 0.69,
      "grad_norm": 33.129005432128906,
      "kl": 0.0,
      "learning_rate": 1.5336299398063336e-07,
      "logps/chosen": -205.00674438476562,
      "logps/rejected": -331.13824462890625,
      "loss": 0.2413,
      "rewards/chosen": 0.2049345076084137,
      "rewards/margins": 5.713682174682617,
      "rewards/rejected": -5.508747577667236,
      "step": 2649
    },
    {
      "epoch": 0.69,
      "grad_norm": 35.91727066040039,
      "kl": 0.0,
      "learning_rate": 1.5323213818372153e-07,
      "logps/chosen": -228.20111083984375,
      "logps/rejected": -253.9140625,
      "loss": 0.3428,
      "rewards/chosen": 2.9546375274658203,
      "rewards/margins": 4.23104190826416,
      "rewards/rejected": -1.2764043807983398,
      "step": 2650
    },
    {
      "epoch": 0.69,
      "grad_norm": 26.117427825927734,
      "kl": 0.0,
      "learning_rate": 1.5310128238680972e-07,
      "logps/chosen": -181.5155029296875,
      "logps/rejected": -233.0501251220703,
      "loss": 0.2755,
      "rewards/chosen": 0.12543119490146637,
      "rewards/margins": 2.9917426109313965,
      "rewards/rejected": -2.8663113117218018,
      "step": 2651
    },
    {
      "epoch": 0.69,
      "grad_norm": 48.09539031982422,
      "kl": 0.0,
      "learning_rate": 1.5297042658989792e-07,
      "logps/chosen": -203.1796875,
      "logps/rejected": -259.39404296875,
      "loss": 0.2379,
      "rewards/chosen": 1.2114871740341187,
      "rewards/margins": 6.3016886711120605,
      "rewards/rejected": -5.090201377868652,
      "step": 2652
    },
    {
      "epoch": 0.69,
      "grad_norm": 35.25298309326172,
      "kl": 0.0,
      "learning_rate": 1.5283957079298612e-07,
      "logps/chosen": -290.0359191894531,
      "logps/rejected": -214.7886505126953,
      "loss": 0.3032,
      "rewards/chosen": 1.8502638339996338,
      "rewards/margins": 5.052051544189453,
      "rewards/rejected": -3.2017874717712402,
      "step": 2653
    },
    {
      "epoch": 0.69,
      "grad_norm": 25.84156608581543,
      "kl": 0.0,
      "learning_rate": 1.527087149960743e-07,
      "logps/chosen": -171.43955993652344,
      "logps/rejected": -305.144287109375,
      "loss": 0.1473,
      "rewards/chosen": 1.7509996891021729,
      "rewards/margins": 6.030915260314941,
      "rewards/rejected": -4.2799153327941895,
      "step": 2654
    },
    {
      "epoch": 0.69,
      "grad_norm": 34.59351348876953,
      "kl": 0.0,
      "learning_rate": 1.5257785919916254e-07,
      "logps/chosen": -209.79238891601562,
      "logps/rejected": -225.7039337158203,
      "loss": 0.4036,
      "rewards/chosen": 2.4418153762817383,
      "rewards/margins": 3.5955920219421387,
      "rewards/rejected": -1.1537765264511108,
      "step": 2655
    },
    {
      "epoch": 0.7,
      "grad_norm": 31.137544631958008,
      "kl": 0.0,
      "learning_rate": 1.5244700340225073e-07,
      "logps/chosen": -244.44252014160156,
      "logps/rejected": -289.1747131347656,
      "loss": 0.223,
      "rewards/chosen": 0.9428373575210571,
      "rewards/margins": 5.725986003875732,
      "rewards/rejected": -4.783148765563965,
      "step": 2656
    },
    {
      "epoch": 0.7,
      "grad_norm": 42.90130615234375,
      "kl": 0.0,
      "learning_rate": 1.523161476053389e-07,
      "logps/chosen": -200.8882598876953,
      "logps/rejected": -211.6605224609375,
      "loss": 0.2389,
      "rewards/chosen": 0.7012113332748413,
      "rewards/margins": 4.256584167480469,
      "rewards/rejected": -3.555372953414917,
      "step": 2657
    },
    {
      "epoch": 0.7,
      "grad_norm": 26.240468978881836,
      "kl": 0.0,
      "learning_rate": 1.521852918084271e-07,
      "logps/chosen": -179.78411865234375,
      "logps/rejected": -234.9866943359375,
      "loss": 0.2692,
      "rewards/chosen": 1.508581519126892,
      "rewards/margins": 5.079866409301758,
      "rewards/rejected": -3.571284770965576,
      "step": 2658
    },
    {
      "epoch": 0.7,
      "grad_norm": 54.163883209228516,
      "kl": 0.0,
      "learning_rate": 1.520544360115153e-07,
      "logps/chosen": -178.21804809570312,
      "logps/rejected": -310.228759765625,
      "loss": 0.3183,
      "rewards/chosen": 0.48193949460983276,
      "rewards/margins": 5.572971820831299,
      "rewards/rejected": -5.0910325050354,
      "step": 2659
    },
    {
      "epoch": 0.7,
      "grad_norm": 33.376529693603516,
      "kl": 0.0,
      "learning_rate": 1.519235802146035e-07,
      "logps/chosen": -278.4017028808594,
      "logps/rejected": -241.92855834960938,
      "loss": 0.1961,
      "rewards/chosen": 0.4909915030002594,
      "rewards/margins": 4.12484073638916,
      "rewards/rejected": -3.6338493824005127,
      "step": 2660
    },
    {
      "epoch": 0.7,
      "grad_norm": 35.16926956176758,
      "kl": 0.0,
      "learning_rate": 1.5179272441769169e-07,
      "logps/chosen": -161.02484130859375,
      "logps/rejected": -265.98419189453125,
      "loss": 0.295,
      "rewards/chosen": 0.2557518482208252,
      "rewards/margins": 5.179582595825195,
      "rewards/rejected": -4.923830509185791,
      "step": 2661
    },
    {
      "epoch": 0.7,
      "grad_norm": 37.866119384765625,
      "kl": 0.0,
      "learning_rate": 1.516618686207799e-07,
      "logps/chosen": -188.62120056152344,
      "logps/rejected": -188.76953125,
      "loss": 0.2359,
      "rewards/chosen": 1.9292501211166382,
      "rewards/margins": 5.487701416015625,
      "rewards/rejected": -3.5584514141082764,
      "step": 2662
    },
    {
      "epoch": 0.7,
      "grad_norm": 33.47230529785156,
      "kl": 0.0,
      "learning_rate": 1.515310128238681e-07,
      "logps/chosen": -198.44082641601562,
      "logps/rejected": -214.91188049316406,
      "loss": 0.3577,
      "rewards/chosen": 0.3523358404636383,
      "rewards/margins": 3.134798765182495,
      "rewards/rejected": -2.7824628353118896,
      "step": 2663
    },
    {
      "epoch": 0.7,
      "grad_norm": 38.74263381958008,
      "kl": 0.0,
      "learning_rate": 1.514001570269563e-07,
      "logps/chosen": -285.26641845703125,
      "logps/rejected": -270.3790283203125,
      "loss": 0.2876,
      "rewards/chosen": 0.34473711252212524,
      "rewards/margins": 4.277539253234863,
      "rewards/rejected": -3.9328019618988037,
      "step": 2664
    },
    {
      "epoch": 0.7,
      "grad_norm": 30.61463165283203,
      "kl": 0.0,
      "learning_rate": 1.5126930123004447e-07,
      "logps/chosen": -160.3408966064453,
      "logps/rejected": -221.6944580078125,
      "loss": 0.1917,
      "rewards/chosen": 2.274693012237549,
      "rewards/margins": 5.692990303039551,
      "rewards/rejected": -3.418297290802002,
      "step": 2665
    },
    {
      "epoch": 0.7,
      "grad_norm": 42.397430419921875,
      "kl": 0.0,
      "learning_rate": 1.5113844543313267e-07,
      "logps/chosen": -295.8493957519531,
      "logps/rejected": -198.1302490234375,
      "loss": 0.3157,
      "rewards/chosen": 0.8204801678657532,
      "rewards/margins": 3.688586473464966,
      "rewards/rejected": -2.8681063652038574,
      "step": 2666
    },
    {
      "epoch": 0.7,
      "grad_norm": 38.239810943603516,
      "kl": 0.0,
      "learning_rate": 1.5100758963622086e-07,
      "logps/chosen": -212.99639892578125,
      "logps/rejected": -250.43011474609375,
      "loss": 0.2838,
      "rewards/chosen": 1.9497802257537842,
      "rewards/margins": 4.861065864562988,
      "rewards/rejected": -2.911285400390625,
      "step": 2667
    },
    {
      "epoch": 0.7,
      "grad_norm": 22.39212989807129,
      "kl": 0.0,
      "learning_rate": 1.5087673383930908e-07,
      "logps/chosen": -181.47560119628906,
      "logps/rejected": -157.10704040527344,
      "loss": 0.2478,
      "rewards/chosen": 1.5009618997573853,
      "rewards/margins": 4.634808540344238,
      "rewards/rejected": -3.1338467597961426,
      "step": 2668
    },
    {
      "epoch": 0.7,
      "grad_norm": 39.13432312011719,
      "kl": 0.0,
      "learning_rate": 1.5074587804239728e-07,
      "logps/chosen": -229.26280212402344,
      "logps/rejected": -272.8870849609375,
      "loss": 0.3134,
      "rewards/chosen": 1.0489386320114136,
      "rewards/margins": 4.14993143081665,
      "rewards/rejected": -3.1009929180145264,
      "step": 2669
    },
    {
      "epoch": 0.7,
      "grad_norm": 36.7012825012207,
      "kl": 0.0,
      "learning_rate": 1.5061502224548548e-07,
      "logps/chosen": -220.99365234375,
      "logps/rejected": -224.8020782470703,
      "loss": 0.3269,
      "rewards/chosen": -0.3122532069683075,
      "rewards/margins": 2.7056050300598145,
      "rewards/rejected": -3.0178582668304443,
      "step": 2670
    },
    {
      "epoch": 0.7,
      "grad_norm": 35.51311111450195,
      "kl": 0.0,
      "learning_rate": 1.5048416644857367e-07,
      "logps/chosen": -175.61502075195312,
      "logps/rejected": -212.26882934570312,
      "loss": 0.245,
      "rewards/chosen": 1.2118319272994995,
      "rewards/margins": 5.409755706787109,
      "rewards/rejected": -4.19792366027832,
      "step": 2671
    },
    {
      "epoch": 0.7,
      "grad_norm": 32.9407958984375,
      "kl": 0.0,
      "learning_rate": 1.5035331065166187e-07,
      "logps/chosen": -168.10598754882812,
      "logps/rejected": -314.8375549316406,
      "loss": 0.2223,
      "rewards/chosen": 0.9820523858070374,
      "rewards/margins": 4.9999566078186035,
      "rewards/rejected": -4.017904281616211,
      "step": 2672
    },
    {
      "epoch": 0.7,
      "grad_norm": 40.48832702636719,
      "kl": 0.0,
      "learning_rate": 1.5022245485475004e-07,
      "logps/chosen": -150.69357299804688,
      "logps/rejected": -287.1880798339844,
      "loss": 0.2422,
      "rewards/chosen": 1.9085931777954102,
      "rewards/margins": 5.372089862823486,
      "rewards/rejected": -3.463496685028076,
      "step": 2673
    },
    {
      "epoch": 0.7,
      "grad_norm": 40.512306213378906,
      "kl": 0.0,
      "learning_rate": 1.5009159905783823e-07,
      "logps/chosen": -301.79364013671875,
      "logps/rejected": -183.44329833984375,
      "loss": 0.315,
      "rewards/chosen": 0.24387967586517334,
      "rewards/margins": 2.798271656036377,
      "rewards/rejected": -2.554391860961914,
      "step": 2674
    },
    {
      "epoch": 0.7,
      "grad_norm": 30.446664810180664,
      "kl": 0.0,
      "learning_rate": 1.4996074326092646e-07,
      "logps/chosen": -195.90579223632812,
      "logps/rejected": -343.2037658691406,
      "loss": 0.2241,
      "rewards/chosen": 1.6417574882507324,
      "rewards/margins": 5.597146987915039,
      "rewards/rejected": -3.9553897380828857,
      "step": 2675
    },
    {
      "epoch": 0.7,
      "grad_norm": 32.61503219604492,
      "kl": 0.0,
      "learning_rate": 1.4982988746401465e-07,
      "logps/chosen": -158.60813903808594,
      "logps/rejected": -193.80609130859375,
      "loss": 0.231,
      "rewards/chosen": 2.2817306518554688,
      "rewards/margins": 5.334161281585693,
      "rewards/rejected": -3.0524306297302246,
      "step": 2676
    },
    {
      "epoch": 0.7,
      "grad_norm": 36.24722671508789,
      "kl": 0.0,
      "learning_rate": 1.4969903166710285e-07,
      "logps/chosen": -179.80506896972656,
      "logps/rejected": -192.8569793701172,
      "loss": 0.3059,
      "rewards/chosen": 2.079206943511963,
      "rewards/margins": 3.3447813987731934,
      "rewards/rejected": -1.265574336051941,
      "step": 2677
    },
    {
      "epoch": 0.7,
      "grad_norm": 34.13661193847656,
      "kl": 0.0,
      "learning_rate": 1.4956817587019105e-07,
      "logps/chosen": -181.9069061279297,
      "logps/rejected": -220.1363525390625,
      "loss": 0.2766,
      "rewards/chosen": 1.1834523677825928,
      "rewards/margins": 4.73173713684082,
      "rewards/rejected": -3.5482850074768066,
      "step": 2678
    },
    {
      "epoch": 0.7,
      "grad_norm": 27.697574615478516,
      "kl": 0.0,
      "learning_rate": 1.4943732007327924e-07,
      "logps/chosen": -224.85931396484375,
      "logps/rejected": -242.00808715820312,
      "loss": 0.1453,
      "rewards/chosen": 2.999886989593506,
      "rewards/margins": 7.181798934936523,
      "rewards/rejected": -4.181911945343018,
      "step": 2679
    },
    {
      "epoch": 0.7,
      "grad_norm": 35.0987548828125,
      "kl": 0.0,
      "learning_rate": 1.4930646427636744e-07,
      "logps/chosen": -271.865478515625,
      "logps/rejected": -132.74562072753906,
      "loss": 0.1511,
      "rewards/chosen": 3.920344829559326,
      "rewards/margins": 5.227973461151123,
      "rewards/rejected": -1.3076287508010864,
      "step": 2680
    },
    {
      "epoch": 0.7,
      "grad_norm": 39.40233612060547,
      "kl": 0.0,
      "learning_rate": 1.4917560847945563e-07,
      "logps/chosen": -176.2115020751953,
      "logps/rejected": -169.6930389404297,
      "loss": 0.2693,
      "rewards/chosen": -0.39483752846717834,
      "rewards/margins": 2.6290271282196045,
      "rewards/rejected": -3.02386474609375,
      "step": 2681
    },
    {
      "epoch": 0.7,
      "grad_norm": 34.06243896484375,
      "kl": 0.0,
      "learning_rate": 1.4904475268254383e-07,
      "logps/chosen": -154.3968505859375,
      "logps/rejected": -251.61129760742188,
      "loss": 0.2755,
      "rewards/chosen": -0.5720101594924927,
      "rewards/margins": 4.821446418762207,
      "rewards/rejected": -5.39345645904541,
      "step": 2682
    },
    {
      "epoch": 0.7,
      "grad_norm": 49.528343200683594,
      "kl": 0.0,
      "learning_rate": 1.4891389688563203e-07,
      "logps/chosen": -202.0147705078125,
      "logps/rejected": -214.54263305664062,
      "loss": 0.3572,
      "rewards/chosen": 0.9688149690628052,
      "rewards/margins": 3.14668607711792,
      "rewards/rejected": -2.177870988845825,
      "step": 2683
    },
    {
      "epoch": 0.7,
      "grad_norm": 39.2160530090332,
      "kl": 0.0,
      "learning_rate": 1.4878304108872022e-07,
      "logps/chosen": -317.7562561035156,
      "logps/rejected": -237.9340057373047,
      "loss": 0.1387,
      "rewards/chosen": 2.1960606575012207,
      "rewards/margins": 5.66136360168457,
      "rewards/rejected": -3.4653029441833496,
      "step": 2684
    },
    {
      "epoch": 0.7,
      "grad_norm": 33.726722717285156,
      "kl": 0.0,
      "learning_rate": 1.4865218529180842e-07,
      "logps/chosen": -241.3274688720703,
      "logps/rejected": -260.200439453125,
      "loss": 0.2198,
      "rewards/chosen": 2.3069839477539062,
      "rewards/margins": 7.066941261291504,
      "rewards/rejected": -4.759957313537598,
      "step": 2685
    },
    {
      "epoch": 0.7,
      "grad_norm": 37.164852142333984,
      "kl": 0.0,
      "learning_rate": 1.4852132949489661e-07,
      "logps/chosen": -303.6715087890625,
      "logps/rejected": -298.21490478515625,
      "loss": 0.2933,
      "rewards/chosen": -1.2913599014282227,
      "rewards/margins": 3.334641933441162,
      "rewards/rejected": -4.626001834869385,
      "step": 2686
    },
    {
      "epoch": 0.7,
      "grad_norm": 39.393348693847656,
      "kl": 0.0,
      "learning_rate": 1.4839047369798484e-07,
      "logps/chosen": -285.0570983886719,
      "logps/rejected": -248.0108642578125,
      "loss": 0.1947,
      "rewards/chosen": 2.4950828552246094,
      "rewards/margins": 5.850523948669434,
      "rewards/rejected": -3.355441093444824,
      "step": 2687
    },
    {
      "epoch": 0.7,
      "grad_norm": 30.06007194519043,
      "kl": 0.0,
      "learning_rate": 1.48259617901073e-07,
      "logps/chosen": -205.47360229492188,
      "logps/rejected": -374.58258056640625,
      "loss": 0.2137,
      "rewards/chosen": 1.308136224746704,
      "rewards/margins": 5.134586334228516,
      "rewards/rejected": -3.8264501094818115,
      "step": 2688
    },
    {
      "epoch": 0.7,
      "grad_norm": 36.91188430786133,
      "kl": 0.0,
      "learning_rate": 1.481287621041612e-07,
      "logps/chosen": -172.80679321289062,
      "logps/rejected": -306.2414245605469,
      "loss": 0.2751,
      "rewards/chosen": 0.7240972518920898,
      "rewards/margins": 4.706175804138184,
      "rewards/rejected": -3.9820785522460938,
      "step": 2689
    },
    {
      "epoch": 0.7,
      "grad_norm": 33.07395553588867,
      "kl": 0.0,
      "learning_rate": 1.479979063072494e-07,
      "logps/chosen": -173.76805114746094,
      "logps/rejected": -237.40435791015625,
      "loss": 0.2752,
      "rewards/chosen": 0.2675023376941681,
      "rewards/margins": 3.5950818061828613,
      "rewards/rejected": -3.3275794982910156,
      "step": 2690
    },
    {
      "epoch": 0.7,
      "grad_norm": 28.920312881469727,
      "kl": 0.0,
      "learning_rate": 1.478670505103376e-07,
      "logps/chosen": -187.97975158691406,
      "logps/rejected": -165.814697265625,
      "loss": 0.2615,
      "rewards/chosen": 1.113723635673523,
      "rewards/margins": 3.297226905822754,
      "rewards/rejected": -2.1835031509399414,
      "step": 2691
    },
    {
      "epoch": 0.7,
      "grad_norm": 38.51026916503906,
      "kl": 0.0,
      "learning_rate": 1.477361947134258e-07,
      "logps/chosen": -188.7342987060547,
      "logps/rejected": -285.8819274902344,
      "loss": 0.2037,
      "rewards/chosen": 1.7209330797195435,
      "rewards/margins": 6.286745548248291,
      "rewards/rejected": -4.565812587738037,
      "step": 2692
    },
    {
      "epoch": 0.7,
      "grad_norm": 38.706119537353516,
      "kl": 0.0,
      "learning_rate": 1.47605338916514e-07,
      "logps/chosen": -148.42007446289062,
      "logps/rejected": -273.3832092285156,
      "loss": 0.2485,
      "rewards/chosen": 0.41066497564315796,
      "rewards/margins": 2.5059401988983154,
      "rewards/rejected": -2.0952751636505127,
      "step": 2693
    },
    {
      "epoch": 0.71,
      "grad_norm": 29.851346969604492,
      "kl": 0.0,
      "learning_rate": 1.474744831196022e-07,
      "logps/chosen": -230.69467163085938,
      "logps/rejected": -298.66607666015625,
      "loss": 0.2614,
      "rewards/chosen": 0.36818838119506836,
      "rewards/margins": 4.815489292144775,
      "rewards/rejected": -4.447300910949707,
      "step": 2694
    },
    {
      "epoch": 0.71,
      "grad_norm": 39.34650802612305,
      "kl": 0.0,
      "learning_rate": 1.473436273226904e-07,
      "logps/chosen": -172.72879028320312,
      "logps/rejected": -300.1294250488281,
      "loss": 0.2825,
      "rewards/chosen": 0.9292474389076233,
      "rewards/margins": 4.453124523162842,
      "rewards/rejected": -3.5238771438598633,
      "step": 2695
    },
    {
      "epoch": 0.71,
      "grad_norm": 37.44293212890625,
      "kl": 0.0,
      "learning_rate": 1.4721277152577858e-07,
      "logps/chosen": -181.06982421875,
      "logps/rejected": -196.63552856445312,
      "loss": 0.3068,
      "rewards/chosen": 1.38923180103302,
      "rewards/margins": 3.2653074264526367,
      "rewards/rejected": -1.8760757446289062,
      "step": 2696
    },
    {
      "epoch": 0.71,
      "grad_norm": 29.282588958740234,
      "kl": 0.0,
      "learning_rate": 1.4708191572886677e-07,
      "logps/chosen": -151.28732299804688,
      "logps/rejected": -230.4415740966797,
      "loss": 0.2568,
      "rewards/chosen": 2.138603448867798,
      "rewards/margins": 6.064770698547363,
      "rewards/rejected": -3.9261670112609863,
      "step": 2697
    },
    {
      "epoch": 0.71,
      "grad_norm": 35.645111083984375,
      "kl": 0.0,
      "learning_rate": 1.4695105993195497e-07,
      "logps/chosen": -247.11302185058594,
      "logps/rejected": -190.36375427246094,
      "loss": 0.1182,
      "rewards/chosen": 1.9163931608200073,
      "rewards/margins": 5.842294692993164,
      "rewards/rejected": -3.925901412963867,
      "step": 2698
    },
    {
      "epoch": 0.71,
      "grad_norm": 27.345233917236328,
      "kl": 0.0,
      "learning_rate": 1.4682020413504316e-07,
      "logps/chosen": -240.89288330078125,
      "logps/rejected": -336.945068359375,
      "loss": 0.256,
      "rewards/chosen": 1.7607756853103638,
      "rewards/margins": 8.707010269165039,
      "rewards/rejected": -6.946234703063965,
      "step": 2699
    },
    {
      "epoch": 0.71,
      "grad_norm": 31.545499801635742,
      "kl": 0.0,
      "learning_rate": 1.4668934833813139e-07,
      "logps/chosen": -188.96682739257812,
      "logps/rejected": -277.4720458984375,
      "loss": 0.2019,
      "rewards/chosen": 0.2540876865386963,
      "rewards/margins": 4.599085807800293,
      "rewards/rejected": -4.344997882843018,
      "step": 2700
    },
    {
      "epoch": 0.71,
      "grad_norm": 33.34171676635742,
      "kl": 0.0,
      "learning_rate": 1.4655849254121958e-07,
      "logps/chosen": -175.2567901611328,
      "logps/rejected": -243.83938598632812,
      "loss": 0.1399,
      "rewards/chosen": 2.2294211387634277,
      "rewards/margins": 5.678004264831543,
      "rewards/rejected": -3.4485833644866943,
      "step": 2701
    },
    {
      "epoch": 0.71,
      "grad_norm": 36.91253662109375,
      "kl": 0.0,
      "learning_rate": 1.4642763674430778e-07,
      "logps/chosen": -269.1799011230469,
      "logps/rejected": -342.16302490234375,
      "loss": 0.2629,
      "rewards/chosen": 2.4199092388153076,
      "rewards/margins": 8.765036582946777,
      "rewards/rejected": -6.345127582550049,
      "step": 2702
    },
    {
      "epoch": 0.71,
      "grad_norm": 41.61262130737305,
      "kl": 0.0,
      "learning_rate": 1.4629678094739597e-07,
      "logps/chosen": -185.5560302734375,
      "logps/rejected": -290.873046875,
      "loss": 0.2414,
      "rewards/chosen": 1.2345657348632812,
      "rewards/margins": 4.153214454650879,
      "rewards/rejected": -2.9186487197875977,
      "step": 2703
    },
    {
      "epoch": 0.71,
      "grad_norm": 34.835086822509766,
      "kl": 0.0,
      "learning_rate": 1.4616592515048414e-07,
      "logps/chosen": -244.6498260498047,
      "logps/rejected": -348.86383056640625,
      "loss": 0.2069,
      "rewards/chosen": 3.809274435043335,
      "rewards/margins": 7.847353935241699,
      "rewards/rejected": -4.038079738616943,
      "step": 2704
    },
    {
      "epoch": 0.71,
      "grad_norm": 43.22990036010742,
      "kl": 0.0,
      "learning_rate": 1.4603506935357234e-07,
      "logps/chosen": -250.3634033203125,
      "logps/rejected": -218.6263427734375,
      "loss": 0.3321,
      "rewards/chosen": 1.0411466360092163,
      "rewards/margins": 2.7422757148742676,
      "rewards/rejected": -1.7011290788650513,
      "step": 2705
    },
    {
      "epoch": 0.71,
      "grad_norm": 34.81473159790039,
      "kl": 0.0,
      "learning_rate": 1.4590421355666054e-07,
      "logps/chosen": -167.74928283691406,
      "logps/rejected": -185.5796356201172,
      "loss": 0.2666,
      "rewards/chosen": 1.0022720098495483,
      "rewards/margins": 3.729177474975586,
      "rewards/rejected": -2.726905345916748,
      "step": 2706
    },
    {
      "epoch": 0.71,
      "grad_norm": 40.64683151245117,
      "kl": 0.0,
      "learning_rate": 1.4577335775974876e-07,
      "logps/chosen": -161.60569763183594,
      "logps/rejected": -221.24998474121094,
      "loss": 0.2768,
      "rewards/chosen": -0.024179527536034584,
      "rewards/margins": 2.4233968257904053,
      "rewards/rejected": -2.4475762844085693,
      "step": 2707
    },
    {
      "epoch": 0.71,
      "grad_norm": 29.897737503051758,
      "kl": 0.0,
      "learning_rate": 1.4564250196283695e-07,
      "logps/chosen": -264.284912109375,
      "logps/rejected": -192.35458374023438,
      "loss": 0.2565,
      "rewards/chosen": 1.6027510166168213,
      "rewards/margins": 4.857088088989258,
      "rewards/rejected": -3.2543368339538574,
      "step": 2708
    },
    {
      "epoch": 0.71,
      "grad_norm": 27.95981216430664,
      "kl": 0.0,
      "learning_rate": 1.4551164616592515e-07,
      "logps/chosen": -206.34573364257812,
      "logps/rejected": -274.0844421386719,
      "loss": 0.2179,
      "rewards/chosen": 0.7812842130661011,
      "rewards/margins": 4.885187149047852,
      "rewards/rejected": -4.103902816772461,
      "step": 2709
    },
    {
      "epoch": 0.71,
      "grad_norm": 32.9197998046875,
      "kl": 0.0,
      "learning_rate": 1.4538079036901335e-07,
      "logps/chosen": -214.70269775390625,
      "logps/rejected": -313.8812255859375,
      "loss": 0.3233,
      "rewards/chosen": -0.612155020236969,
      "rewards/margins": 6.03891134262085,
      "rewards/rejected": -6.651066303253174,
      "step": 2710
    },
    {
      "epoch": 0.71,
      "grad_norm": 39.657840728759766,
      "kl": 0.0,
      "learning_rate": 1.4524993457210154e-07,
      "logps/chosen": -226.01513671875,
      "logps/rejected": -229.1612548828125,
      "loss": 0.3154,
      "rewards/chosen": 0.7531558871269226,
      "rewards/margins": 2.399531126022339,
      "rewards/rejected": -1.6463751792907715,
      "step": 2711
    },
    {
      "epoch": 0.71,
      "grad_norm": 32.31519317626953,
      "kl": 0.0,
      "learning_rate": 1.451190787751897e-07,
      "logps/chosen": -236.01914978027344,
      "logps/rejected": -264.3641052246094,
      "loss": 0.21,
      "rewards/chosen": 0.8073936700820923,
      "rewards/margins": 4.921087741851807,
      "rewards/rejected": -4.113694190979004,
      "step": 2712
    },
    {
      "epoch": 0.71,
      "grad_norm": 39.790504455566406,
      "kl": 0.0,
      "learning_rate": 1.4498822297827794e-07,
      "logps/chosen": -220.0768280029297,
      "logps/rejected": -199.35006713867188,
      "loss": 0.1974,
      "rewards/chosen": 1.070143699645996,
      "rewards/margins": 3.531202793121338,
      "rewards/rejected": -2.461059093475342,
      "step": 2713
    },
    {
      "epoch": 0.71,
      "grad_norm": 39.84419250488281,
      "kl": 0.0,
      "learning_rate": 1.4485736718136613e-07,
      "logps/chosen": -122.71448516845703,
      "logps/rejected": -245.1878662109375,
      "loss": 0.2674,
      "rewards/chosen": -0.22302575409412384,
      "rewards/margins": 4.754169940948486,
      "rewards/rejected": -4.977195739746094,
      "step": 2714
    },
    {
      "epoch": 0.71,
      "grad_norm": 32.12276077270508,
      "kl": 0.0,
      "learning_rate": 1.4472651138445433e-07,
      "logps/chosen": -229.18992614746094,
      "logps/rejected": -229.65902709960938,
      "loss": 0.3738,
      "rewards/chosen": -0.2686086595058441,
      "rewards/margins": 3.344529628753662,
      "rewards/rejected": -3.613138198852539,
      "step": 2715
    },
    {
      "epoch": 0.71,
      "grad_norm": 35.63005447387695,
      "kl": 0.0,
      "learning_rate": 1.4459565558754252e-07,
      "logps/chosen": -240.97824096679688,
      "logps/rejected": -382.0249938964844,
      "loss": 0.2113,
      "rewards/chosen": 1.1434698104858398,
      "rewards/margins": 10.897199630737305,
      "rewards/rejected": -9.753729820251465,
      "step": 2716
    },
    {
      "epoch": 0.71,
      "grad_norm": 31.236305236816406,
      "kl": 0.0,
      "learning_rate": 1.4446479979063072e-07,
      "logps/chosen": -232.86383056640625,
      "logps/rejected": -166.2635498046875,
      "loss": 0.3384,
      "rewards/chosen": 0.37403538823127747,
      "rewards/margins": 2.4673333168029785,
      "rewards/rejected": -2.0932979583740234,
      "step": 2717
    },
    {
      "epoch": 0.71,
      "grad_norm": 30.102787017822266,
      "kl": 0.0,
      "learning_rate": 1.4433394399371892e-07,
      "logps/chosen": -209.07101440429688,
      "logps/rejected": -200.48077392578125,
      "loss": 0.2709,
      "rewards/chosen": 1.7002995014190674,
      "rewards/margins": 7.061903953552246,
      "rewards/rejected": -5.361604690551758,
      "step": 2718
    },
    {
      "epoch": 0.71,
      "grad_norm": 46.64667892456055,
      "kl": 0.0,
      "learning_rate": 1.442030881968071e-07,
      "logps/chosen": -135.42672729492188,
      "logps/rejected": -240.09103393554688,
      "loss": 0.2792,
      "rewards/chosen": 0.5562674403190613,
      "rewards/margins": 4.204452991485596,
      "rewards/rejected": -3.6481854915618896,
      "step": 2719
    },
    {
      "epoch": 0.71,
      "grad_norm": 32.72495651245117,
      "kl": 0.0,
      "learning_rate": 1.440722323998953e-07,
      "logps/chosen": -228.77261352539062,
      "logps/rejected": -401.26251220703125,
      "loss": 0.2102,
      "rewards/chosen": 0.26309388875961304,
      "rewards/margins": 4.260984897613525,
      "rewards/rejected": -3.9978909492492676,
      "step": 2720
    },
    {
      "epoch": 0.71,
      "grad_norm": 25.724273681640625,
      "kl": 0.0,
      "learning_rate": 1.439413766029835e-07,
      "logps/chosen": -179.09849548339844,
      "logps/rejected": -202.8656005859375,
      "loss": 0.2205,
      "rewards/chosen": 0.9190692901611328,
      "rewards/margins": 5.547544002532959,
      "rewards/rejected": -4.628474712371826,
      "step": 2721
    },
    {
      "epoch": 0.71,
      "grad_norm": 33.738853454589844,
      "kl": 0.0,
      "learning_rate": 1.438105208060717e-07,
      "logps/chosen": -255.45628356933594,
      "logps/rejected": -245.0790252685547,
      "loss": 0.1962,
      "rewards/chosen": 3.40286922454834,
      "rewards/margins": 6.362302780151367,
      "rewards/rejected": -2.9594333171844482,
      "step": 2722
    },
    {
      "epoch": 0.71,
      "grad_norm": 47.03477096557617,
      "kl": 0.0,
      "learning_rate": 1.436796650091599e-07,
      "logps/chosen": -235.11585998535156,
      "logps/rejected": -235.00161743164062,
      "loss": 0.2361,
      "rewards/chosen": 1.440369725227356,
      "rewards/margins": 5.251692295074463,
      "rewards/rejected": -3.8113226890563965,
      "step": 2723
    },
    {
      "epoch": 0.71,
      "grad_norm": 34.62791442871094,
      "kl": 0.0,
      "learning_rate": 1.435488092122481e-07,
      "logps/chosen": -244.5506591796875,
      "logps/rejected": -347.0370178222656,
      "loss": 0.3516,
      "rewards/chosen": 0.8647078275680542,
      "rewards/margins": 4.863985538482666,
      "rewards/rejected": -3.9992778301239014,
      "step": 2724
    },
    {
      "epoch": 0.71,
      "grad_norm": 30.0848445892334,
      "kl": 0.0,
      "learning_rate": 1.434179534153363e-07,
      "logps/chosen": -207.02052307128906,
      "logps/rejected": -293.9341125488281,
      "loss": 0.1265,
      "rewards/chosen": 2.3349344730377197,
      "rewards/margins": 6.6972551345825195,
      "rewards/rejected": -4.362320899963379,
      "step": 2725
    },
    {
      "epoch": 0.71,
      "grad_norm": 26.7783260345459,
      "kl": 0.0,
      "learning_rate": 1.432870976184245e-07,
      "logps/chosen": -170.4520721435547,
      "logps/rejected": -281.2052917480469,
      "loss": 0.1864,
      "rewards/chosen": 1.0393160581588745,
      "rewards/margins": 5.177910804748535,
      "rewards/rejected": -4.138594627380371,
      "step": 2726
    },
    {
      "epoch": 0.71,
      "grad_norm": 32.45759201049805,
      "kl": 0.0,
      "learning_rate": 1.4315624182151268e-07,
      "logps/chosen": -229.599365234375,
      "logps/rejected": -188.02590942382812,
      "loss": 0.2791,
      "rewards/chosen": 2.7275326251983643,
      "rewards/margins": 5.223902702331543,
      "rewards/rejected": -2.4963700771331787,
      "step": 2727
    },
    {
      "epoch": 0.71,
      "grad_norm": 25.652467727661133,
      "kl": 0.0,
      "learning_rate": 1.4302538602460088e-07,
      "logps/chosen": -242.80453491210938,
      "logps/rejected": -230.92465209960938,
      "loss": 0.2005,
      "rewards/chosen": 1.264347791671753,
      "rewards/margins": 6.404541969299316,
      "rewards/rejected": -5.140194416046143,
      "step": 2728
    },
    {
      "epoch": 0.71,
      "grad_norm": 43.789573669433594,
      "kl": 0.0,
      "learning_rate": 1.4289453022768907e-07,
      "logps/chosen": -203.66488647460938,
      "logps/rejected": -180.29945373535156,
      "loss": 0.2973,
      "rewards/chosen": 1.972968578338623,
      "rewards/margins": 4.173624038696289,
      "rewards/rejected": -2.200655460357666,
      "step": 2729
    },
    {
      "epoch": 0.71,
      "grad_norm": 40.94158172607422,
      "kl": 0.0,
      "learning_rate": 1.4276367443077727e-07,
      "logps/chosen": -259.09027099609375,
      "logps/rejected": -186.64659118652344,
      "loss": 0.3076,
      "rewards/chosen": 0.5558538436889648,
      "rewards/margins": 3.275819778442383,
      "rewards/rejected": -2.719965934753418,
      "step": 2730
    },
    {
      "epoch": 0.71,
      "grad_norm": 31.99633026123047,
      "kl": 0.0,
      "learning_rate": 1.4263281863386546e-07,
      "logps/chosen": -157.29074096679688,
      "logps/rejected": -279.3443603515625,
      "loss": 0.145,
      "rewards/chosen": 0.8477217555046082,
      "rewards/margins": 5.280526638031006,
      "rewards/rejected": -4.432805061340332,
      "step": 2731
    },
    {
      "epoch": 0.71,
      "grad_norm": 27.111766815185547,
      "kl": 0.0,
      "learning_rate": 1.425019628369537e-07,
      "logps/chosen": -153.20616149902344,
      "logps/rejected": -160.79698181152344,
      "loss": 0.1781,
      "rewards/chosen": 3.4651429653167725,
      "rewards/margins": 6.490252494812012,
      "rewards/rejected": -3.0251095294952393,
      "step": 2732
    },
    {
      "epoch": 0.72,
      "grad_norm": 33.77357864379883,
      "kl": 0.0,
      "learning_rate": 1.4237110704004188e-07,
      "logps/chosen": -252.4764404296875,
      "logps/rejected": -168.84027099609375,
      "loss": 0.3839,
      "rewards/chosen": -0.3884561359882355,
      "rewards/margins": 2.14807391166687,
      "rewards/rejected": -2.536530017852783,
      "step": 2733
    },
    {
      "epoch": 0.72,
      "grad_norm": 32.69058609008789,
      "kl": 0.0,
      "learning_rate": 1.4224025124313008e-07,
      "logps/chosen": -198.02716064453125,
      "logps/rejected": -187.51708984375,
      "loss": 0.2079,
      "rewards/chosen": 1.7017436027526855,
      "rewards/margins": 3.9817001819610596,
      "rewards/rejected": -2.279956579208374,
      "step": 2734
    },
    {
      "epoch": 0.72,
      "grad_norm": 25.28070640563965,
      "kl": 0.0,
      "learning_rate": 1.4210939544621825e-07,
      "logps/chosen": -182.21871948242188,
      "logps/rejected": -251.4492950439453,
      "loss": 0.1658,
      "rewards/chosen": 0.9423468708992004,
      "rewards/margins": 4.9712605476379395,
      "rewards/rejected": -4.028913497924805,
      "step": 2735
    },
    {
      "epoch": 0.72,
      "grad_norm": 28.65892791748047,
      "kl": 0.0,
      "learning_rate": 1.4197853964930645e-07,
      "logps/chosen": -171.02603149414062,
      "logps/rejected": -222.0957489013672,
      "loss": 0.3054,
      "rewards/chosen": -0.5188900232315063,
      "rewards/margins": 1.9518908262252808,
      "rewards/rejected": -2.470780849456787,
      "step": 2736
    },
    {
      "epoch": 0.72,
      "grad_norm": 28.866334915161133,
      "kl": 0.0,
      "learning_rate": 1.4184768385239464e-07,
      "logps/chosen": -207.8139190673828,
      "logps/rejected": -366.18408203125,
      "loss": 0.2341,
      "rewards/chosen": 1.4074229001998901,
      "rewards/margins": 5.109437465667725,
      "rewards/rejected": -3.702014446258545,
      "step": 2737
    },
    {
      "epoch": 0.72,
      "grad_norm": 26.634572982788086,
      "kl": 0.0,
      "learning_rate": 1.4171682805548286e-07,
      "logps/chosen": -260.7043762207031,
      "logps/rejected": -213.9114990234375,
      "loss": 0.2407,
      "rewards/chosen": -0.022909751161932945,
      "rewards/margins": 4.45982027053833,
      "rewards/rejected": -4.482729911804199,
      "step": 2738
    },
    {
      "epoch": 0.72,
      "grad_norm": 32.87309646606445,
      "kl": 0.0,
      "learning_rate": 1.4158597225857106e-07,
      "logps/chosen": -147.40817260742188,
      "logps/rejected": -252.726806640625,
      "loss": 0.2776,
      "rewards/chosen": 1.218986988067627,
      "rewards/margins": 3.1168408393859863,
      "rewards/rejected": -1.8978537321090698,
      "step": 2739
    },
    {
      "epoch": 0.72,
      "grad_norm": 21.100400924682617,
      "kl": 0.0,
      "learning_rate": 1.4145511646165926e-07,
      "logps/chosen": -334.18707275390625,
      "logps/rejected": -321.79534912109375,
      "loss": 0.2434,
      "rewards/chosen": 0.3896353840827942,
      "rewards/margins": 4.801957607269287,
      "rewards/rejected": -4.412322044372559,
      "step": 2740
    },
    {
      "epoch": 0.72,
      "grad_norm": 43.7648811340332,
      "kl": 0.0,
      "learning_rate": 1.4132426066474745e-07,
      "logps/chosen": -217.05458068847656,
      "logps/rejected": -253.01095581054688,
      "loss": 0.3203,
      "rewards/chosen": 0.4657108783721924,
      "rewards/margins": 3.0607008934020996,
      "rewards/rejected": -2.5949900150299072,
      "step": 2741
    },
    {
      "epoch": 0.72,
      "grad_norm": 26.86048698425293,
      "kl": 0.0,
      "learning_rate": 1.4119340486783562e-07,
      "logps/chosen": -216.07846069335938,
      "logps/rejected": -217.1433563232422,
      "loss": 0.1815,
      "rewards/chosen": 0.9621667265892029,
      "rewards/margins": 4.179025173187256,
      "rewards/rejected": -3.2168586254119873,
      "step": 2742
    },
    {
      "epoch": 0.72,
      "grad_norm": 34.33148193359375,
      "kl": 0.0,
      "learning_rate": 1.4106254907092382e-07,
      "logps/chosen": -208.30294799804688,
      "logps/rejected": -191.87002563476562,
      "loss": 0.3044,
      "rewards/chosen": 0.46693187952041626,
      "rewards/margins": 4.43991231918335,
      "rewards/rejected": -3.972980499267578,
      "step": 2743
    },
    {
      "epoch": 0.72,
      "grad_norm": 36.13833236694336,
      "kl": 0.0,
      "learning_rate": 1.4093169327401201e-07,
      "logps/chosen": -193.1121368408203,
      "logps/rejected": -216.50210571289062,
      "loss": 0.3074,
      "rewards/chosen": 0.9344757795333862,
      "rewards/margins": 3.430565357208252,
      "rewards/rejected": -2.496089458465576,
      "step": 2744
    },
    {
      "epoch": 0.72,
      "grad_norm": 31.421340942382812,
      "kl": 0.0,
      "learning_rate": 1.4080083747710024e-07,
      "logps/chosen": -174.89791870117188,
      "logps/rejected": -237.71141052246094,
      "loss": 0.3358,
      "rewards/chosen": 0.9983955025672913,
      "rewards/margins": 4.591882705688477,
      "rewards/rejected": -3.59348726272583,
      "step": 2745
    },
    {
      "epoch": 0.72,
      "grad_norm": 48.589500427246094,
      "kl": 0.0,
      "learning_rate": 1.4066998168018843e-07,
      "logps/chosen": -248.13365173339844,
      "logps/rejected": -260.446533203125,
      "loss": 0.3376,
      "rewards/chosen": 0.1320873200893402,
      "rewards/margins": 3.9734907150268555,
      "rewards/rejected": -3.8414034843444824,
      "step": 2746
    },
    {
      "epoch": 0.72,
      "grad_norm": 35.76465606689453,
      "kl": 0.0,
      "learning_rate": 1.4053912588327663e-07,
      "logps/chosen": -162.7195587158203,
      "logps/rejected": -221.4072723388672,
      "loss": 0.2968,
      "rewards/chosen": 1.0338228940963745,
      "rewards/margins": 3.8434853553771973,
      "rewards/rejected": -2.809662342071533,
      "step": 2747
    },
    {
      "epoch": 0.72,
      "grad_norm": 37.44093322753906,
      "kl": 0.0,
      "learning_rate": 1.4040827008636483e-07,
      "logps/chosen": -219.79983520507812,
      "logps/rejected": -173.9461212158203,
      "loss": 0.3516,
      "rewards/chosen": 0.971840500831604,
      "rewards/margins": 5.520664215087891,
      "rewards/rejected": -4.548823833465576,
      "step": 2748
    },
    {
      "epoch": 0.72,
      "grad_norm": 35.50620651245117,
      "kl": 0.0,
      "learning_rate": 1.4027741428945302e-07,
      "logps/chosen": -127.27174377441406,
      "logps/rejected": -257.6485595703125,
      "loss": 0.2437,
      "rewards/chosen": 0.9993495941162109,
      "rewards/margins": 4.665413856506348,
      "rewards/rejected": -3.666064500808716,
      "step": 2749
    },
    {
      "epoch": 0.72,
      "grad_norm": 36.798675537109375,
      "kl": 0.0,
      "learning_rate": 1.401465584925412e-07,
      "logps/chosen": -230.24996948242188,
      "logps/rejected": -275.0447082519531,
      "loss": 0.2929,
      "rewards/chosen": 2.6916356086730957,
      "rewards/margins": 5.939266204833984,
      "rewards/rejected": -3.2476305961608887,
      "step": 2750
    },
    {
      "epoch": 0.72,
      "grad_norm": 34.59363555908203,
      "kl": 0.0,
      "learning_rate": 1.4001570269562941e-07,
      "logps/chosen": -182.05906677246094,
      "logps/rejected": -231.069091796875,
      "loss": 0.2712,
      "rewards/chosen": 1.328806757926941,
      "rewards/margins": 4.8035783767700195,
      "rewards/rejected": -3.474771738052368,
      "step": 2751
    },
    {
      "epoch": 0.72,
      "grad_norm": 35.60285949707031,
      "kl": 0.0,
      "learning_rate": 1.398848468987176e-07,
      "logps/chosen": -207.7548065185547,
      "logps/rejected": -264.8133544921875,
      "loss": 0.2353,
      "rewards/chosen": 0.33422619104385376,
      "rewards/margins": 3.725780963897705,
      "rewards/rejected": -3.391554832458496,
      "step": 2752
    },
    {
      "epoch": 0.72,
      "grad_norm": 41.25855255126953,
      "kl": 0.0,
      "learning_rate": 1.397539911018058e-07,
      "logps/chosen": -221.4940948486328,
      "logps/rejected": -160.0779571533203,
      "loss": 0.2543,
      "rewards/chosen": 0.46627089381217957,
      "rewards/margins": 2.0166687965393066,
      "rewards/rejected": -1.5503979921340942,
      "step": 2753
    },
    {
      "epoch": 0.72,
      "grad_norm": 29.777326583862305,
      "kl": 0.0,
      "learning_rate": 1.39623135304894e-07,
      "logps/chosen": -112.76001739501953,
      "logps/rejected": -175.68617248535156,
      "loss": 0.2654,
      "rewards/chosen": 2.5609025955200195,
      "rewards/margins": 4.485593318939209,
      "rewards/rejected": -1.9246906042099,
      "step": 2754
    },
    {
      "epoch": 0.72,
      "grad_norm": 31.612537384033203,
      "kl": 0.0,
      "learning_rate": 1.394922795079822e-07,
      "logps/chosen": -228.182373046875,
      "logps/rejected": -271.0409851074219,
      "loss": 0.2301,
      "rewards/chosen": 0.5148449540138245,
      "rewards/margins": 3.6974213123321533,
      "rewards/rejected": -3.1825764179229736,
      "step": 2755
    },
    {
      "epoch": 0.72,
      "grad_norm": 28.019922256469727,
      "kl": 0.0,
      "learning_rate": 1.393614237110704e-07,
      "logps/chosen": -129.9990997314453,
      "logps/rejected": -211.51722717285156,
      "loss": 0.2257,
      "rewards/chosen": 0.5919703245162964,
      "rewards/margins": 3.691455364227295,
      "rewards/rejected": -3.099485158920288,
      "step": 2756
    },
    {
      "epoch": 0.72,
      "grad_norm": 33.64453887939453,
      "kl": 0.0,
      "learning_rate": 1.3923056791415862e-07,
      "logps/chosen": -292.7642822265625,
      "logps/rejected": -293.0862731933594,
      "loss": 0.2164,
      "rewards/chosen": 0.4971492290496826,
      "rewards/margins": 4.758174896240234,
      "rewards/rejected": -4.261025905609131,
      "step": 2757
    },
    {
      "epoch": 0.72,
      "grad_norm": 33.85685348510742,
      "kl": 0.0,
      "learning_rate": 1.3909971211724679e-07,
      "logps/chosen": -134.65257263183594,
      "logps/rejected": -253.585693359375,
      "loss": 0.2708,
      "rewards/chosen": 1.0100104808807373,
      "rewards/margins": 3.8374264240264893,
      "rewards/rejected": -2.827415943145752,
      "step": 2758
    },
    {
      "epoch": 0.72,
      "grad_norm": 31.849849700927734,
      "kl": 0.0,
      "learning_rate": 1.3896885632033498e-07,
      "logps/chosen": -227.2148895263672,
      "logps/rejected": -259.6429138183594,
      "loss": 0.2142,
      "rewards/chosen": 2.4732985496520996,
      "rewards/margins": 5.716508865356445,
      "rewards/rejected": -3.2432103157043457,
      "step": 2759
    },
    {
      "epoch": 0.72,
      "grad_norm": 43.46824264526367,
      "kl": 0.0,
      "learning_rate": 1.3883800052342318e-07,
      "logps/chosen": -159.94631958007812,
      "logps/rejected": -315.32855224609375,
      "loss": 0.2885,
      "rewards/chosen": -0.026400430127978325,
      "rewards/margins": 3.871565103530884,
      "rewards/rejected": -3.897965431213379,
      "step": 2760
    },
    {
      "epoch": 0.72,
      "grad_norm": 33.06340789794922,
      "kl": 0.0,
      "learning_rate": 1.3870714472651137e-07,
      "logps/chosen": -186.7371063232422,
      "logps/rejected": -278.01531982421875,
      "loss": 0.1404,
      "rewards/chosen": 2.455404758453369,
      "rewards/margins": 7.5706915855407715,
      "rewards/rejected": -5.115286827087402,
      "step": 2761
    },
    {
      "epoch": 0.72,
      "grad_norm": 34.241844177246094,
      "kl": 0.0,
      "learning_rate": 1.3857628892959957e-07,
      "logps/chosen": -138.23428344726562,
      "logps/rejected": -227.390625,
      "loss": 0.3009,
      "rewards/chosen": -0.31029558181762695,
      "rewards/margins": 2.701427459716797,
      "rewards/rejected": -3.011723041534424,
      "step": 2762
    },
    {
      "epoch": 0.72,
      "grad_norm": 33.2817268371582,
      "kl": 0.0,
      "learning_rate": 1.3844543313268777e-07,
      "logps/chosen": -150.46156311035156,
      "logps/rejected": -204.14328002929688,
      "loss": 0.2398,
      "rewards/chosen": 0.7092301845550537,
      "rewards/margins": 3.1490092277526855,
      "rewards/rejected": -2.439779043197632,
      "step": 2763
    },
    {
      "epoch": 0.72,
      "grad_norm": 41.400272369384766,
      "kl": 0.0,
      "learning_rate": 1.38314577335776e-07,
      "logps/chosen": -237.2517852783203,
      "logps/rejected": -275.0649719238281,
      "loss": 0.3358,
      "rewards/chosen": -1.6412899494171143,
      "rewards/margins": 0.6008193492889404,
      "rewards/rejected": -2.2421092987060547,
      "step": 2764
    },
    {
      "epoch": 0.72,
      "grad_norm": 37.423851013183594,
      "kl": 0.0,
      "learning_rate": 1.3818372153886419e-07,
      "logps/chosen": -294.60052490234375,
      "logps/rejected": -282.4591064453125,
      "loss": 0.3369,
      "rewards/chosen": -0.7444419264793396,
      "rewards/margins": 2.0405898094177246,
      "rewards/rejected": -2.785031795501709,
      "step": 2765
    },
    {
      "epoch": 0.72,
      "grad_norm": 30.79787826538086,
      "kl": 0.0,
      "learning_rate": 1.3805286574195235e-07,
      "logps/chosen": -246.27394104003906,
      "logps/rejected": -155.23995971679688,
      "loss": 0.2902,
      "rewards/chosen": 0.21204765141010284,
      "rewards/margins": 3.1054985523223877,
      "rewards/rejected": -2.893450975418091,
      "step": 2766
    },
    {
      "epoch": 0.72,
      "grad_norm": 27.49995994567871,
      "kl": 0.0,
      "learning_rate": 1.3792200994504055e-07,
      "logps/chosen": -182.4592742919922,
      "logps/rejected": -204.7582244873047,
      "loss": 0.242,
      "rewards/chosen": 1.772039771080017,
      "rewards/margins": 4.657109260559082,
      "rewards/rejected": -2.8850696086883545,
      "step": 2767
    },
    {
      "epoch": 0.72,
      "grad_norm": 29.28543472290039,
      "kl": 0.0,
      "learning_rate": 1.3779115414812875e-07,
      "logps/chosen": -198.22662353515625,
      "logps/rejected": -167.86358642578125,
      "loss": 0.2965,
      "rewards/chosen": 1.0747374296188354,
      "rewards/margins": 4.275830268859863,
      "rewards/rejected": -3.2010929584503174,
      "step": 2768
    },
    {
      "epoch": 0.72,
      "grad_norm": 37.219173431396484,
      "kl": 0.0,
      "learning_rate": 1.3766029835121694e-07,
      "logps/chosen": -245.44570922851562,
      "logps/rejected": -235.1141357421875,
      "loss": 0.1944,
      "rewards/chosen": 2.7740979194641113,
      "rewards/margins": 5.648436546325684,
      "rewards/rejected": -2.8743388652801514,
      "step": 2769
    },
    {
      "epoch": 0.72,
      "grad_norm": 34.19056701660156,
      "kl": 0.0,
      "learning_rate": 1.3752944255430517e-07,
      "logps/chosen": -313.8372497558594,
      "logps/rejected": -224.35574340820312,
      "loss": 0.1986,
      "rewards/chosen": 1.6918792724609375,
      "rewards/margins": 4.29853630065918,
      "rewards/rejected": -2.606657028198242,
      "step": 2770
    },
    {
      "epoch": 0.73,
      "grad_norm": 32.03276824951172,
      "kl": 0.0,
      "learning_rate": 1.3739858675739336e-07,
      "logps/chosen": -176.02622985839844,
      "logps/rejected": -199.72654724121094,
      "loss": 0.3817,
      "rewards/chosen": -0.32908573746681213,
      "rewards/margins": 1.3851908445358276,
      "rewards/rejected": -1.7142765522003174,
      "step": 2771
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.00090789794922,
      "kl": 0.0,
      "learning_rate": 1.3726773096048156e-07,
      "logps/chosen": -252.97853088378906,
      "logps/rejected": -260.77093505859375,
      "loss": 0.2482,
      "rewards/chosen": 1.4872760772705078,
      "rewards/margins": 5.681765556335449,
      "rewards/rejected": -4.194489479064941,
      "step": 2772
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.82801818847656,
      "kl": 0.0,
      "learning_rate": 1.3713687516356973e-07,
      "logps/chosen": -259.81781005859375,
      "logps/rejected": -212.13023376464844,
      "loss": 0.2776,
      "rewards/chosen": 1.3029401302337646,
      "rewards/margins": 4.064812660217285,
      "rewards/rejected": -2.7618727684020996,
      "step": 2773
    },
    {
      "epoch": 0.73,
      "grad_norm": 30.624717712402344,
      "kl": 0.0,
      "learning_rate": 1.3700601936665792e-07,
      "logps/chosen": -237.0125732421875,
      "logps/rejected": -277.92010498046875,
      "loss": 0.2677,
      "rewards/chosen": 2.0963778495788574,
      "rewards/margins": 5.673805236816406,
      "rewards/rejected": -3.577427387237549,
      "step": 2774
    },
    {
      "epoch": 0.73,
      "grad_norm": 39.34622573852539,
      "kl": 0.0,
      "learning_rate": 1.3687516356974612e-07,
      "logps/chosen": -189.960205078125,
      "logps/rejected": -252.06198120117188,
      "loss": 0.2544,
      "rewards/chosen": 1.7920467853546143,
      "rewards/margins": 3.5940065383911133,
      "rewards/rejected": -1.801959753036499,
      "step": 2775
    },
    {
      "epoch": 0.73,
      "grad_norm": 42.03234100341797,
      "kl": 0.0,
      "learning_rate": 1.3674430777283432e-07,
      "logps/chosen": -265.41485595703125,
      "logps/rejected": -206.2362823486328,
      "loss": 0.3364,
      "rewards/chosen": 0.413890540599823,
      "rewards/margins": 3.1098921298980713,
      "rewards/rejected": -2.6960015296936035,
      "step": 2776
    },
    {
      "epoch": 0.73,
      "grad_norm": 31.57137680053711,
      "kl": 0.0,
      "learning_rate": 1.3661345197592254e-07,
      "logps/chosen": -189.6539764404297,
      "logps/rejected": -242.28109741210938,
      "loss": 0.2578,
      "rewards/chosen": 1.3802999258041382,
      "rewards/margins": 4.1886820793151855,
      "rewards/rejected": -2.808382034301758,
      "step": 2777
    },
    {
      "epoch": 0.73,
      "grad_norm": 29.047449111938477,
      "kl": 0.0,
      "learning_rate": 1.3648259617901073e-07,
      "logps/chosen": -260.63482666015625,
      "logps/rejected": -221.86618041992188,
      "loss": 0.3031,
      "rewards/chosen": -0.33293989300727844,
      "rewards/margins": 2.5053622722625732,
      "rewards/rejected": -2.8383021354675293,
      "step": 2778
    },
    {
      "epoch": 0.73,
      "grad_norm": 26.14084243774414,
      "kl": 0.0,
      "learning_rate": 1.3635174038209893e-07,
      "logps/chosen": -132.5585479736328,
      "logps/rejected": -174.09848022460938,
      "loss": 0.2469,
      "rewards/chosen": 0.8879119157791138,
      "rewards/margins": 3.6658644676208496,
      "rewards/rejected": -2.7779526710510254,
      "step": 2779
    },
    {
      "epoch": 0.73,
      "grad_norm": 41.65951156616211,
      "kl": 0.0,
      "learning_rate": 1.3622088458518713e-07,
      "logps/chosen": -217.3573455810547,
      "logps/rejected": -331.310546875,
      "loss": 0.2999,
      "rewards/chosen": 1.362498164176941,
      "rewards/margins": 3.556079864501953,
      "rewards/rejected": -2.1935815811157227,
      "step": 2780
    },
    {
      "epoch": 0.73,
      "grad_norm": 37.427066802978516,
      "kl": 0.0,
      "learning_rate": 1.360900287882753e-07,
      "logps/chosen": -218.29087829589844,
      "logps/rejected": -271.3686218261719,
      "loss": 0.3319,
      "rewards/chosen": -0.09768380224704742,
      "rewards/margins": 2.889094829559326,
      "rewards/rejected": -2.986778736114502,
      "step": 2781
    },
    {
      "epoch": 0.73,
      "grad_norm": 39.8854866027832,
      "kl": 0.0,
      "learning_rate": 1.359591729913635e-07,
      "logps/chosen": -164.79013061523438,
      "logps/rejected": -248.52499389648438,
      "loss": 0.297,
      "rewards/chosen": -0.26507285237312317,
      "rewards/margins": 3.319533109664917,
      "rewards/rejected": -3.5846059322357178,
      "step": 2782
    },
    {
      "epoch": 0.73,
      "grad_norm": 30.36602783203125,
      "kl": 0.0,
      "learning_rate": 1.3582831719445171e-07,
      "logps/chosen": -215.8096160888672,
      "logps/rejected": -179.60113525390625,
      "loss": 0.2619,
      "rewards/chosen": -0.6932493448257446,
      "rewards/margins": 1.766062617301941,
      "rewards/rejected": -2.4593119621276855,
      "step": 2783
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.83140182495117,
      "kl": 0.0,
      "learning_rate": 1.356974613975399e-07,
      "logps/chosen": -269.39556884765625,
      "logps/rejected": -259.3585205078125,
      "loss": 0.2726,
      "rewards/chosen": 1.2403514385223389,
      "rewards/margins": 4.321313858032227,
      "rewards/rejected": -3.0809621810913086,
      "step": 2784
    },
    {
      "epoch": 0.73,
      "grad_norm": 35.393802642822266,
      "kl": 0.0,
      "learning_rate": 1.355666056006281e-07,
      "logps/chosen": -228.4115447998047,
      "logps/rejected": -283.6307373046875,
      "loss": 0.272,
      "rewards/chosen": 1.7523528337478638,
      "rewards/margins": 5.948849678039551,
      "rewards/rejected": -4.196496963500977,
      "step": 2785
    },
    {
      "epoch": 0.73,
      "grad_norm": 26.124181747436523,
      "kl": 0.0,
      "learning_rate": 1.354357498037163e-07,
      "logps/chosen": -196.4839324951172,
      "logps/rejected": -228.32936096191406,
      "loss": 0.2173,
      "rewards/chosen": 0.9325391054153442,
      "rewards/margins": 4.881499767303467,
      "rewards/rejected": -3.948960781097412,
      "step": 2786
    },
    {
      "epoch": 0.73,
      "grad_norm": 38.89645767211914,
      "kl": 0.0,
      "learning_rate": 1.353048940068045e-07,
      "logps/chosen": -178.40771484375,
      "logps/rejected": -188.349853515625,
      "loss": 0.4389,
      "rewards/chosen": -0.047868043184280396,
      "rewards/margins": 1.500084400177002,
      "rewards/rejected": -1.54795241355896,
      "step": 2787
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.04124450683594,
      "kl": 0.0,
      "learning_rate": 1.351740382098927e-07,
      "logps/chosen": -254.98538208007812,
      "logps/rejected": -298.51654052734375,
      "loss": 0.3969,
      "rewards/chosen": 0.29132279753685,
      "rewards/margins": 4.159118175506592,
      "rewards/rejected": -3.86779522895813,
      "step": 2788
    },
    {
      "epoch": 0.73,
      "grad_norm": 27.67196273803711,
      "kl": 0.0,
      "learning_rate": 1.3504318241298086e-07,
      "logps/chosen": -174.4413299560547,
      "logps/rejected": -330.2989807128906,
      "loss": 0.18,
      "rewards/chosen": 0.7002738118171692,
      "rewards/margins": 5.744529724121094,
      "rewards/rejected": -5.04425573348999,
      "step": 2789
    },
    {
      "epoch": 0.73,
      "grad_norm": 33.7601203918457,
      "kl": 0.0,
      "learning_rate": 1.349123266160691e-07,
      "logps/chosen": -149.75143432617188,
      "logps/rejected": -281.0748596191406,
      "loss": 0.2179,
      "rewards/chosen": 1.7364554405212402,
      "rewards/margins": 6.038804531097412,
      "rewards/rejected": -4.302349090576172,
      "step": 2790
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.44429016113281,
      "kl": 0.0,
      "learning_rate": 1.3478147081915728e-07,
      "logps/chosen": -235.973876953125,
      "logps/rejected": -297.8359375,
      "loss": 0.2328,
      "rewards/chosen": 1.891502857208252,
      "rewards/margins": 6.574864387512207,
      "rewards/rejected": -4.683361530303955,
      "step": 2791
    },
    {
      "epoch": 0.73,
      "grad_norm": 37.90352249145508,
      "kl": 0.0,
      "learning_rate": 1.3465061502224548e-07,
      "logps/chosen": -108.64974212646484,
      "logps/rejected": -247.4210205078125,
      "loss": 0.2774,
      "rewards/chosen": 1.4293962717056274,
      "rewards/margins": 3.7838568687438965,
      "rewards/rejected": -2.3544607162475586,
      "step": 2792
    },
    {
      "epoch": 0.73,
      "grad_norm": 34.55895233154297,
      "kl": 0.0,
      "learning_rate": 1.3451975922533368e-07,
      "logps/chosen": -239.15586853027344,
      "logps/rejected": -230.9355010986328,
      "loss": 0.3273,
      "rewards/chosen": 0.25174659490585327,
      "rewards/margins": 3.4415252208709717,
      "rewards/rejected": -3.1897785663604736,
      "step": 2793
    },
    {
      "epoch": 0.73,
      "grad_norm": 39.81635284423828,
      "kl": 0.0,
      "learning_rate": 1.3438890342842187e-07,
      "logps/chosen": -253.4542236328125,
      "logps/rejected": -341.01483154296875,
      "loss": 0.2496,
      "rewards/chosen": 0.5031195878982544,
      "rewards/margins": 4.771719932556152,
      "rewards/rejected": -4.2686004638671875,
      "step": 2794
    },
    {
      "epoch": 0.73,
      "grad_norm": 29.70501136779785,
      "kl": 0.0,
      "learning_rate": 1.3425804763151007e-07,
      "logps/chosen": -154.07347106933594,
      "logps/rejected": -212.02188110351562,
      "loss": 0.2204,
      "rewards/chosen": 0.0629243478178978,
      "rewards/margins": 2.979095220565796,
      "rewards/rejected": -2.916170835494995,
      "step": 2795
    },
    {
      "epoch": 0.73,
      "grad_norm": 25.16206932067871,
      "kl": 0.0,
      "learning_rate": 1.341271918345983e-07,
      "logps/chosen": -204.69512939453125,
      "logps/rejected": -185.43121337890625,
      "loss": 0.1135,
      "rewards/chosen": 2.616105318069458,
      "rewards/margins": 5.921466827392578,
      "rewards/rejected": -3.305361747741699,
      "step": 2796
    },
    {
      "epoch": 0.73,
      "grad_norm": 35.79527282714844,
      "kl": 0.0,
      "learning_rate": 1.3399633603768646e-07,
      "logps/chosen": -273.466552734375,
      "logps/rejected": -260.4999084472656,
      "loss": 0.2636,
      "rewards/chosen": -0.1472821831703186,
      "rewards/margins": 4.861036777496338,
      "rewards/rejected": -5.008318901062012,
      "step": 2797
    },
    {
      "epoch": 0.73,
      "grad_norm": 32.936561584472656,
      "kl": 0.0,
      "learning_rate": 1.3386548024077466e-07,
      "logps/chosen": -256.4706115722656,
      "logps/rejected": -284.8575744628906,
      "loss": 0.2715,
      "rewards/chosen": 0.46489885449409485,
      "rewards/margins": 3.55414080619812,
      "rewards/rejected": -3.0892419815063477,
      "step": 2798
    },
    {
      "epoch": 0.73,
      "grad_norm": 29.936857223510742,
      "kl": 0.0,
      "learning_rate": 1.3373462444386285e-07,
      "logps/chosen": -218.6918487548828,
      "logps/rejected": -254.4846649169922,
      "loss": 0.2718,
      "rewards/chosen": 0.6963696479797363,
      "rewards/margins": 3.5298426151275635,
      "rewards/rejected": -2.833472967147827,
      "step": 2799
    },
    {
      "epoch": 0.73,
      "grad_norm": 40.529151916503906,
      "kl": 0.0,
      "learning_rate": 1.3360376864695105e-07,
      "logps/chosen": -198.93524169921875,
      "logps/rejected": -221.15386962890625,
      "loss": 0.2807,
      "rewards/chosen": 1.1004393100738525,
      "rewards/margins": 5.532849311828613,
      "rewards/rejected": -4.43241024017334,
      "step": 2800
    },
    {
      "epoch": 0.73,
      "grad_norm": 39.31686019897461,
      "kl": 0.0,
      "learning_rate": 1.3347291285003924e-07,
      "logps/chosen": -207.00869750976562,
      "logps/rejected": -250.30758666992188,
      "loss": 0.2649,
      "rewards/chosen": -0.03526926040649414,
      "rewards/margins": 3.3345630168914795,
      "rewards/rejected": -3.3698322772979736,
      "step": 2801
    },
    {
      "epoch": 0.73,
      "grad_norm": 26.114871978759766,
      "kl": 0.0,
      "learning_rate": 1.3334205705312747e-07,
      "logps/chosen": -204.65841674804688,
      "logps/rejected": -247.81874084472656,
      "loss": 0.2696,
      "rewards/chosen": 0.9131640195846558,
      "rewards/margins": 4.818316459655762,
      "rewards/rejected": -3.9051523208618164,
      "step": 2802
    },
    {
      "epoch": 0.73,
      "grad_norm": 35.78491973876953,
      "kl": 0.0,
      "learning_rate": 1.3321120125621566e-07,
      "logps/chosen": -251.0792999267578,
      "logps/rejected": -247.90907287597656,
      "loss": 0.2803,
      "rewards/chosen": 1.001746654510498,
      "rewards/margins": 4.692512512207031,
      "rewards/rejected": -3.6907660961151123,
      "step": 2803
    },
    {
      "epoch": 0.73,
      "grad_norm": 31.028039932250977,
      "kl": 0.0,
      "learning_rate": 1.3308034545930383e-07,
      "logps/chosen": -150.0222930908203,
      "logps/rejected": -263.18804931640625,
      "loss": 0.2219,
      "rewards/chosen": 0.88972407579422,
      "rewards/margins": 4.433566570281982,
      "rewards/rejected": -3.543842315673828,
      "step": 2804
    },
    {
      "epoch": 0.73,
      "grad_norm": 32.93028259277344,
      "kl": 0.0,
      "learning_rate": 1.3294948966239203e-07,
      "logps/chosen": -160.91116333007812,
      "logps/rejected": -246.55479431152344,
      "loss": 0.2802,
      "rewards/chosen": 0.5891302824020386,
      "rewards/margins": 3.9440999031066895,
      "rewards/rejected": -3.3549697399139404,
      "step": 2805
    },
    {
      "epoch": 0.73,
      "grad_norm": 33.47201919555664,
      "kl": 0.0,
      "learning_rate": 1.3281863386548022e-07,
      "logps/chosen": -284.26470947265625,
      "logps/rejected": -261.2138671875,
      "loss": 0.2462,
      "rewards/chosen": 0.9916735887527466,
      "rewards/margins": 5.291661739349365,
      "rewards/rejected": -4.299988269805908,
      "step": 2806
    },
    {
      "epoch": 0.73,
      "grad_norm": 31.195194244384766,
      "kl": 0.0,
      "learning_rate": 1.3268777806856842e-07,
      "logps/chosen": -223.95889282226562,
      "logps/rejected": -220.4429931640625,
      "loss": 0.2153,
      "rewards/chosen": 0.1629033088684082,
      "rewards/margins": 3.741335153579712,
      "rewards/rejected": -3.5784318447113037,
      "step": 2807
    },
    {
      "epoch": 0.73,
      "grad_norm": 36.86294174194336,
      "kl": 0.0,
      "learning_rate": 1.3255692227165662e-07,
      "logps/chosen": -288.4587097167969,
      "logps/rejected": -332.08062744140625,
      "loss": 0.1844,
      "rewards/chosen": 2.108771800994873,
      "rewards/margins": 7.597304344177246,
      "rewards/rejected": -5.488532543182373,
      "step": 2808
    },
    {
      "epoch": 0.74,
      "grad_norm": 27.21337890625,
      "kl": 0.0,
      "learning_rate": 1.3242606647474484e-07,
      "logps/chosen": -178.190185546875,
      "logps/rejected": -295.3204650878906,
      "loss": 0.1621,
      "rewards/chosen": 2.2621607780456543,
      "rewards/margins": 5.234269142150879,
      "rewards/rejected": -2.9721086025238037,
      "step": 2809
    },
    {
      "epoch": 0.74,
      "grad_norm": 32.42479705810547,
      "kl": 0.0,
      "learning_rate": 1.3229521067783304e-07,
      "logps/chosen": -267.2407531738281,
      "logps/rejected": -195.87710571289062,
      "loss": 0.2883,
      "rewards/chosen": 0.8129584789276123,
      "rewards/margins": 3.803115129470825,
      "rewards/rejected": -2.990156650543213,
      "step": 2810
    },
    {
      "epoch": 0.74,
      "grad_norm": 40.45661926269531,
      "kl": 0.0,
      "learning_rate": 1.3216435488092123e-07,
      "logps/chosen": -194.41912841796875,
      "logps/rejected": -258.4333801269531,
      "loss": 0.2662,
      "rewards/chosen": 0.8145712018013,
      "rewards/margins": 4.6983819007873535,
      "rewards/rejected": -3.8838107585906982,
      "step": 2811
    },
    {
      "epoch": 0.74,
      "grad_norm": 27.655622482299805,
      "kl": 0.0,
      "learning_rate": 1.320334990840094e-07,
      "logps/chosen": -263.7073059082031,
      "logps/rejected": -285.5018005371094,
      "loss": 0.2301,
      "rewards/chosen": 2.682894229888916,
      "rewards/margins": 6.3988237380981445,
      "rewards/rejected": -3.7159295082092285,
      "step": 2812
    },
    {
      "epoch": 0.74,
      "grad_norm": 42.45039367675781,
      "kl": 0.0,
      "learning_rate": 1.319026432870976e-07,
      "logps/chosen": -208.87033081054688,
      "logps/rejected": -189.85069274902344,
      "loss": 0.3343,
      "rewards/chosen": 0.5289901494979858,
      "rewards/margins": 3.121410369873047,
      "rewards/rejected": -2.5924201011657715,
      "step": 2813
    },
    {
      "epoch": 0.74,
      "grad_norm": 33.07746505737305,
      "kl": 0.0,
      "learning_rate": 1.317717874901858e-07,
      "logps/chosen": -325.9510803222656,
      "logps/rejected": -175.13290405273438,
      "loss": 0.2399,
      "rewards/chosen": 1.947821855545044,
      "rewards/margins": 6.143939971923828,
      "rewards/rejected": -4.196118354797363,
      "step": 2814
    },
    {
      "epoch": 0.74,
      "grad_norm": 31.211721420288086,
      "kl": 0.0,
      "learning_rate": 1.3164093169327402e-07,
      "logps/chosen": -134.55836486816406,
      "logps/rejected": -330.57147216796875,
      "loss": 0.2007,
      "rewards/chosen": 1.4923220872879028,
      "rewards/margins": 4.616800785064697,
      "rewards/rejected": -3.124478578567505,
      "step": 2815
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.78199005126953,
      "kl": 0.0,
      "learning_rate": 1.315100758963622e-07,
      "logps/chosen": -298.7881774902344,
      "logps/rejected": -305.9605407714844,
      "loss": 0.2001,
      "rewards/chosen": 2.4990897178649902,
      "rewards/margins": 6.823459148406982,
      "rewards/rejected": -4.324369430541992,
      "step": 2816
    },
    {
      "epoch": 0.74,
      "grad_norm": 27.847217559814453,
      "kl": 0.0,
      "learning_rate": 1.313792200994504e-07,
      "logps/chosen": -208.53501892089844,
      "logps/rejected": -265.9464111328125,
      "loss": 0.172,
      "rewards/chosen": 2.3986852169036865,
      "rewards/margins": 6.262216091156006,
      "rewards/rejected": -3.8635308742523193,
      "step": 2817
    },
    {
      "epoch": 0.74,
      "grad_norm": 24.946470260620117,
      "kl": 0.0,
      "learning_rate": 1.312483643025386e-07,
      "logps/chosen": -153.75546264648438,
      "logps/rejected": -194.64955139160156,
      "loss": 0.2043,
      "rewards/chosen": 1.8664082288742065,
      "rewards/margins": 5.680118083953857,
      "rewards/rejected": -3.8137097358703613,
      "step": 2818
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.00639343261719,
      "kl": 0.0,
      "learning_rate": 1.311175085056268e-07,
      "logps/chosen": -213.54026794433594,
      "logps/rejected": -204.0853271484375,
      "loss": 0.2822,
      "rewards/chosen": 1.1439380645751953,
      "rewards/margins": 3.6970744132995605,
      "rewards/rejected": -2.5531363487243652,
      "step": 2819
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.658931732177734,
      "kl": 0.0,
      "learning_rate": 1.3098665270871497e-07,
      "logps/chosen": -280.839599609375,
      "logps/rejected": -277.06951904296875,
      "loss": 0.2256,
      "rewards/chosen": 1.5120201110839844,
      "rewards/margins": 5.279638290405273,
      "rewards/rejected": -3.76761794090271,
      "step": 2820
    },
    {
      "epoch": 0.74,
      "grad_norm": 33.02054977416992,
      "kl": 0.0,
      "learning_rate": 1.3085579691180317e-07,
      "logps/chosen": -209.41510009765625,
      "logps/rejected": -184.58187866210938,
      "loss": 0.1982,
      "rewards/chosen": 1.890760064125061,
      "rewards/margins": 5.372137069702148,
      "rewards/rejected": -3.481376886367798,
      "step": 2821
    },
    {
      "epoch": 0.74,
      "grad_norm": 29.64247703552246,
      "kl": 0.0,
      "learning_rate": 1.307249411148914e-07,
      "logps/chosen": -244.3789825439453,
      "logps/rejected": -236.93154907226562,
      "loss": 0.2085,
      "rewards/chosen": 2.0428965091705322,
      "rewards/margins": 5.614164352416992,
      "rewards/rejected": -3.571267604827881,
      "step": 2822
    },
    {
      "epoch": 0.74,
      "grad_norm": 39.05708312988281,
      "kl": 0.0,
      "learning_rate": 1.3059408531797958e-07,
      "logps/chosen": -322.265869140625,
      "logps/rejected": -282.5099182128906,
      "loss": 0.2222,
      "rewards/chosen": 0.1460273712873459,
      "rewards/margins": 3.9599175453186035,
      "rewards/rejected": -3.813890218734741,
      "step": 2823
    },
    {
      "epoch": 0.74,
      "grad_norm": 49.731666564941406,
      "kl": 0.0,
      "learning_rate": 1.3046322952106778e-07,
      "logps/chosen": -335.8645324707031,
      "logps/rejected": -237.4248504638672,
      "loss": 0.1767,
      "rewards/chosen": 0.3891473412513733,
      "rewards/margins": 3.2716710567474365,
      "rewards/rejected": -2.882523775100708,
      "step": 2824
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.1038818359375,
      "kl": 0.0,
      "learning_rate": 1.3033237372415598e-07,
      "logps/chosen": -188.07669067382812,
      "logps/rejected": -259.12640380859375,
      "loss": 0.2208,
      "rewards/chosen": 2.7795515060424805,
      "rewards/margins": 5.237063884735107,
      "rewards/rejected": -2.457512378692627,
      "step": 2825
    },
    {
      "epoch": 0.74,
      "grad_norm": 38.49484634399414,
      "kl": 0.0,
      "learning_rate": 1.3020151792724417e-07,
      "logps/chosen": -215.10951232910156,
      "logps/rejected": -226.5205535888672,
      "loss": 0.2578,
      "rewards/chosen": 0.7708897590637207,
      "rewards/margins": 1.8423322439193726,
      "rewards/rejected": -1.0714424848556519,
      "step": 2826
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.55825424194336,
      "kl": 0.0,
      "learning_rate": 1.3007066213033237e-07,
      "logps/chosen": -181.13351440429688,
      "logps/rejected": -340.60955810546875,
      "loss": 0.1785,
      "rewards/chosen": 0.8886696696281433,
      "rewards/margins": 5.78781795501709,
      "rewards/rejected": -4.899148464202881,
      "step": 2827
    },
    {
      "epoch": 0.74,
      "grad_norm": 51.14204025268555,
      "kl": 0.0,
      "learning_rate": 1.2993980633342057e-07,
      "logps/chosen": -184.99362182617188,
      "logps/rejected": -222.6909637451172,
      "loss": 0.2876,
      "rewards/chosen": 2.048811674118042,
      "rewards/margins": 4.70229959487915,
      "rewards/rejected": -2.6534879207611084,
      "step": 2828
    },
    {
      "epoch": 0.74,
      "grad_norm": 38.40785598754883,
      "kl": 0.0,
      "learning_rate": 1.2980895053650876e-07,
      "logps/chosen": -272.9462890625,
      "logps/rejected": -202.24850463867188,
      "loss": 0.2388,
      "rewards/chosen": 1.8347502946853638,
      "rewards/margins": 3.973668098449707,
      "rewards/rejected": -2.1389176845550537,
      "step": 2829
    },
    {
      "epoch": 0.74,
      "grad_norm": 32.16298294067383,
      "kl": 0.0,
      "learning_rate": 1.2967809473959696e-07,
      "logps/chosen": -195.91893005371094,
      "logps/rejected": -312.01727294921875,
      "loss": 0.2314,
      "rewards/chosen": 2.0726287364959717,
      "rewards/margins": 5.435080528259277,
      "rewards/rejected": -3.3624520301818848,
      "step": 2830
    },
    {
      "epoch": 0.74,
      "grad_norm": 33.658443450927734,
      "kl": 0.0,
      "learning_rate": 1.2954723894268515e-07,
      "logps/chosen": -183.74356079101562,
      "logps/rejected": -224.95396423339844,
      "loss": 0.1889,
      "rewards/chosen": 1.7647711038589478,
      "rewards/margins": 5.015651702880859,
      "rewards/rejected": -3.250880718231201,
      "step": 2831
    },
    {
      "epoch": 0.74,
      "grad_norm": 34.89668273925781,
      "kl": 0.0,
      "learning_rate": 1.2941638314577335e-07,
      "logps/chosen": -218.40185546875,
      "logps/rejected": -170.23614501953125,
      "loss": 0.2388,
      "rewards/chosen": 2.2100014686584473,
      "rewards/margins": 4.51767635345459,
      "rewards/rejected": -2.3076748847961426,
      "step": 2832
    },
    {
      "epoch": 0.74,
      "grad_norm": 32.520774841308594,
      "kl": 0.0,
      "learning_rate": 1.2928552734886155e-07,
      "logps/chosen": -300.9206848144531,
      "logps/rejected": -271.1351623535156,
      "loss": 0.279,
      "rewards/chosen": -0.4567300081253052,
      "rewards/margins": 4.208921909332275,
      "rewards/rejected": -4.665651798248291,
      "step": 2833
    },
    {
      "epoch": 0.74,
      "grad_norm": 38.00114059448242,
      "kl": 0.0,
      "learning_rate": 1.2915467155194977e-07,
      "logps/chosen": -155.7928924560547,
      "logps/rejected": -290.24359130859375,
      "loss": 0.2434,
      "rewards/chosen": 1.9330978393554688,
      "rewards/margins": 4.819847106933594,
      "rewards/rejected": -2.886749267578125,
      "step": 2834
    },
    {
      "epoch": 0.74,
      "grad_norm": 35.707950592041016,
      "kl": 0.0,
      "learning_rate": 1.2902381575503794e-07,
      "logps/chosen": -246.90481567382812,
      "logps/rejected": -271.83056640625,
      "loss": 0.2246,
      "rewards/chosen": 1.0874056816101074,
      "rewards/margins": 4.229583740234375,
      "rewards/rejected": -3.1421780586242676,
      "step": 2835
    },
    {
      "epoch": 0.74,
      "grad_norm": 35.23543167114258,
      "kl": 0.0,
      "learning_rate": 1.2889295995812613e-07,
      "logps/chosen": -158.85867309570312,
      "logps/rejected": -159.07968139648438,
      "loss": 0.2004,
      "rewards/chosen": 3.075439929962158,
      "rewards/margins": 6.421757698059082,
      "rewards/rejected": -3.346317768096924,
      "step": 2836
    },
    {
      "epoch": 0.74,
      "grad_norm": 36.72352600097656,
      "kl": 0.0,
      "learning_rate": 1.2876210416121433e-07,
      "logps/chosen": -207.18263244628906,
      "logps/rejected": -279.25860595703125,
      "loss": 0.1752,
      "rewards/chosen": 1.3812538385391235,
      "rewards/margins": 4.762180805206299,
      "rewards/rejected": -3.380927085876465,
      "step": 2837
    },
    {
      "epoch": 0.74,
      "grad_norm": 37.70661926269531,
      "kl": 0.0,
      "learning_rate": 1.2863124836430253e-07,
      "logps/chosen": -167.04473876953125,
      "logps/rejected": -321.56060791015625,
      "loss": 0.2574,
      "rewards/chosen": 2.6713783740997314,
      "rewards/margins": 5.929797172546387,
      "rewards/rejected": -3.2584190368652344,
      "step": 2838
    },
    {
      "epoch": 0.74,
      "grad_norm": 32.261695861816406,
      "kl": 0.0,
      "learning_rate": 1.2850039256739072e-07,
      "logps/chosen": -204.63113403320312,
      "logps/rejected": -239.1153564453125,
      "loss": 0.1671,
      "rewards/chosen": 1.1335726976394653,
      "rewards/margins": 5.587896347045898,
      "rewards/rejected": -4.454323768615723,
      "step": 2839
    },
    {
      "epoch": 0.74,
      "grad_norm": 30.68526268005371,
      "kl": 0.0,
      "learning_rate": 1.2836953677047892e-07,
      "logps/chosen": -189.3596954345703,
      "logps/rejected": -240.0169219970703,
      "loss": 0.2215,
      "rewards/chosen": 3.3140547275543213,
      "rewards/margins": 5.870977401733398,
      "rewards/rejected": -2.556922674179077,
      "step": 2840
    },
    {
      "epoch": 0.74,
      "grad_norm": 40.76029586791992,
      "kl": 0.0,
      "learning_rate": 1.2823868097356714e-07,
      "logps/chosen": -267.5643310546875,
      "logps/rejected": -378.8826599121094,
      "loss": 0.3176,
      "rewards/chosen": 0.5281890034675598,
      "rewards/margins": 5.403535842895508,
      "rewards/rejected": -4.875346660614014,
      "step": 2841
    },
    {
      "epoch": 0.74,
      "grad_norm": 31.456031799316406,
      "kl": 0.0,
      "learning_rate": 1.2810782517665534e-07,
      "logps/chosen": -191.51092529296875,
      "logps/rejected": -234.32913208007812,
      "loss": 0.2861,
      "rewards/chosen": 0.31234022974967957,
      "rewards/margins": 2.9550578594207764,
      "rewards/rejected": -2.6427175998687744,
      "step": 2842
    },
    {
      "epoch": 0.74,
      "grad_norm": 54.55989074707031,
      "kl": 0.0,
      "learning_rate": 1.279769693797435e-07,
      "logps/chosen": -226.59442138671875,
      "logps/rejected": -211.96656799316406,
      "loss": 0.3083,
      "rewards/chosen": 1.6426137685775757,
      "rewards/margins": 3.635915756225586,
      "rewards/rejected": -1.9933021068572998,
      "step": 2843
    },
    {
      "epoch": 0.74,
      "grad_norm": 30.520023345947266,
      "kl": 0.0,
      "learning_rate": 1.278461135828317e-07,
      "logps/chosen": -177.09706115722656,
      "logps/rejected": -190.50453186035156,
      "loss": 0.127,
      "rewards/chosen": 1.1707515716552734,
      "rewards/margins": 5.3002448081970215,
      "rewards/rejected": -4.129493236541748,
      "step": 2844
    },
    {
      "epoch": 0.74,
      "grad_norm": 43.148956298828125,
      "kl": 0.0,
      "learning_rate": 1.277152577859199e-07,
      "logps/chosen": -226.48245239257812,
      "logps/rejected": -201.1748046875,
      "loss": 0.3406,
      "rewards/chosen": -0.8820660710334778,
      "rewards/margins": 2.871274471282959,
      "rewards/rejected": -3.753340482711792,
      "step": 2845
    },
    {
      "epoch": 0.74,
      "grad_norm": 27.76190948486328,
      "kl": 0.0,
      "learning_rate": 1.275844019890081e-07,
      "logps/chosen": -201.8677978515625,
      "logps/rejected": -255.43519592285156,
      "loss": 0.1749,
      "rewards/chosen": 2.41351056098938,
      "rewards/margins": 7.772078514099121,
      "rewards/rejected": -5.358567714691162,
      "step": 2846
    },
    {
      "epoch": 0.75,
      "grad_norm": 25.458728790283203,
      "kl": 0.0,
      "learning_rate": 1.2745354619209632e-07,
      "logps/chosen": -207.4493408203125,
      "logps/rejected": -224.9928436279297,
      "loss": 0.1354,
      "rewards/chosen": 1.2176116704940796,
      "rewards/margins": 5.856996536254883,
      "rewards/rejected": -4.639384746551514,
      "step": 2847
    },
    {
      "epoch": 0.75,
      "grad_norm": 37.17304992675781,
      "kl": 0.0,
      "learning_rate": 1.2732269039518451e-07,
      "logps/chosen": -234.0641632080078,
      "logps/rejected": -266.6547546386719,
      "loss": 0.3049,
      "rewards/chosen": 1.5123035907745361,
      "rewards/margins": 3.934211254119873,
      "rewards/rejected": -2.421907663345337,
      "step": 2848
    },
    {
      "epoch": 0.75,
      "grad_norm": 31.614761352539062,
      "kl": 0.0,
      "learning_rate": 1.271918345982727e-07,
      "logps/chosen": -149.47503662109375,
      "logps/rejected": -312.2427978515625,
      "loss": 0.3124,
      "rewards/chosen": 0.877579391002655,
      "rewards/margins": 4.342172145843506,
      "rewards/rejected": -3.464592933654785,
      "step": 2849
    },
    {
      "epoch": 0.75,
      "grad_norm": 29.895217895507812,
      "kl": 0.0,
      "learning_rate": 1.270609788013609e-07,
      "logps/chosen": -204.3394012451172,
      "logps/rejected": -203.88108825683594,
      "loss": 0.2429,
      "rewards/chosen": 2.5909066200256348,
      "rewards/margins": 5.3905134201049805,
      "rewards/rejected": -2.799607038497925,
      "step": 2850
    },
    {
      "epoch": 0.75,
      "grad_norm": 29.323200225830078,
      "kl": 0.0,
      "learning_rate": 1.2693012300444908e-07,
      "logps/chosen": -200.34286499023438,
      "logps/rejected": -305.9087829589844,
      "loss": 0.3187,
      "rewards/chosen": 0.6447303295135498,
      "rewards/margins": 4.248477458953857,
      "rewards/rejected": -3.6037471294403076,
      "step": 2851
    },
    {
      "epoch": 0.75,
      "grad_norm": 33.479618072509766,
      "kl": 0.0,
      "learning_rate": 1.2679926720753727e-07,
      "logps/chosen": -211.47300720214844,
      "logps/rejected": -234.58995056152344,
      "loss": 0.1717,
      "rewards/chosen": 0.6005478501319885,
      "rewards/margins": 5.318211078643799,
      "rewards/rejected": -4.717663288116455,
      "step": 2852
    },
    {
      "epoch": 0.75,
      "grad_norm": 33.29733657836914,
      "kl": 0.0,
      "learning_rate": 1.2666841141062547e-07,
      "logps/chosen": -185.76760864257812,
      "logps/rejected": -336.5068359375,
      "loss": 0.2649,
      "rewards/chosen": 0.17454427480697632,
      "rewards/margins": 3.135378837585449,
      "rewards/rejected": -2.960834503173828,
      "step": 2853
    },
    {
      "epoch": 0.75,
      "grad_norm": 28.257957458496094,
      "kl": 0.0,
      "learning_rate": 1.265375556137137e-07,
      "logps/chosen": -188.02969360351562,
      "logps/rejected": -208.114501953125,
      "loss": 0.2783,
      "rewards/chosen": 2.182997465133667,
      "rewards/margins": 5.579323768615723,
      "rewards/rejected": -3.3963260650634766,
      "step": 2854
    },
    {
      "epoch": 0.75,
      "grad_norm": 40.46211624145508,
      "kl": 0.0,
      "learning_rate": 1.2640669981680189e-07,
      "logps/chosen": -247.18666076660156,
      "logps/rejected": -271.8265380859375,
      "loss": 0.1999,
      "rewards/chosen": 0.41464537382125854,
      "rewards/margins": 5.825222969055176,
      "rewards/rejected": -5.410577774047852,
      "step": 2855
    },
    {
      "epoch": 0.75,
      "grad_norm": 38.835689544677734,
      "kl": 0.0,
      "learning_rate": 1.2627584401989008e-07,
      "logps/chosen": -226.5570526123047,
      "logps/rejected": -229.08575439453125,
      "loss": 0.2518,
      "rewards/chosen": 2.2463645935058594,
      "rewards/margins": 6.170310020446777,
      "rewards/rejected": -3.923945188522339,
      "step": 2856
    },
    {
      "epoch": 0.75,
      "grad_norm": 37.920345306396484,
      "kl": 0.0,
      "learning_rate": 1.2614498822297828e-07,
      "logps/chosen": -209.99766540527344,
      "logps/rejected": -227.28494262695312,
      "loss": 0.2198,
      "rewards/chosen": 1.4239280223846436,
      "rewards/margins": 5.315568923950195,
      "rewards/rejected": -3.8916409015655518,
      "step": 2857
    },
    {
      "epoch": 0.75,
      "grad_norm": 35.52437973022461,
      "kl": 0.0,
      "learning_rate": 1.2601413242606647e-07,
      "logps/chosen": -215.59051513671875,
      "logps/rejected": -300.15167236328125,
      "loss": 0.1788,
      "rewards/chosen": 2.225846767425537,
      "rewards/margins": 6.136997699737549,
      "rewards/rejected": -3.9111509323120117,
      "step": 2858
    },
    {
      "epoch": 0.75,
      "grad_norm": 37.68877410888672,
      "kl": 0.0,
      "learning_rate": 1.2588327662915464e-07,
      "logps/chosen": -205.9502410888672,
      "logps/rejected": -235.28225708007812,
      "loss": 0.2043,
      "rewards/chosen": 1.3810704946517944,
      "rewards/margins": 5.162478446960449,
      "rewards/rejected": -3.7814078330993652,
      "step": 2859
    },
    {
      "epoch": 0.75,
      "grad_norm": 29.949377059936523,
      "kl": 0.0,
      "learning_rate": 1.2575242083224287e-07,
      "logps/chosen": -240.81326293945312,
      "logps/rejected": -176.24057006835938,
      "loss": 0.2927,
      "rewards/chosen": 0.8815681338310242,
      "rewards/margins": 4.728879928588867,
      "rewards/rejected": -3.8473119735717773,
      "step": 2860
    },
    {
      "epoch": 0.75,
      "grad_norm": 36.66761016845703,
      "kl": 0.0,
      "learning_rate": 1.2562156503533106e-07,
      "logps/chosen": -158.10072326660156,
      "logps/rejected": -300.7066345214844,
      "loss": 0.249,
      "rewards/chosen": 0.6434949636459351,
      "rewards/margins": 5.606198787689209,
      "rewards/rejected": -4.962703704833984,
      "step": 2861
    },
    {
      "epoch": 0.75,
      "grad_norm": 42.22517013549805,
      "kl": 0.0,
      "learning_rate": 1.2549070923841926e-07,
      "logps/chosen": -243.78887939453125,
      "logps/rejected": -282.7771911621094,
      "loss": 0.2351,
      "rewards/chosen": 2.329538106918335,
      "rewards/margins": 5.888676643371582,
      "rewards/rejected": -3.559138774871826,
      "step": 2862
    },
    {
      "epoch": 0.75,
      "grad_norm": 28.08859634399414,
      "kl": 0.0,
      "learning_rate": 1.2535985344150746e-07,
      "logps/chosen": -180.0941925048828,
      "logps/rejected": -300.3199157714844,
      "loss": 0.2466,
      "rewards/chosen": 0.9009974002838135,
      "rewards/margins": 4.991559982299805,
      "rewards/rejected": -4.09056282043457,
      "step": 2863
    },
    {
      "epoch": 0.75,
      "grad_norm": 36.271453857421875,
      "kl": 0.0,
      "learning_rate": 1.2522899764459565e-07,
      "logps/chosen": -142.3069305419922,
      "logps/rejected": -477.3985900878906,
      "loss": 0.2336,
      "rewards/chosen": 0.7708263397216797,
      "rewards/margins": 7.935967922210693,
      "rewards/rejected": -7.165141582489014,
      "step": 2864
    },
    {
      "epoch": 0.75,
      "grad_norm": 36.600616455078125,
      "kl": 0.0,
      "learning_rate": 1.2509814184768385e-07,
      "logps/chosen": -297.0329895019531,
      "logps/rejected": -275.8288879394531,
      "loss": 0.2147,
      "rewards/chosen": 0.1721043586730957,
      "rewards/margins": 5.66850471496582,
      "rewards/rejected": -5.496400356292725,
      "step": 2865
    },
    {
      "epoch": 0.75,
      "grad_norm": 32.472042083740234,
      "kl": 0.0,
      "learning_rate": 1.2496728605077204e-07,
      "logps/chosen": -197.33554077148438,
      "logps/rejected": -202.48606872558594,
      "loss": 0.3602,
      "rewards/chosen": 0.5126596689224243,
      "rewards/margins": 4.411322116851807,
      "rewards/rejected": -3.8986623287200928,
      "step": 2866
    },
    {
      "epoch": 0.75,
      "grad_norm": 31.626291275024414,
      "kl": 0.0,
      "learning_rate": 1.2483643025386024e-07,
      "logps/chosen": -220.61660766601562,
      "logps/rejected": -281.240966796875,
      "loss": 0.237,
      "rewards/chosen": 1.3547782897949219,
      "rewards/margins": 3.2674479484558105,
      "rewards/rejected": -1.9126697778701782,
      "step": 2867
    },
    {
      "epoch": 0.75,
      "grad_norm": 30.53521728515625,
      "kl": 0.0,
      "learning_rate": 1.2470557445694844e-07,
      "logps/chosen": -180.86444091796875,
      "logps/rejected": -281.6353454589844,
      "loss": 0.2105,
      "rewards/chosen": 1.173238754272461,
      "rewards/margins": 4.285660743713379,
      "rewards/rejected": -3.112421751022339,
      "step": 2868
    },
    {
      "epoch": 0.75,
      "grad_norm": 38.975013732910156,
      "kl": 0.0,
      "learning_rate": 1.2457471866003663e-07,
      "logps/chosen": -221.84617614746094,
      "logps/rejected": -238.2664794921875,
      "loss": 0.2226,
      "rewards/chosen": 1.4654902219772339,
      "rewards/margins": 5.618080139160156,
      "rewards/rejected": -4.152589797973633,
      "step": 2869
    },
    {
      "epoch": 0.75,
      "grad_norm": 31.062091827392578,
      "kl": 0.0,
      "learning_rate": 1.2444386286312483e-07,
      "logps/chosen": -240.03097534179688,
      "logps/rejected": -269.620361328125,
      "loss": 0.2435,
      "rewards/chosen": 1.7040454149246216,
      "rewards/margins": 4.696616172790527,
      "rewards/rejected": -2.9925708770751953,
      "step": 2870
    },
    {
      "epoch": 0.75,
      "grad_norm": 29.831459045410156,
      "kl": 0.0,
      "learning_rate": 1.2431300706621302e-07,
      "logps/chosen": -158.68234252929688,
      "logps/rejected": -241.193115234375,
      "loss": 0.2662,
      "rewards/chosen": 0.047480225563049316,
      "rewards/margins": 3.3005714416503906,
      "rewards/rejected": -3.253091335296631,
      "step": 2871
    },
    {
      "epoch": 0.75,
      "grad_norm": 26.44230079650879,
      "kl": 0.0,
      "learning_rate": 1.2418215126930122e-07,
      "logps/chosen": -345.5702819824219,
      "logps/rejected": -201.339111328125,
      "loss": 0.1851,
      "rewards/chosen": 1.2837327718734741,
      "rewards/margins": 5.676604747772217,
      "rewards/rejected": -4.392871856689453,
      "step": 2872
    },
    {
      "epoch": 0.75,
      "grad_norm": 33.63683319091797,
      "kl": 0.0,
      "learning_rate": 1.2405129547238942e-07,
      "logps/chosen": -177.061279296875,
      "logps/rejected": -248.8441925048828,
      "loss": 0.1536,
      "rewards/chosen": 4.102713108062744,
      "rewards/margins": 7.341584205627441,
      "rewards/rejected": -3.2388713359832764,
      "step": 2873
    },
    {
      "epoch": 0.75,
      "grad_norm": 31.185894012451172,
      "kl": 0.0,
      "learning_rate": 1.239204396754776e-07,
      "logps/chosen": -174.25146484375,
      "logps/rejected": -331.90045166015625,
      "loss": 0.1468,
      "rewards/chosen": 3.4060115814208984,
      "rewards/margins": 8.459066390991211,
      "rewards/rejected": -5.053055286407471,
      "step": 2874
    },
    {
      "epoch": 0.75,
      "grad_norm": 36.41456985473633,
      "kl": 0.0,
      "learning_rate": 1.237895838785658e-07,
      "logps/chosen": -257.15521240234375,
      "logps/rejected": -217.49343872070312,
      "loss": 0.2738,
      "rewards/chosen": 2.5418829917907715,
      "rewards/margins": 5.1777496337890625,
      "rewards/rejected": -2.635866403579712,
      "step": 2875
    },
    {
      "epoch": 0.75,
      "grad_norm": 27.071542739868164,
      "kl": 0.0,
      "learning_rate": 1.23658728081654e-07,
      "logps/chosen": -199.8568572998047,
      "logps/rejected": -237.8272247314453,
      "loss": 0.2854,
      "rewards/chosen": 0.3178449869155884,
      "rewards/margins": 3.073559284210205,
      "rewards/rejected": -2.7557144165039062,
      "step": 2876
    },
    {
      "epoch": 0.75,
      "grad_norm": 28.203113555908203,
      "kl": 0.0,
      "learning_rate": 1.235278722847422e-07,
      "logps/chosen": -184.3144989013672,
      "logps/rejected": -269.3959655761719,
      "loss": 0.2805,
      "rewards/chosen": 0.6483626961708069,
      "rewards/margins": 6.0424699783325195,
      "rewards/rejected": -5.394107341766357,
      "step": 2877
    },
    {
      "epoch": 0.75,
      "grad_norm": 35.722450256347656,
      "kl": 0.0,
      "learning_rate": 1.233970164878304e-07,
      "logps/chosen": -197.92510986328125,
      "logps/rejected": -298.2138671875,
      "loss": 0.2674,
      "rewards/chosen": 0.42415380477905273,
      "rewards/margins": 3.037757158279419,
      "rewards/rejected": -2.613603353500366,
      "step": 2878
    },
    {
      "epoch": 0.75,
      "grad_norm": 41.887447357177734,
      "kl": 0.0,
      "learning_rate": 1.2326616069091862e-07,
      "logps/chosen": -217.36782836914062,
      "logps/rejected": -303.54290771484375,
      "loss": 0.3861,
      "rewards/chosen": 0.8330056667327881,
      "rewards/margins": 3.9003896713256836,
      "rewards/rejected": -3.0673840045928955,
      "step": 2879
    },
    {
      "epoch": 0.75,
      "grad_norm": 25.675533294677734,
      "kl": 0.0,
      "learning_rate": 1.231353048940068e-07,
      "logps/chosen": -217.59751892089844,
      "logps/rejected": -293.5157470703125,
      "loss": 0.2278,
      "rewards/chosen": 0.7520068883895874,
      "rewards/margins": 6.41138219833374,
      "rewards/rejected": -5.659375190734863,
      "step": 2880
    },
    {
      "epoch": 0.75,
      "grad_norm": 33.77341842651367,
      "kl": 0.0,
      "learning_rate": 1.2300444909709498e-07,
      "logps/chosen": -190.04165649414062,
      "logps/rejected": -243.47940063476562,
      "loss": 0.4059,
      "rewards/chosen": -1.3758662939071655,
      "rewards/margins": 1.1594191789627075,
      "rewards/rejected": -2.535285472869873,
      "step": 2881
    },
    {
      "epoch": 0.75,
      "grad_norm": 37.45033645629883,
      "kl": 0.0,
      "learning_rate": 1.228735933001832e-07,
      "logps/chosen": -232.29136657714844,
      "logps/rejected": -252.78013610839844,
      "loss": 0.1841,
      "rewards/chosen": 1.7727702856063843,
      "rewards/margins": 3.7750887870788574,
      "rewards/rejected": -2.0023186206817627,
      "step": 2882
    },
    {
      "epoch": 0.75,
      "grad_norm": 36.58238983154297,
      "kl": 0.0,
      "learning_rate": 1.227427375032714e-07,
      "logps/chosen": -235.13485717773438,
      "logps/rejected": -155.39901733398438,
      "loss": 0.2151,
      "rewards/chosen": 1.5003623962402344,
      "rewards/margins": 4.468344688415527,
      "rewards/rejected": -2.967982053756714,
      "step": 2883
    },
    {
      "epoch": 0.75,
      "grad_norm": 34.22605895996094,
      "kl": 0.0,
      "learning_rate": 1.2261188170635957e-07,
      "logps/chosen": -243.13113403320312,
      "logps/rejected": -213.46397399902344,
      "loss": 0.1841,
      "rewards/chosen": 0.6716588139533997,
      "rewards/margins": 5.077381610870361,
      "rewards/rejected": -4.405722618103027,
      "step": 2884
    },
    {
      "epoch": 0.76,
      "grad_norm": 38.82898712158203,
      "kl": 0.0,
      "learning_rate": 1.224810259094478e-07,
      "logps/chosen": -212.1895294189453,
      "logps/rejected": -275.56085205078125,
      "loss": 0.2102,
      "rewards/chosen": 0.16195379197597504,
      "rewards/margins": 5.055579662322998,
      "rewards/rejected": -4.893625736236572,
      "step": 2885
    },
    {
      "epoch": 0.76,
      "grad_norm": 33.71118927001953,
      "kl": 0.0,
      "learning_rate": 1.22350170112536e-07,
      "logps/chosen": -222.42874145507812,
      "logps/rejected": -211.8098907470703,
      "loss": 0.2643,
      "rewards/chosen": 0.7175885438919067,
      "rewards/margins": 3.3388028144836426,
      "rewards/rejected": -2.6212143898010254,
      "step": 2886
    },
    {
      "epoch": 0.76,
      "grad_norm": 44.06155014038086,
      "kl": 0.0,
      "learning_rate": 1.222193143156242e-07,
      "logps/chosen": -192.49566650390625,
      "logps/rejected": -219.57589721679688,
      "loss": 0.3047,
      "rewards/chosen": 1.4475936889648438,
      "rewards/margins": 3.2824594974517822,
      "rewards/rejected": -1.8348658084869385,
      "step": 2887
    },
    {
      "epoch": 0.76,
      "grad_norm": 37.90978240966797,
      "kl": 0.0,
      "learning_rate": 1.2208845851871236e-07,
      "logps/chosen": -221.5460205078125,
      "logps/rejected": -296.1938171386719,
      "loss": 0.2267,
      "rewards/chosen": 1.5457605123519897,
      "rewards/margins": 4.726670742034912,
      "rewards/rejected": -3.180910348892212,
      "step": 2888
    },
    {
      "epoch": 0.76,
      "grad_norm": 38.740081787109375,
      "kl": 0.0,
      "learning_rate": 1.2195760272180058e-07,
      "logps/chosen": -215.4282989501953,
      "logps/rejected": -215.94064331054688,
      "loss": 0.2473,
      "rewards/chosen": 1.6079682111740112,
      "rewards/margins": 4.185258388519287,
      "rewards/rejected": -2.5772900581359863,
      "step": 2889
    },
    {
      "epoch": 0.76,
      "grad_norm": 28.836488723754883,
      "kl": 0.0,
      "learning_rate": 1.2182674692488878e-07,
      "logps/chosen": -234.37863159179688,
      "logps/rejected": -252.62863159179688,
      "loss": 0.2242,
      "rewards/chosen": 1.2684545516967773,
      "rewards/margins": 6.477774620056152,
      "rewards/rejected": -5.209320068359375,
      "step": 2890
    },
    {
      "epoch": 0.76,
      "grad_norm": 30.18337631225586,
      "kl": 0.0,
      "learning_rate": 1.2169589112797695e-07,
      "logps/chosen": -267.667724609375,
      "logps/rejected": -222.0542755126953,
      "loss": 0.2185,
      "rewards/chosen": 1.34689462184906,
      "rewards/margins": 5.4736127853393555,
      "rewards/rejected": -4.126718044281006,
      "step": 2891
    },
    {
      "epoch": 0.76,
      "grad_norm": 25.85747718811035,
      "kl": 0.0,
      "learning_rate": 1.2156503533106517e-07,
      "logps/chosen": -198.385498046875,
      "logps/rejected": -330.31707763671875,
      "loss": 0.1765,
      "rewards/chosen": 1.7326695919036865,
      "rewards/margins": 4.857114315032959,
      "rewards/rejected": -3.1244447231292725,
      "step": 2892
    },
    {
      "epoch": 0.76,
      "grad_norm": 51.8387451171875,
      "kl": 0.0,
      "learning_rate": 1.2143417953415336e-07,
      "logps/chosen": -192.14889526367188,
      "logps/rejected": -185.65231323242188,
      "loss": 0.2475,
      "rewards/chosen": 1.5236883163452148,
      "rewards/margins": 5.039030075073242,
      "rewards/rejected": -3.5153415203094482,
      "step": 2893
    },
    {
      "epoch": 0.76,
      "grad_norm": 32.10373306274414,
      "kl": 0.0,
      "learning_rate": 1.2130332373724156e-07,
      "logps/chosen": -174.0630340576172,
      "logps/rejected": -271.5389099121094,
      "loss": 0.1499,
      "rewards/chosen": 1.0889085531234741,
      "rewards/margins": 2.904127597808838,
      "rewards/rejected": -1.8152190446853638,
      "step": 2894
    },
    {
      "epoch": 0.76,
      "grad_norm": 28.912342071533203,
      "kl": 0.0,
      "learning_rate": 1.2117246794032976e-07,
      "logps/chosen": -206.34083557128906,
      "logps/rejected": -236.11744689941406,
      "loss": 0.1711,
      "rewards/chosen": 2.815239429473877,
      "rewards/margins": 6.5172834396362305,
      "rewards/rejected": -3.7020437717437744,
      "step": 2895
    },
    {
      "epoch": 0.76,
      "grad_norm": 35.956024169921875,
      "kl": 0.0,
      "learning_rate": 1.2104161214341795e-07,
      "logps/chosen": -265.2823181152344,
      "logps/rejected": -413.5455017089844,
      "loss": 0.1822,
      "rewards/chosen": 1.1553301811218262,
      "rewards/margins": 6.729024887084961,
      "rewards/rejected": -5.573694705963135,
      "step": 2896
    },
    {
      "epoch": 0.76,
      "grad_norm": 32.14839172363281,
      "kl": 0.0,
      "learning_rate": 1.2091075634650615e-07,
      "logps/chosen": -152.54571533203125,
      "logps/rejected": -204.89080810546875,
      "loss": 0.2508,
      "rewards/chosen": 1.2394222021102905,
      "rewards/margins": 3.0110769271850586,
      "rewards/rejected": -1.7716546058654785,
      "step": 2897
    },
    {
      "epoch": 0.76,
      "grad_norm": 21.592060089111328,
      "kl": 0.0,
      "learning_rate": 1.2077990054959434e-07,
      "logps/chosen": -286.0022277832031,
      "logps/rejected": -226.6644744873047,
      "loss": 0.3169,
      "rewards/chosen": -1.6726806163787842,
      "rewards/margins": 2.2264373302459717,
      "rewards/rejected": -3.899117946624756,
      "step": 2898
    },
    {
      "epoch": 0.76,
      "grad_norm": 30.0306339263916,
      "kl": 0.0,
      "learning_rate": 1.2064904475268254e-07,
      "logps/chosen": -206.56137084960938,
      "logps/rejected": -223.92355346679688,
      "loss": 0.2848,
      "rewards/chosen": 1.3688303232192993,
      "rewards/margins": 4.59629487991333,
      "rewards/rejected": -3.2274646759033203,
      "step": 2899
    },
    {
      "epoch": 0.76,
      "grad_norm": 26.28697395324707,
      "kl": 0.0,
      "learning_rate": 1.2051818895577074e-07,
      "logps/chosen": -227.15841674804688,
      "logps/rejected": -260.24835205078125,
      "loss": 0.308,
      "rewards/chosen": -0.8018272519111633,
      "rewards/margins": 3.403519868850708,
      "rewards/rejected": -4.205347061157227,
      "step": 2900
    },
    {
      "epoch": 0.76,
      "grad_norm": 32.87641143798828,
      "kl": 0.0,
      "learning_rate": 1.2038733315885893e-07,
      "logps/chosen": -242.5460205078125,
      "logps/rejected": -257.4617004394531,
      "loss": 0.2887,
      "rewards/chosen": -0.07051602005958557,
      "rewards/margins": 4.160338401794434,
      "rewards/rejected": -4.230854511260986,
      "step": 2901
    },
    {
      "epoch": 0.76,
      "grad_norm": 30.622848510742188,
      "kl": 0.0,
      "learning_rate": 1.2025647736194713e-07,
      "logps/chosen": -232.50624084472656,
      "logps/rejected": -188.72296142578125,
      "loss": 0.341,
      "rewards/chosen": 1.6381621360778809,
      "rewards/margins": 4.317827224731445,
      "rewards/rejected": -2.6796653270721436,
      "step": 2902
    },
    {
      "epoch": 0.76,
      "grad_norm": 35.67957305908203,
      "kl": 0.0,
      "learning_rate": 1.2012562156503533e-07,
      "logps/chosen": -291.6125793457031,
      "logps/rejected": -262.95806884765625,
      "loss": 0.2262,
      "rewards/chosen": 2.4084537029266357,
      "rewards/margins": 6.307066917419434,
      "rewards/rejected": -3.8986129760742188,
      "step": 2903
    },
    {
      "epoch": 0.76,
      "grad_norm": 28.24009132385254,
      "kl": 0.0,
      "learning_rate": 1.1999476576812352e-07,
      "logps/chosen": -223.4033203125,
      "logps/rejected": -267.7476806640625,
      "loss": 0.2482,
      "rewards/chosen": 0.842714786529541,
      "rewards/margins": 6.109996318817139,
      "rewards/rejected": -5.267281532287598,
      "step": 2904
    },
    {
      "epoch": 0.76,
      "grad_norm": 37.31360626220703,
      "kl": 0.0,
      "learning_rate": 1.1986390997121172e-07,
      "logps/chosen": -249.6298370361328,
      "logps/rejected": -187.60121154785156,
      "loss": 0.2571,
      "rewards/chosen": 0.9127798080444336,
      "rewards/margins": 4.006119251251221,
      "rewards/rejected": -3.093339443206787,
      "step": 2905
    },
    {
      "epoch": 0.76,
      "grad_norm": 25.03266143798828,
      "kl": 0.0,
      "learning_rate": 1.1973305417429991e-07,
      "logps/chosen": -209.19241333007812,
      "logps/rejected": -292.7968444824219,
      "loss": 0.1879,
      "rewards/chosen": 2.684803009033203,
      "rewards/margins": 6.696554660797119,
      "rewards/rejected": -4.011751651763916,
      "step": 2906
    },
    {
      "epoch": 0.76,
      "grad_norm": 41.646968841552734,
      "kl": 0.0,
      "learning_rate": 1.196021983773881e-07,
      "logps/chosen": -151.85186767578125,
      "logps/rejected": -266.0614013671875,
      "loss": 0.3084,
      "rewards/chosen": 0.06417274475097656,
      "rewards/margins": 4.529610633850098,
      "rewards/rejected": -4.465437889099121,
      "step": 2907
    },
    {
      "epoch": 0.76,
      "grad_norm": 31.28655242919922,
      "kl": 0.0,
      "learning_rate": 1.194713425804763e-07,
      "logps/chosen": -191.10202026367188,
      "logps/rejected": -179.45779418945312,
      "loss": 0.2224,
      "rewards/chosen": 0.6876000165939331,
      "rewards/margins": 3.735599994659424,
      "rewards/rejected": -3.0480000972747803,
      "step": 2908
    },
    {
      "epoch": 0.76,
      "grad_norm": 35.99723434448242,
      "kl": 0.0,
      "learning_rate": 1.193404867835645e-07,
      "logps/chosen": -178.0907745361328,
      "logps/rejected": -234.1808319091797,
      "loss": 0.2611,
      "rewards/chosen": -0.0443677119910717,
      "rewards/margins": 1.617915391921997,
      "rewards/rejected": -1.6622830629348755,
      "step": 2909
    },
    {
      "epoch": 0.76,
      "grad_norm": 38.04181671142578,
      "kl": 0.0,
      "learning_rate": 1.192096309866527e-07,
      "logps/chosen": -252.9052734375,
      "logps/rejected": -265.5356140136719,
      "loss": 0.3118,
      "rewards/chosen": -0.2618914544582367,
      "rewards/margins": 2.1319985389709473,
      "rewards/rejected": -2.393889904022217,
      "step": 2910
    },
    {
      "epoch": 0.76,
      "grad_norm": 33.879615783691406,
      "kl": 0.0,
      "learning_rate": 1.190787751897409e-07,
      "logps/chosen": -142.2236328125,
      "logps/rejected": -220.40982055664062,
      "loss": 0.2546,
      "rewards/chosen": 2.9108173847198486,
      "rewards/margins": 5.476387977600098,
      "rewards/rejected": -2.565570831298828,
      "step": 2911
    },
    {
      "epoch": 0.76,
      "grad_norm": 31.349456787109375,
      "kl": 0.0,
      "learning_rate": 1.1894791939282909e-07,
      "logps/chosen": -273.1512756347656,
      "logps/rejected": -231.77330017089844,
      "loss": 0.2502,
      "rewards/chosen": 1.1709834337234497,
      "rewards/margins": 4.816771507263184,
      "rewards/rejected": -3.6457881927490234,
      "step": 2912
    },
    {
      "epoch": 0.76,
      "grad_norm": 25.215503692626953,
      "kl": 0.0,
      "learning_rate": 1.188170635959173e-07,
      "logps/chosen": -206.77252197265625,
      "logps/rejected": -157.4561309814453,
      "loss": 0.2568,
      "rewards/chosen": 0.6348246335983276,
      "rewards/margins": 6.27030086517334,
      "rewards/rejected": -5.635476112365723,
      "step": 2913
    },
    {
      "epoch": 0.76,
      "grad_norm": 28.806495666503906,
      "kl": 0.0,
      "learning_rate": 1.186862077990055e-07,
      "logps/chosen": -170.64573669433594,
      "logps/rejected": -262.3679504394531,
      "loss": 0.2113,
      "rewards/chosen": 0.7701768279075623,
      "rewards/margins": 5.672473907470703,
      "rewards/rejected": -4.902297019958496,
      "step": 2914
    },
    {
      "epoch": 0.76,
      "grad_norm": 30.437780380249023,
      "kl": 0.0,
      "learning_rate": 1.1855535200209368e-07,
      "logps/chosen": -245.5428466796875,
      "logps/rejected": -245.26171875,
      "loss": 0.2124,
      "rewards/chosen": 0.24975238740444183,
      "rewards/margins": 3.772033929824829,
      "rewards/rejected": -3.5222816467285156,
      "step": 2915
    },
    {
      "epoch": 0.76,
      "grad_norm": 29.952688217163086,
      "kl": 0.0,
      "learning_rate": 1.1842449620518189e-07,
      "logps/chosen": -194.86053466796875,
      "logps/rejected": -286.28790283203125,
      "loss": 0.1902,
      "rewards/chosen": 2.382014036178589,
      "rewards/margins": 6.134878158569336,
      "rewards/rejected": -3.752864122390747,
      "step": 2916
    },
    {
      "epoch": 0.76,
      "grad_norm": 33.65608596801758,
      "kl": 0.0,
      "learning_rate": 1.1829364040827008e-07,
      "logps/chosen": -221.7244415283203,
      "logps/rejected": -244.54493713378906,
      "loss": 0.1968,
      "rewards/chosen": 1.4137190580368042,
      "rewards/margins": 5.684474468231201,
      "rewards/rejected": -4.270755290985107,
      "step": 2917
    },
    {
      "epoch": 0.76,
      "grad_norm": 25.30941390991211,
      "kl": 0.0,
      "learning_rate": 1.1816278461135827e-07,
      "logps/chosen": -181.33401489257812,
      "logps/rejected": -193.39193725585938,
      "loss": 0.2273,
      "rewards/chosen": 1.3112400770187378,
      "rewards/margins": 5.295674800872803,
      "rewards/rejected": -3.9844348430633545,
      "step": 2918
    },
    {
      "epoch": 0.76,
      "grad_norm": 34.57612991333008,
      "kl": 0.0,
      "learning_rate": 1.1803192881444648e-07,
      "logps/chosen": -223.0674285888672,
      "logps/rejected": -245.7630615234375,
      "loss": 0.235,
      "rewards/chosen": 0.19638967514038086,
      "rewards/margins": 3.93575382232666,
      "rewards/rejected": -3.7393641471862793,
      "step": 2919
    },
    {
      "epoch": 0.76,
      "grad_norm": 24.53961181640625,
      "kl": 0.0,
      "learning_rate": 1.1790107301753467e-07,
      "logps/chosen": -187.73519897460938,
      "logps/rejected": -231.53778076171875,
      "loss": 0.2707,
      "rewards/chosen": 2.8392274379730225,
      "rewards/margins": 5.255084037780762,
      "rewards/rejected": -2.41585636138916,
      "step": 2920
    },
    {
      "epoch": 0.76,
      "grad_norm": 27.83038330078125,
      "kl": 0.0,
      "learning_rate": 1.1777021722062287e-07,
      "logps/chosen": -269.1419677734375,
      "logps/rejected": -262.9184875488281,
      "loss": 0.1798,
      "rewards/chosen": 0.33988016843795776,
      "rewards/margins": 4.673036575317383,
      "rewards/rejected": -4.333156585693359,
      "step": 2921
    },
    {
      "epoch": 0.76,
      "grad_norm": 40.96464920043945,
      "kl": 0.0,
      "learning_rate": 1.1763936142371106e-07,
      "logps/chosen": -258.025146484375,
      "logps/rejected": -282.2995910644531,
      "loss": 0.2673,
      "rewards/chosen": 2.189265012741089,
      "rewards/margins": 5.157666206359863,
      "rewards/rejected": -2.9684009552001953,
      "step": 2922
    },
    {
      "epoch": 0.76,
      "grad_norm": 30.345224380493164,
      "kl": 0.0,
      "learning_rate": 1.1750850562679926e-07,
      "logps/chosen": -242.3159942626953,
      "logps/rejected": -242.1539306640625,
      "loss": 0.3145,
      "rewards/chosen": -1.015539288520813,
      "rewards/margins": 1.1760417222976685,
      "rewards/rejected": -2.1915810108184814,
      "step": 2923
    },
    {
      "epoch": 0.77,
      "grad_norm": 37.97049331665039,
      "kl": 0.0,
      "learning_rate": 1.1737764982988746e-07,
      "logps/chosen": -198.21041870117188,
      "logps/rejected": -361.84039306640625,
      "loss": 0.268,
      "rewards/chosen": 1.1192774772644043,
      "rewards/margins": 3.962264060974121,
      "rewards/rejected": -2.842986583709717,
      "step": 2924
    },
    {
      "epoch": 0.77,
      "grad_norm": 27.184776306152344,
      "kl": 0.0,
      "learning_rate": 1.1724679403297567e-07,
      "logps/chosen": -172.3856201171875,
      "logps/rejected": -172.68576049804688,
      "loss": 0.3505,
      "rewards/chosen": 0.21904900670051575,
      "rewards/margins": 2.9532828330993652,
      "rewards/rejected": -2.734233856201172,
      "step": 2925
    },
    {
      "epoch": 0.77,
      "grad_norm": 31.706565856933594,
      "kl": 0.0,
      "learning_rate": 1.1711593823606385e-07,
      "logps/chosen": -257.7572021484375,
      "logps/rejected": -253.30271911621094,
      "loss": 0.2443,
      "rewards/chosen": 2.2228496074676514,
      "rewards/margins": 6.374711990356445,
      "rewards/rejected": -4.151862144470215,
      "step": 2926
    },
    {
      "epoch": 0.77,
      "grad_norm": 29.20972442626953,
      "kl": 0.0,
      "learning_rate": 1.1698508243915204e-07,
      "logps/chosen": -159.6472625732422,
      "logps/rejected": -226.85902404785156,
      "loss": 0.1695,
      "rewards/chosen": 2.3836936950683594,
      "rewards/margins": 7.045234203338623,
      "rewards/rejected": -4.661540508270264,
      "step": 2927
    },
    {
      "epoch": 0.77,
      "grad_norm": 23.147682189941406,
      "kl": 0.0,
      "learning_rate": 1.1685422664224024e-07,
      "logps/chosen": -157.58294677734375,
      "logps/rejected": -241.945556640625,
      "loss": 0.2749,
      "rewards/chosen": 0.3442791998386383,
      "rewards/margins": 3.804286003112793,
      "rewards/rejected": -3.4600067138671875,
      "step": 2928
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.65460968017578,
      "kl": 0.0,
      "learning_rate": 1.1672337084532845e-07,
      "logps/chosen": -195.1674346923828,
      "logps/rejected": -227.59368896484375,
      "loss": 0.2232,
      "rewards/chosen": 1.0592485666275024,
      "rewards/margins": 3.595780372619629,
      "rewards/rejected": -2.536531925201416,
      "step": 2929
    },
    {
      "epoch": 0.77,
      "grad_norm": 35.047645568847656,
      "kl": 0.0,
      "learning_rate": 1.1659251504841663e-07,
      "logps/chosen": -181.13290405273438,
      "logps/rejected": -268.6132507324219,
      "loss": 0.3072,
      "rewards/chosen": 0.24703586101531982,
      "rewards/margins": 3.9326653480529785,
      "rewards/rejected": -3.6856296062469482,
      "step": 2930
    },
    {
      "epoch": 0.77,
      "grad_norm": 39.733726501464844,
      "kl": 0.0,
      "learning_rate": 1.1646165925150483e-07,
      "logps/chosen": -255.24049377441406,
      "logps/rejected": -268.782958984375,
      "loss": 0.2573,
      "rewards/chosen": 1.9264146089553833,
      "rewards/margins": 6.034917831420898,
      "rewards/rejected": -4.108503341674805,
      "step": 2931
    },
    {
      "epoch": 0.77,
      "grad_norm": 25.3881778717041,
      "kl": 0.0,
      "learning_rate": 1.1633080345459304e-07,
      "logps/chosen": -149.32199096679688,
      "logps/rejected": -205.93418884277344,
      "loss": 0.2107,
      "rewards/chosen": 1.3308162689208984,
      "rewards/margins": 5.0680389404296875,
      "rewards/rejected": -3.73722243309021,
      "step": 2932
    },
    {
      "epoch": 0.77,
      "grad_norm": 28.647340774536133,
      "kl": 0.0,
      "learning_rate": 1.1619994765768123e-07,
      "logps/chosen": -220.11572265625,
      "logps/rejected": -307.8936462402344,
      "loss": 0.2313,
      "rewards/chosen": 0.6090463399887085,
      "rewards/margins": 3.7676281929016113,
      "rewards/rejected": -3.1585819721221924,
      "step": 2933
    },
    {
      "epoch": 0.77,
      "grad_norm": 31.28106117248535,
      "kl": 0.0,
      "learning_rate": 1.1606909186076942e-07,
      "logps/chosen": -245.37083435058594,
      "logps/rejected": -180.98590087890625,
      "loss": 0.1955,
      "rewards/chosen": 0.789535403251648,
      "rewards/margins": 3.903141975402832,
      "rewards/rejected": -3.1136064529418945,
      "step": 2934
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.39136505126953,
      "kl": 0.0,
      "learning_rate": 1.1593823606385763e-07,
      "logps/chosen": -300.31298828125,
      "logps/rejected": -187.9770050048828,
      "loss": 0.269,
      "rewards/chosen": 2.049201488494873,
      "rewards/margins": 4.9029645919799805,
      "rewards/rejected": -2.8537633419036865,
      "step": 2935
    },
    {
      "epoch": 0.77,
      "grad_norm": 37.13324737548828,
      "kl": 0.0,
      "learning_rate": 1.1580738026694582e-07,
      "logps/chosen": -259.8029479980469,
      "logps/rejected": -213.43118286132812,
      "loss": 0.2899,
      "rewards/chosen": 1.5232011079788208,
      "rewards/margins": 4.917179584503174,
      "rewards/rejected": -3.3939785957336426,
      "step": 2936
    },
    {
      "epoch": 0.77,
      "grad_norm": 39.28556442260742,
      "kl": 0.0,
      "learning_rate": 1.1567652447003402e-07,
      "logps/chosen": -246.21910095214844,
      "logps/rejected": -221.6072235107422,
      "loss": 0.3189,
      "rewards/chosen": 1.0758863687515259,
      "rewards/margins": 5.4353718757629395,
      "rewards/rejected": -4.359485626220703,
      "step": 2937
    },
    {
      "epoch": 0.77,
      "grad_norm": 35.176692962646484,
      "kl": 0.0,
      "learning_rate": 1.1554566867312221e-07,
      "logps/chosen": -128.11904907226562,
      "logps/rejected": -278.5266418457031,
      "loss": 0.2562,
      "rewards/chosen": 0.3561898171901703,
      "rewards/margins": 2.309230089187622,
      "rewards/rejected": -1.953040361404419,
      "step": 2938
    },
    {
      "epoch": 0.77,
      "grad_norm": 35.85581970214844,
      "kl": 0.0,
      "learning_rate": 1.1541481287621041e-07,
      "logps/chosen": -228.60194396972656,
      "logps/rejected": -264.4442138671875,
      "loss": 0.3275,
      "rewards/chosen": 0.6116616129875183,
      "rewards/margins": 4.661001205444336,
      "rewards/rejected": -4.049339771270752,
      "step": 2939
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.08247375488281,
      "kl": 0.0,
      "learning_rate": 1.1528395707929861e-07,
      "logps/chosen": -179.61607360839844,
      "logps/rejected": -192.24085998535156,
      "loss": 0.3021,
      "rewards/chosen": 1.5291475057601929,
      "rewards/margins": 3.2262120246887207,
      "rewards/rejected": -1.6970643997192383,
      "step": 2940
    },
    {
      "epoch": 0.77,
      "grad_norm": 35.77885055541992,
      "kl": 0.0,
      "learning_rate": 1.1515310128238682e-07,
      "logps/chosen": -248.62643432617188,
      "logps/rejected": -270.5456848144531,
      "loss": 0.16,
      "rewards/chosen": 2.307846784591675,
      "rewards/margins": 4.3499555587768555,
      "rewards/rejected": -2.0421087741851807,
      "step": 2941
    },
    {
      "epoch": 0.77,
      "grad_norm": 33.389156341552734,
      "kl": 0.0,
      "learning_rate": 1.15022245485475e-07,
      "logps/chosen": -264.442138671875,
      "logps/rejected": -233.51083374023438,
      "loss": 0.3071,
      "rewards/chosen": -0.4067654609680176,
      "rewards/margins": 1.7154138088226318,
      "rewards/rejected": -2.1221792697906494,
      "step": 2942
    },
    {
      "epoch": 0.77,
      "grad_norm": 34.11969757080078,
      "kl": 0.0,
      "learning_rate": 1.148913896885632e-07,
      "logps/chosen": -278.32305908203125,
      "logps/rejected": -266.03045654296875,
      "loss": 0.3154,
      "rewards/chosen": -0.757024347782135,
      "rewards/margins": 2.179401397705078,
      "rewards/rejected": -2.9364256858825684,
      "step": 2943
    },
    {
      "epoch": 0.77,
      "grad_norm": 29.74340057373047,
      "kl": 0.0,
      "learning_rate": 1.147605338916514e-07,
      "logps/chosen": -124.81636047363281,
      "logps/rejected": -243.23130798339844,
      "loss": 0.2078,
      "rewards/chosen": 1.5924134254455566,
      "rewards/margins": 5.299116611480713,
      "rewards/rejected": -3.7067031860351562,
      "step": 2944
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.440956115722656,
      "kl": 0.0,
      "learning_rate": 1.146296780947396e-07,
      "logps/chosen": -161.70448303222656,
      "logps/rejected": -231.23211669921875,
      "loss": 0.2956,
      "rewards/chosen": 1.406182885169983,
      "rewards/margins": 4.8207926750183105,
      "rewards/rejected": -3.414609909057617,
      "step": 2945
    },
    {
      "epoch": 0.77,
      "grad_norm": 37.233802795410156,
      "kl": 0.0,
      "learning_rate": 1.1449882229782778e-07,
      "logps/chosen": -233.98989868164062,
      "logps/rejected": -275.0398254394531,
      "loss": 0.2631,
      "rewards/chosen": 0.17019644379615784,
      "rewards/margins": 3.401771306991577,
      "rewards/rejected": -3.231574773788452,
      "step": 2946
    },
    {
      "epoch": 0.77,
      "grad_norm": 68.66511535644531,
      "kl": 0.0,
      "learning_rate": 1.1436796650091598e-07,
      "logps/chosen": -267.5313720703125,
      "logps/rejected": -220.8348388671875,
      "loss": 0.3068,
      "rewards/chosen": -0.22980618476867676,
      "rewards/margins": 2.2407171726226807,
      "rewards/rejected": -2.4705233573913574,
      "step": 2947
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.196678161621094,
      "kl": 0.0,
      "learning_rate": 1.1423711070400419e-07,
      "logps/chosen": -141.34901428222656,
      "logps/rejected": -292.7916259765625,
      "loss": 0.1982,
      "rewards/chosen": 0.8846850991249084,
      "rewards/margins": 5.876195430755615,
      "rewards/rejected": -4.991510391235352,
      "step": 2948
    },
    {
      "epoch": 0.77,
      "grad_norm": 38.09465408325195,
      "kl": 0.0,
      "learning_rate": 1.1410625490709237e-07,
      "logps/chosen": -175.3059539794922,
      "logps/rejected": -315.62493896484375,
      "loss": 0.2861,
      "rewards/chosen": -0.07770039886236191,
      "rewards/margins": 3.6531319618225098,
      "rewards/rejected": -3.73083233833313,
      "step": 2949
    },
    {
      "epoch": 0.77,
      "grad_norm": 33.7330322265625,
      "kl": 0.0,
      "learning_rate": 1.1397539911018057e-07,
      "logps/chosen": -209.34979248046875,
      "logps/rejected": -225.85885620117188,
      "loss": 0.2615,
      "rewards/chosen": 2.062551498413086,
      "rewards/margins": 4.068078994750977,
      "rewards/rejected": -2.0055277347564697,
      "step": 2950
    },
    {
      "epoch": 0.77,
      "grad_norm": 37.776817321777344,
      "kl": 0.0,
      "learning_rate": 1.1384454331326878e-07,
      "logps/chosen": -213.33070373535156,
      "logps/rejected": -256.4163818359375,
      "loss": 0.314,
      "rewards/chosen": 1.311199426651001,
      "rewards/margins": 3.4920144081115723,
      "rewards/rejected": -2.1808149814605713,
      "step": 2951
    },
    {
      "epoch": 0.77,
      "grad_norm": 34.075008392333984,
      "kl": 0.0,
      "learning_rate": 1.1371368751635697e-07,
      "logps/chosen": -207.1404266357422,
      "logps/rejected": -249.67095947265625,
      "loss": 0.1776,
      "rewards/chosen": 1.8990845680236816,
      "rewards/margins": 5.696239471435547,
      "rewards/rejected": -3.7971549034118652,
      "step": 2952
    },
    {
      "epoch": 0.77,
      "grad_norm": 39.363426208496094,
      "kl": 0.0,
      "learning_rate": 1.1358283171944516e-07,
      "logps/chosen": -206.06753540039062,
      "logps/rejected": -221.67047119140625,
      "loss": 0.2966,
      "rewards/chosen": 0.7883104681968689,
      "rewards/margins": 4.238680839538574,
      "rewards/rejected": -3.4503703117370605,
      "step": 2953
    },
    {
      "epoch": 0.77,
      "grad_norm": 18.948083877563477,
      "kl": 0.0,
      "learning_rate": 1.1345197592253337e-07,
      "logps/chosen": -122.21463775634766,
      "logps/rejected": -260.5376892089844,
      "loss": 0.1369,
      "rewards/chosen": 1.1754182577133179,
      "rewards/margins": 5.67502498626709,
      "rewards/rejected": -4.499606609344482,
      "step": 2954
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.583106994628906,
      "kl": 0.0,
      "learning_rate": 1.1332112012562156e-07,
      "logps/chosen": -174.0437469482422,
      "logps/rejected": -196.47865295410156,
      "loss": 0.3576,
      "rewards/chosen": 0.3868805170059204,
      "rewards/margins": 3.155986785888672,
      "rewards/rejected": -2.769106388092041,
      "step": 2955
    },
    {
      "epoch": 0.77,
      "grad_norm": 34.119384765625,
      "kl": 0.0,
      "learning_rate": 1.1319026432870976e-07,
      "logps/chosen": -209.29136657714844,
      "logps/rejected": -220.05838012695312,
      "loss": 0.1649,
      "rewards/chosen": 1.6317064762115479,
      "rewards/margins": 5.285268783569336,
      "rewards/rejected": -3.653562307357788,
      "step": 2956
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.831085205078125,
      "kl": 0.0,
      "learning_rate": 1.1305940853179795e-07,
      "logps/chosen": -199.46022033691406,
      "logps/rejected": -262.05328369140625,
      "loss": 0.2363,
      "rewards/chosen": 0.814264714717865,
      "rewards/margins": 4.46965217590332,
      "rewards/rejected": -3.6553874015808105,
      "step": 2957
    },
    {
      "epoch": 0.77,
      "grad_norm": 33.77021789550781,
      "kl": 0.0,
      "learning_rate": 1.1292855273488615e-07,
      "logps/chosen": -242.86158752441406,
      "logps/rejected": -310.06060791015625,
      "loss": 0.2165,
      "rewards/chosen": 1.675708532333374,
      "rewards/margins": 5.887651443481445,
      "rewards/rejected": -4.211942672729492,
      "step": 2958
    },
    {
      "epoch": 0.77,
      "grad_norm": 24.862733840942383,
      "kl": 0.0,
      "learning_rate": 1.1279769693797435e-07,
      "logps/chosen": -171.75450134277344,
      "logps/rejected": -218.28988647460938,
      "loss": 0.2095,
      "rewards/chosen": 2.0904273986816406,
      "rewards/margins": 5.917420387268066,
      "rewards/rejected": -3.826992988586426,
      "step": 2959
    },
    {
      "epoch": 0.77,
      "grad_norm": 32.68523406982422,
      "kl": 0.0,
      "learning_rate": 1.1266684114106256e-07,
      "logps/chosen": -218.8000030517578,
      "logps/rejected": -239.5182342529297,
      "loss": 0.2813,
      "rewards/chosen": 1.9658054113388062,
      "rewards/margins": 3.183340549468994,
      "rewards/rejected": -1.2175352573394775,
      "step": 2960
    },
    {
      "epoch": 0.77,
      "grad_norm": 21.125932693481445,
      "kl": 0.0,
      "learning_rate": 1.1253598534415074e-07,
      "logps/chosen": -103.0634536743164,
      "logps/rejected": -305.65045166015625,
      "loss": 0.2661,
      "rewards/chosen": 0.5404638051986694,
      "rewards/margins": 4.619140148162842,
      "rewards/rejected": -4.078676223754883,
      "step": 2961
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.92940902709961,
      "kl": 0.0,
      "learning_rate": 1.1240512954723893e-07,
      "logps/chosen": -186.74815368652344,
      "logps/rejected": -226.94517517089844,
      "loss": 0.1668,
      "rewards/chosen": 1.232695460319519,
      "rewards/margins": 5.6889777183532715,
      "rewards/rejected": -4.456282138824463,
      "step": 2962
    },
    {
      "epoch": 0.78,
      "grad_norm": 30.524091720581055,
      "kl": 0.0,
      "learning_rate": 1.1227427375032713e-07,
      "logps/chosen": -180.26528930664062,
      "logps/rejected": -228.55606079101562,
      "loss": 0.3252,
      "rewards/chosen": 0.7677914500236511,
      "rewards/margins": 4.915521144866943,
      "rewards/rejected": -4.147729873657227,
      "step": 2963
    },
    {
      "epoch": 0.78,
      "grad_norm": 45.41800308227539,
      "kl": 0.0,
      "learning_rate": 1.1214341795341534e-07,
      "logps/chosen": -191.4619598388672,
      "logps/rejected": -252.8148193359375,
      "loss": 0.2961,
      "rewards/chosen": 1.1002200841903687,
      "rewards/margins": 4.552682876586914,
      "rewards/rejected": -3.452462673187256,
      "step": 2964
    },
    {
      "epoch": 0.78,
      "grad_norm": 26.6116886138916,
      "kl": 0.0,
      "learning_rate": 1.1201256215650352e-07,
      "logps/chosen": -240.11390686035156,
      "logps/rejected": -247.91156005859375,
      "loss": 0.3104,
      "rewards/chosen": 2.9228763580322266,
      "rewards/margins": 6.118831634521484,
      "rewards/rejected": -3.195955514907837,
      "step": 2965
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.6483268737793,
      "kl": 0.0,
      "learning_rate": 1.1188170635959172e-07,
      "logps/chosen": -191.32887268066406,
      "logps/rejected": -302.2726745605469,
      "loss": 0.1808,
      "rewards/chosen": 1.0362077951431274,
      "rewards/margins": 4.585421562194824,
      "rewards/rejected": -3.5492136478424072,
      "step": 2966
    },
    {
      "epoch": 0.78,
      "grad_norm": 29.699861526489258,
      "kl": 0.0,
      "learning_rate": 1.1175085056267993e-07,
      "logps/chosen": -234.7059326171875,
      "logps/rejected": -196.4623260498047,
      "loss": 0.1724,
      "rewards/chosen": 0.14948131144046783,
      "rewards/margins": 3.898578405380249,
      "rewards/rejected": -3.7490971088409424,
      "step": 2967
    },
    {
      "epoch": 0.78,
      "grad_norm": 26.997941970825195,
      "kl": 0.0,
      "learning_rate": 1.1161999476576812e-07,
      "logps/chosen": -222.604736328125,
      "logps/rejected": -244.908935546875,
      "loss": 0.1768,
      "rewards/chosen": 1.4389097690582275,
      "rewards/margins": 5.924962043762207,
      "rewards/rejected": -4.4860520362854,
      "step": 2968
    },
    {
      "epoch": 0.78,
      "grad_norm": 34.620880126953125,
      "kl": 0.0,
      "learning_rate": 1.1148913896885631e-07,
      "logps/chosen": -190.08731079101562,
      "logps/rejected": -178.37429809570312,
      "loss": 0.2559,
      "rewards/chosen": 1.443699598312378,
      "rewards/margins": 4.743268013000488,
      "rewards/rejected": -3.2995681762695312,
      "step": 2969
    },
    {
      "epoch": 0.78,
      "grad_norm": 36.19744110107422,
      "kl": 0.0,
      "learning_rate": 1.1135828317194452e-07,
      "logps/chosen": -151.0272674560547,
      "logps/rejected": -201.56234741210938,
      "loss": 0.2313,
      "rewards/chosen": 0.9803556203842163,
      "rewards/margins": 3.7992358207702637,
      "rewards/rejected": -2.818880081176758,
      "step": 2970
    },
    {
      "epoch": 0.78,
      "grad_norm": 33.23929214477539,
      "kl": 0.0,
      "learning_rate": 1.1122742737503271e-07,
      "logps/chosen": -140.99327087402344,
      "logps/rejected": -244.1494903564453,
      "loss": 0.2124,
      "rewards/chosen": 1.0255264043807983,
      "rewards/margins": 5.782019138336182,
      "rewards/rejected": -4.756492614746094,
      "step": 2971
    },
    {
      "epoch": 0.78,
      "grad_norm": 28.24236297607422,
      "kl": 0.0,
      "learning_rate": 1.1109657157812091e-07,
      "logps/chosen": -147.13204956054688,
      "logps/rejected": -311.97467041015625,
      "loss": 0.2424,
      "rewards/chosen": 1.267746925354004,
      "rewards/margins": 6.412136077880859,
      "rewards/rejected": -5.1443891525268555,
      "step": 2972
    },
    {
      "epoch": 0.78,
      "grad_norm": 26.959043502807617,
      "kl": 0.0,
      "learning_rate": 1.109657157812091e-07,
      "logps/chosen": -223.3488311767578,
      "logps/rejected": -358.9703369140625,
      "loss": 0.2572,
      "rewards/chosen": 1.4740769863128662,
      "rewards/margins": 6.306148529052734,
      "rewards/rejected": -4.832071304321289,
      "step": 2973
    },
    {
      "epoch": 0.78,
      "grad_norm": 27.128389358520508,
      "kl": 0.0,
      "learning_rate": 1.108348599842973e-07,
      "logps/chosen": -102.9526596069336,
      "logps/rejected": -211.68307495117188,
      "loss": 0.2965,
      "rewards/chosen": -0.20596718788146973,
      "rewards/margins": 3.6257193088531494,
      "rewards/rejected": -3.831686496734619,
      "step": 2974
    },
    {
      "epoch": 0.78,
      "grad_norm": 33.70097351074219,
      "kl": 0.0,
      "learning_rate": 1.107040041873855e-07,
      "logps/chosen": -219.6284942626953,
      "logps/rejected": -213.84817504882812,
      "loss": 0.1994,
      "rewards/chosen": 3.291337728500366,
      "rewards/margins": 6.678530693054199,
      "rewards/rejected": -3.387192726135254,
      "step": 2975
    },
    {
      "epoch": 0.78,
      "grad_norm": 36.46519088745117,
      "kl": 0.0,
      "learning_rate": 1.1057314839047368e-07,
      "logps/chosen": -214.6738739013672,
      "logps/rejected": -232.84051513671875,
      "loss": 0.2328,
      "rewards/chosen": -0.8670411109924316,
      "rewards/margins": 2.0714240074157715,
      "rewards/rejected": -2.938465118408203,
      "step": 2976
    },
    {
      "epoch": 0.78,
      "grad_norm": 37.8907585144043,
      "kl": 0.0,
      "learning_rate": 1.1044229259356189e-07,
      "logps/chosen": -226.720458984375,
      "logps/rejected": -200.7624969482422,
      "loss": 0.2166,
      "rewards/chosen": 0.6200340986251831,
      "rewards/margins": 4.319954872131348,
      "rewards/rejected": -3.699920654296875,
      "step": 2977
    },
    {
      "epoch": 0.78,
      "grad_norm": 40.9567756652832,
      "kl": 0.0,
      "learning_rate": 1.1031143679665009e-07,
      "logps/chosen": -232.58010864257812,
      "logps/rejected": -379.1448669433594,
      "loss": 0.2943,
      "rewards/chosen": -0.21439041197299957,
      "rewards/margins": 4.33332633972168,
      "rewards/rejected": -4.5477166175842285,
      "step": 2978
    },
    {
      "epoch": 0.78,
      "grad_norm": 40.09104919433594,
      "kl": 0.0,
      "learning_rate": 1.1018058099973828e-07,
      "logps/chosen": -167.21389770507812,
      "logps/rejected": -303.8032531738281,
      "loss": 0.307,
      "rewards/chosen": 1.0971133708953857,
      "rewards/margins": 3.571532964706421,
      "rewards/rejected": -2.474419593811035,
      "step": 2979
    },
    {
      "epoch": 0.78,
      "grad_norm": 25.115385055541992,
      "kl": 0.0,
      "learning_rate": 1.1004972520282648e-07,
      "logps/chosen": -311.6871032714844,
      "logps/rejected": -211.0572509765625,
      "loss": 0.1853,
      "rewards/chosen": 2.4156370162963867,
      "rewards/margins": 6.687005043029785,
      "rewards/rejected": -4.271368026733398,
      "step": 2980
    },
    {
      "epoch": 0.78,
      "grad_norm": 25.411376953125,
      "kl": 0.0,
      "learning_rate": 1.0991886940591467e-07,
      "logps/chosen": -204.73744201660156,
      "logps/rejected": -330.9629821777344,
      "loss": 0.2114,
      "rewards/chosen": 0.5547571778297424,
      "rewards/margins": 6.988034725189209,
      "rewards/rejected": -6.433277606964111,
      "step": 2981
    },
    {
      "epoch": 0.78,
      "grad_norm": 34.39225387573242,
      "kl": 0.0,
      "learning_rate": 1.0978801360900287e-07,
      "logps/chosen": -265.6847229003906,
      "logps/rejected": -253.92202758789062,
      "loss": 0.2749,
      "rewards/chosen": 2.182427167892456,
      "rewards/margins": 5.48234748840332,
      "rewards/rejected": -3.299920082092285,
      "step": 2982
    },
    {
      "epoch": 0.78,
      "grad_norm": 37.2737922668457,
      "kl": 0.0,
      "learning_rate": 1.0965715781209108e-07,
      "logps/chosen": -220.30810546875,
      "logps/rejected": -286.2281188964844,
      "loss": 0.3222,
      "rewards/chosen": 0.6856479644775391,
      "rewards/margins": 3.8363304138183594,
      "rewards/rejected": -3.1506824493408203,
      "step": 2983
    },
    {
      "epoch": 0.78,
      "grad_norm": 34.21880340576172,
      "kl": 0.0,
      "learning_rate": 1.0952630201517926e-07,
      "logps/chosen": -256.91973876953125,
      "logps/rejected": -266.81805419921875,
      "loss": 0.2503,
      "rewards/chosen": 2.428946018218994,
      "rewards/margins": 6.026227951049805,
      "rewards/rejected": -3.5972819328308105,
      "step": 2984
    },
    {
      "epoch": 0.78,
      "grad_norm": 37.171661376953125,
      "kl": 0.0,
      "learning_rate": 1.0939544621826746e-07,
      "logps/chosen": -223.8563232421875,
      "logps/rejected": -173.02964782714844,
      "loss": 0.3763,
      "rewards/chosen": -0.8342050313949585,
      "rewards/margins": 2.1872963905334473,
      "rewards/rejected": -3.0215015411376953,
      "step": 2985
    },
    {
      "epoch": 0.78,
      "grad_norm": 30.91156768798828,
      "kl": 0.0,
      "learning_rate": 1.0926459042135567e-07,
      "logps/chosen": -216.64736938476562,
      "logps/rejected": -358.8420715332031,
      "loss": 0.2425,
      "rewards/chosen": 1.1421846151351929,
      "rewards/margins": 3.779019355773926,
      "rewards/rejected": -2.6368348598480225,
      "step": 2986
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.80834197998047,
      "kl": 0.0,
      "learning_rate": 1.0913373462444386e-07,
      "logps/chosen": -257.1531677246094,
      "logps/rejected": -254.27359008789062,
      "loss": 0.2887,
      "rewards/chosen": 0.9386414885520935,
      "rewards/margins": 4.0823516845703125,
      "rewards/rejected": -3.143710136413574,
      "step": 2987
    },
    {
      "epoch": 0.78,
      "grad_norm": 26.716569900512695,
      "kl": 0.0,
      "learning_rate": 1.0900287882753205e-07,
      "logps/chosen": -203.5930938720703,
      "logps/rejected": -369.82818603515625,
      "loss": 0.2132,
      "rewards/chosen": 5.119094371795654,
      "rewards/margins": 8.760660171508789,
      "rewards/rejected": -3.6415653228759766,
      "step": 2988
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.3159294128418,
      "kl": 0.0,
      "learning_rate": 1.0887202303062026e-07,
      "logps/chosen": -204.94692993164062,
      "logps/rejected": -323.8594665527344,
      "loss": 0.2114,
      "rewards/chosen": 1.6632729768753052,
      "rewards/margins": 5.176769733428955,
      "rewards/rejected": -3.5134966373443604,
      "step": 2989
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.48981475830078,
      "kl": 0.0,
      "learning_rate": 1.0874116723370845e-07,
      "logps/chosen": -217.63912963867188,
      "logps/rejected": -200.53121948242188,
      "loss": 0.2789,
      "rewards/chosen": 0.8979160189628601,
      "rewards/margins": 4.462857723236084,
      "rewards/rejected": -3.564941644668579,
      "step": 2990
    },
    {
      "epoch": 0.78,
      "grad_norm": 24.33729362487793,
      "kl": 0.0,
      "learning_rate": 1.0861031143679665e-07,
      "logps/chosen": -182.7748565673828,
      "logps/rejected": -279.8739013671875,
      "loss": 0.1476,
      "rewards/chosen": 3.3167998790740967,
      "rewards/margins": 7.395058631896973,
      "rewards/rejected": -4.078258991241455,
      "step": 2991
    },
    {
      "epoch": 0.78,
      "grad_norm": 37.343414306640625,
      "kl": 0.0,
      "learning_rate": 1.0847945563988483e-07,
      "logps/chosen": -201.48919677734375,
      "logps/rejected": -130.3191375732422,
      "loss": 0.3314,
      "rewards/chosen": 0.16055479645729065,
      "rewards/margins": 2.0605573654174805,
      "rewards/rejected": -1.9000025987625122,
      "step": 2992
    },
    {
      "epoch": 0.78,
      "grad_norm": 64.43939971923828,
      "kl": 0.0,
      "learning_rate": 1.0834859984297304e-07,
      "logps/chosen": -174.33096313476562,
      "logps/rejected": -232.03768920898438,
      "loss": 0.3909,
      "rewards/chosen": 0.12005829811096191,
      "rewards/margins": 1.6579828262329102,
      "rewards/rejected": -1.5379245281219482,
      "step": 2993
    },
    {
      "epoch": 0.78,
      "grad_norm": 31.608854293823242,
      "kl": 0.0,
      "learning_rate": 1.0821774404606124e-07,
      "logps/chosen": -276.39581298828125,
      "logps/rejected": -246.9667510986328,
      "loss": 0.2036,
      "rewards/chosen": 2.626638412475586,
      "rewards/margins": 6.046180725097656,
      "rewards/rejected": -3.4195423126220703,
      "step": 2994
    },
    {
      "epoch": 0.78,
      "grad_norm": 33.15080642700195,
      "kl": 0.0,
      "learning_rate": 1.0808688824914943e-07,
      "logps/chosen": -141.77513122558594,
      "logps/rejected": -343.1261901855469,
      "loss": 0.1914,
      "rewards/chosen": 1.7029200792312622,
      "rewards/margins": 6.784095764160156,
      "rewards/rejected": -5.081175804138184,
      "step": 2995
    },
    {
      "epoch": 0.78,
      "grad_norm": 32.62825393676758,
      "kl": 0.0,
      "learning_rate": 1.0795603245223763e-07,
      "logps/chosen": -179.11143493652344,
      "logps/rejected": -298.8634948730469,
      "loss": 0.2007,
      "rewards/chosen": 1.274958610534668,
      "rewards/margins": 5.084338665008545,
      "rewards/rejected": -3.809380054473877,
      "step": 2996
    },
    {
      "epoch": 0.78,
      "grad_norm": 48.24810791015625,
      "kl": 0.0,
      "learning_rate": 1.0782517665532582e-07,
      "logps/chosen": -235.58192443847656,
      "logps/rejected": -326.9761657714844,
      "loss": 0.17,
      "rewards/chosen": 1.9319114685058594,
      "rewards/margins": 6.2909417152404785,
      "rewards/rejected": -4.359030246734619,
      "step": 2997
    },
    {
      "epoch": 0.78,
      "grad_norm": 26.545345306396484,
      "kl": 0.0,
      "learning_rate": 1.0769432085841402e-07,
      "logps/chosen": -223.30014038085938,
      "logps/rejected": -226.85134887695312,
      "loss": 0.2063,
      "rewards/chosen": 1.05927312374115,
      "rewards/margins": 6.737154960632324,
      "rewards/rejected": -5.677881717681885,
      "step": 2998
    },
    {
      "epoch": 0.78,
      "grad_norm": 33.72455978393555,
      "kl": 0.0,
      "learning_rate": 1.0756346506150223e-07,
      "logps/chosen": -100.30730438232422,
      "logps/rejected": -345.95928955078125,
      "loss": 0.2833,
      "rewards/chosen": 0.41382449865341187,
      "rewards/margins": 5.438047885894775,
      "rewards/rejected": -5.024223327636719,
      "step": 2999
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.084556579589844,
      "kl": 0.0,
      "learning_rate": 1.0743260926459041e-07,
      "logps/chosen": -166.98902893066406,
      "logps/rejected": -267.23040771484375,
      "loss": 0.2217,
      "rewards/chosen": 0.7845678925514221,
      "rewards/margins": 5.768924236297607,
      "rewards/rejected": -4.98435640335083,
      "step": 3000
    },
    {
      "epoch": 0.79,
      "grad_norm": 36.8436393737793,
      "kl": 0.0,
      "learning_rate": 1.0730175346767861e-07,
      "logps/chosen": -138.873046875,
      "logps/rejected": -246.13360595703125,
      "loss": 0.2673,
      "rewards/chosen": 0.9685492515563965,
      "rewards/margins": 3.8232574462890625,
      "rewards/rejected": -2.854708194732666,
      "step": 3001
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.18307113647461,
      "kl": 0.0,
      "learning_rate": 1.0717089767076682e-07,
      "logps/chosen": -263.03448486328125,
      "logps/rejected": -228.5595703125,
      "loss": 0.2002,
      "rewards/chosen": 1.27859365940094,
      "rewards/margins": 4.943618297576904,
      "rewards/rejected": -3.665024757385254,
      "step": 3002
    },
    {
      "epoch": 0.79,
      "grad_norm": 30.899982452392578,
      "kl": 0.0,
      "learning_rate": 1.0704004187385501e-07,
      "logps/chosen": -220.91151428222656,
      "logps/rejected": -276.31787109375,
      "loss": 0.3037,
      "rewards/chosen": 2.9530606269836426,
      "rewards/margins": 6.3132548332214355,
      "rewards/rejected": -3.360194206237793,
      "step": 3003
    },
    {
      "epoch": 0.79,
      "grad_norm": 32.476722717285156,
      "kl": 0.0,
      "learning_rate": 1.069091860769432e-07,
      "logps/chosen": -192.40228271484375,
      "logps/rejected": -217.82337951660156,
      "loss": 0.3322,
      "rewards/chosen": -0.3618360459804535,
      "rewards/margins": 2.2618818283081055,
      "rewards/rejected": -2.623717784881592,
      "step": 3004
    },
    {
      "epoch": 0.79,
      "grad_norm": 29.309284210205078,
      "kl": 0.0,
      "learning_rate": 1.067783302800314e-07,
      "logps/chosen": -203.76046752929688,
      "logps/rejected": -190.60350036621094,
      "loss": 0.3283,
      "rewards/chosen": 0.03439127281308174,
      "rewards/margins": 3.6848862171173096,
      "rewards/rejected": -3.6504950523376465,
      "step": 3005
    },
    {
      "epoch": 0.79,
      "grad_norm": 31.234689712524414,
      "kl": 0.0,
      "learning_rate": 1.066474744831196e-07,
      "logps/chosen": -197.9490203857422,
      "logps/rejected": -288.17608642578125,
      "loss": 0.1237,
      "rewards/chosen": 1.1934818029403687,
      "rewards/margins": 4.739086627960205,
      "rewards/rejected": -3.545604705810547,
      "step": 3006
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.367952346801758,
      "kl": 0.0,
      "learning_rate": 1.0651661868620779e-07,
      "logps/chosen": -253.59230041503906,
      "logps/rejected": -246.80653381347656,
      "loss": 0.3031,
      "rewards/chosen": -0.3414022922515869,
      "rewards/margins": 3.4762983322143555,
      "rewards/rejected": -3.8177006244659424,
      "step": 3007
    },
    {
      "epoch": 0.79,
      "grad_norm": 31.36299705505371,
      "kl": 0.0,
      "learning_rate": 1.06385762889296e-07,
      "logps/chosen": -146.8282928466797,
      "logps/rejected": -191.821044921875,
      "loss": 0.3346,
      "rewards/chosen": 0.6221804022789001,
      "rewards/margins": 3.2980802059173584,
      "rewards/rejected": -2.6758997440338135,
      "step": 3008
    },
    {
      "epoch": 0.79,
      "grad_norm": 39.00789260864258,
      "kl": 0.0,
      "learning_rate": 1.0625490709238419e-07,
      "logps/chosen": -156.52456665039062,
      "logps/rejected": -229.74844360351562,
      "loss": 0.329,
      "rewards/chosen": -0.1007121205329895,
      "rewards/margins": 2.1996805667877197,
      "rewards/rejected": -2.3003926277160645,
      "step": 3009
    },
    {
      "epoch": 0.79,
      "grad_norm": 29.693687438964844,
      "kl": 0.0,
      "learning_rate": 1.0612405129547239e-07,
      "logps/chosen": -211.4034423828125,
      "logps/rejected": -303.6903076171875,
      "loss": 0.3138,
      "rewards/chosen": 0.26724064350128174,
      "rewards/margins": 4.005509853363037,
      "rewards/rejected": -3.738269090652466,
      "step": 3010
    },
    {
      "epoch": 0.79,
      "grad_norm": 37.14421081542969,
      "kl": 0.0,
      "learning_rate": 1.0599319549856057e-07,
      "logps/chosen": -326.74212646484375,
      "logps/rejected": -236.1087646484375,
      "loss": 0.3227,
      "rewards/chosen": 0.06870659440755844,
      "rewards/margins": 3.472843647003174,
      "rewards/rejected": -3.404137134552002,
      "step": 3011
    },
    {
      "epoch": 0.79,
      "grad_norm": 37.60954284667969,
      "kl": 0.0,
      "learning_rate": 1.0586233970164878e-07,
      "logps/chosen": -205.22088623046875,
      "logps/rejected": -304.60211181640625,
      "loss": 0.2637,
      "rewards/chosen": 2.5140395164489746,
      "rewards/margins": 6.879363536834717,
      "rewards/rejected": -4.365324020385742,
      "step": 3012
    },
    {
      "epoch": 0.79,
      "grad_norm": 40.19622802734375,
      "kl": 0.0,
      "learning_rate": 1.0573148390473697e-07,
      "logps/chosen": -229.6803741455078,
      "logps/rejected": -237.748291015625,
      "loss": 0.3501,
      "rewards/chosen": 0.07104342430830002,
      "rewards/margins": 2.646733045578003,
      "rewards/rejected": -2.5756895542144775,
      "step": 3013
    },
    {
      "epoch": 0.79,
      "grad_norm": 33.84101486206055,
      "kl": 0.0,
      "learning_rate": 1.0560062810782517e-07,
      "logps/chosen": -198.72361755371094,
      "logps/rejected": -315.6944274902344,
      "loss": 0.2828,
      "rewards/chosen": 1.207344889640808,
      "rewards/margins": 6.238935470581055,
      "rewards/rejected": -5.031590461730957,
      "step": 3014
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.22218894958496,
      "kl": 0.0,
      "learning_rate": 1.0546977231091337e-07,
      "logps/chosen": -249.1780242919922,
      "logps/rejected": -194.42227172851562,
      "loss": 0.1329,
      "rewards/chosen": 2.790569543838501,
      "rewards/margins": 6.9953813552856445,
      "rewards/rejected": -4.2048115730285645,
      "step": 3015
    },
    {
      "epoch": 0.79,
      "grad_norm": 39.59652328491211,
      "kl": 0.0,
      "learning_rate": 1.0533891651400156e-07,
      "logps/chosen": -318.8900451660156,
      "logps/rejected": -175.8162078857422,
      "loss": 0.1602,
      "rewards/chosen": 1.4444025754928589,
      "rewards/margins": 5.619593620300293,
      "rewards/rejected": -4.1751909255981445,
      "step": 3016
    },
    {
      "epoch": 0.79,
      "grad_norm": 34.01449203491211,
      "kl": 0.0,
      "learning_rate": 1.0520806071708976e-07,
      "logps/chosen": -234.33010864257812,
      "logps/rejected": -186.9846954345703,
      "loss": 0.2612,
      "rewards/chosen": 0.5609415769577026,
      "rewards/margins": 2.225095510482788,
      "rewards/rejected": -1.6641539335250854,
      "step": 3017
    },
    {
      "epoch": 0.79,
      "grad_norm": 38.461822509765625,
      "kl": 0.0,
      "learning_rate": 1.0507720492017797e-07,
      "logps/chosen": -148.9252471923828,
      "logps/rejected": -353.3999328613281,
      "loss": 0.3001,
      "rewards/chosen": 1.1917644739151,
      "rewards/margins": 4.060915946960449,
      "rewards/rejected": -2.8691513538360596,
      "step": 3018
    },
    {
      "epoch": 0.79,
      "grad_norm": 34.158775329589844,
      "kl": 0.0,
      "learning_rate": 1.0494634912326615e-07,
      "logps/chosen": -278.2842102050781,
      "logps/rejected": -328.3468322753906,
      "loss": 0.3007,
      "rewards/chosen": -1.3085110187530518,
      "rewards/margins": 4.603449821472168,
      "rewards/rejected": -5.911961078643799,
      "step": 3019
    },
    {
      "epoch": 0.79,
      "grad_norm": 33.55435562133789,
      "kl": 0.0,
      "learning_rate": 1.0481549332635435e-07,
      "logps/chosen": -193.94454956054688,
      "logps/rejected": -240.3863983154297,
      "loss": 0.2211,
      "rewards/chosen": 0.9673369526863098,
      "rewards/margins": 4.179528713226318,
      "rewards/rejected": -3.2121918201446533,
      "step": 3020
    },
    {
      "epoch": 0.79,
      "grad_norm": 31.77188491821289,
      "kl": 0.0,
      "learning_rate": 1.0468463752944256e-07,
      "logps/chosen": -275.8807678222656,
      "logps/rejected": -209.6526336669922,
      "loss": 0.1991,
      "rewards/chosen": 2.488818645477295,
      "rewards/margins": 6.772157669067383,
      "rewards/rejected": -4.283339023590088,
      "step": 3021
    },
    {
      "epoch": 0.79,
      "grad_norm": 30.585309982299805,
      "kl": 0.0,
      "learning_rate": 1.0455378173253075e-07,
      "logps/chosen": -213.80178833007812,
      "logps/rejected": -218.0395050048828,
      "loss": 0.1788,
      "rewards/chosen": 1.2606884241104126,
      "rewards/margins": 5.7575297355651855,
      "rewards/rejected": -4.4968414306640625,
      "step": 3022
    },
    {
      "epoch": 0.79,
      "grad_norm": 23.492050170898438,
      "kl": 0.0,
      "learning_rate": 1.0442292593561894e-07,
      "logps/chosen": -174.97213745117188,
      "logps/rejected": -341.42987060546875,
      "loss": 0.181,
      "rewards/chosen": 0.9902437329292297,
      "rewards/margins": 6.303700923919678,
      "rewards/rejected": -5.313457012176514,
      "step": 3023
    },
    {
      "epoch": 0.79,
      "grad_norm": 33.66410827636719,
      "kl": 0.0,
      "learning_rate": 1.0429207013870715e-07,
      "logps/chosen": -193.3638153076172,
      "logps/rejected": -289.1599426269531,
      "loss": 0.3278,
      "rewards/chosen": -0.5050497651100159,
      "rewards/margins": 3.2536935806274414,
      "rewards/rejected": -3.7587432861328125,
      "step": 3024
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.773303985595703,
      "kl": 0.0,
      "learning_rate": 1.0416121434179534e-07,
      "logps/chosen": -164.4617156982422,
      "logps/rejected": -325.7907409667969,
      "loss": 0.1888,
      "rewards/chosen": 1.4648340940475464,
      "rewards/margins": 8.07848834991455,
      "rewards/rejected": -6.613654136657715,
      "step": 3025
    },
    {
      "epoch": 0.79,
      "grad_norm": 26.393327713012695,
      "kl": 0.0,
      "learning_rate": 1.0403035854488354e-07,
      "logps/chosen": -232.78707885742188,
      "logps/rejected": -268.3695983886719,
      "loss": 0.2201,
      "rewards/chosen": 0.41276469826698303,
      "rewards/margins": 4.607279300689697,
      "rewards/rejected": -4.194514751434326,
      "step": 3026
    },
    {
      "epoch": 0.79,
      "grad_norm": 29.967241287231445,
      "kl": 0.0,
      "learning_rate": 1.0389950274797172e-07,
      "logps/chosen": -176.5338134765625,
      "logps/rejected": -262.23046875,
      "loss": 0.2127,
      "rewards/chosen": 1.2708486318588257,
      "rewards/margins": 4.850653648376465,
      "rewards/rejected": -3.5798051357269287,
      "step": 3027
    },
    {
      "epoch": 0.79,
      "grad_norm": 33.31254196166992,
      "kl": 0.0,
      "learning_rate": 1.0376864695105993e-07,
      "logps/chosen": -221.40109252929688,
      "logps/rejected": -205.8263702392578,
      "loss": 0.229,
      "rewards/chosen": 1.289694905281067,
      "rewards/margins": 4.560299873352051,
      "rewards/rejected": -3.2706050872802734,
      "step": 3028
    },
    {
      "epoch": 0.79,
      "grad_norm": 32.369956970214844,
      "kl": 0.0,
      "learning_rate": 1.0363779115414813e-07,
      "logps/chosen": -283.633544921875,
      "logps/rejected": -215.62583923339844,
      "loss": 0.2673,
      "rewards/chosen": 1.1404141187667847,
      "rewards/margins": 4.866934299468994,
      "rewards/rejected": -3.726520299911499,
      "step": 3029
    },
    {
      "epoch": 0.79,
      "grad_norm": 28.29862403869629,
      "kl": 0.0,
      "learning_rate": 1.0350693535723632e-07,
      "logps/chosen": -282.5250549316406,
      "logps/rejected": -267.06903076171875,
      "loss": 0.2235,
      "rewards/chosen": 1.0757641792297363,
      "rewards/margins": 5.065523147583008,
      "rewards/rejected": -3.9897589683532715,
      "step": 3030
    },
    {
      "epoch": 0.79,
      "grad_norm": 29.409870147705078,
      "kl": 0.0,
      "learning_rate": 1.0337607956032452e-07,
      "logps/chosen": -221.21969604492188,
      "logps/rejected": -300.745849609375,
      "loss": 0.3934,
      "rewards/chosen": -0.5117583274841309,
      "rewards/margins": 2.25506591796875,
      "rewards/rejected": -2.766824245452881,
      "step": 3031
    },
    {
      "epoch": 0.79,
      "grad_norm": 30.991165161132812,
      "kl": 0.0,
      "learning_rate": 1.0324522376341271e-07,
      "logps/chosen": -215.63917541503906,
      "logps/rejected": -270.92657470703125,
      "loss": 0.2509,
      "rewards/chosen": 1.370503306388855,
      "rewards/margins": 4.434596061706543,
      "rewards/rejected": -3.0640928745269775,
      "step": 3032
    },
    {
      "epoch": 0.79,
      "grad_norm": 34.13918685913086,
      "kl": 0.0,
      "learning_rate": 1.0311436796650091e-07,
      "logps/chosen": -212.4232635498047,
      "logps/rejected": -166.4032440185547,
      "loss": 0.1785,
      "rewards/chosen": -0.247951939702034,
      "rewards/margins": 4.787047386169434,
      "rewards/rejected": -5.034999370574951,
      "step": 3033
    },
    {
      "epoch": 0.79,
      "grad_norm": 31.52064323425293,
      "kl": 0.0,
      "learning_rate": 1.029835121695891e-07,
      "logps/chosen": -270.621826171875,
      "logps/rejected": -242.9385223388672,
      "loss": 0.2086,
      "rewards/chosen": 0.8697998523712158,
      "rewards/margins": 4.359752655029297,
      "rewards/rejected": -3.489952564239502,
      "step": 3034
    },
    {
      "epoch": 0.79,
      "grad_norm": 41.24197006225586,
      "kl": 0.0,
      "learning_rate": 1.028526563726773e-07,
      "logps/chosen": -266.4942626953125,
      "logps/rejected": -229.0302734375,
      "loss": 0.1731,
      "rewards/chosen": 1.4553301334381104,
      "rewards/margins": 5.062695503234863,
      "rewards/rejected": -3.607365369796753,
      "step": 3035
    },
    {
      "epoch": 0.79,
      "grad_norm": 34.90055465698242,
      "kl": 0.0,
      "learning_rate": 1.027218005757655e-07,
      "logps/chosen": -273.8673400878906,
      "logps/rejected": -152.23277282714844,
      "loss": 0.2045,
      "rewards/chosen": 0.7700100541114807,
      "rewards/margins": 2.528456449508667,
      "rewards/rejected": -1.7584463357925415,
      "step": 3036
    },
    {
      "epoch": 0.79,
      "grad_norm": 32.30648422241211,
      "kl": 0.0,
      "learning_rate": 1.0259094477885371e-07,
      "logps/chosen": -193.05447387695312,
      "logps/rejected": -302.3185729980469,
      "loss": 0.1595,
      "rewards/chosen": 1.5815995931625366,
      "rewards/margins": 6.047704696655273,
      "rewards/rejected": -4.466104984283447,
      "step": 3037
    },
    {
      "epoch": 0.8,
      "grad_norm": 37.760581970214844,
      "kl": 0.0,
      "learning_rate": 1.0246008898194189e-07,
      "logps/chosen": -229.99197387695312,
      "logps/rejected": -275.3894958496094,
      "loss": 0.242,
      "rewards/chosen": 0.6548804044723511,
      "rewards/margins": 4.6089677810668945,
      "rewards/rejected": -3.954087257385254,
      "step": 3038
    },
    {
      "epoch": 0.8,
      "grad_norm": 79.86377716064453,
      "kl": 0.0,
      "learning_rate": 1.0232923318503009e-07,
      "logps/chosen": -220.7820587158203,
      "logps/rejected": -213.56016540527344,
      "loss": 0.2907,
      "rewards/chosen": -0.5391662120819092,
      "rewards/margins": 3.075031042098999,
      "rewards/rejected": -3.614197254180908,
      "step": 3039
    },
    {
      "epoch": 0.8,
      "grad_norm": 32.71912384033203,
      "kl": 0.0,
      "learning_rate": 1.021983773881183e-07,
      "logps/chosen": -200.76583862304688,
      "logps/rejected": -203.33946228027344,
      "loss": 0.2753,
      "rewards/chosen": 0.9402883648872375,
      "rewards/margins": 5.0086469650268555,
      "rewards/rejected": -4.068358421325684,
      "step": 3040
    },
    {
      "epoch": 0.8,
      "grad_norm": 40.06270217895508,
      "kl": 0.0,
      "learning_rate": 1.0206752159120649e-07,
      "logps/chosen": -197.82125854492188,
      "logps/rejected": -200.48231506347656,
      "loss": 0.1958,
      "rewards/chosen": 1.3599319458007812,
      "rewards/margins": 4.730039596557617,
      "rewards/rejected": -3.370107650756836,
      "step": 3041
    },
    {
      "epoch": 0.8,
      "grad_norm": 29.32779884338379,
      "kl": 0.0,
      "learning_rate": 1.0193666579429467e-07,
      "logps/chosen": -238.9820556640625,
      "logps/rejected": -223.41419982910156,
      "loss": 0.1964,
      "rewards/chosen": 2.0803043842315674,
      "rewards/margins": 6.614651679992676,
      "rewards/rejected": -4.5343475341796875,
      "step": 3042
    },
    {
      "epoch": 0.8,
      "grad_norm": 28.266084671020508,
      "kl": 0.0,
      "learning_rate": 1.0180580999738287e-07,
      "logps/chosen": -229.38027954101562,
      "logps/rejected": -307.1300048828125,
      "loss": 0.2579,
      "rewards/chosen": 2.4810798168182373,
      "rewards/margins": 6.782350540161133,
      "rewards/rejected": -4.301270484924316,
      "step": 3043
    },
    {
      "epoch": 0.8,
      "grad_norm": 32.78583908081055,
      "kl": 0.0,
      "learning_rate": 1.0167495420047108e-07,
      "logps/chosen": -164.74696350097656,
      "logps/rejected": -225.92079162597656,
      "loss": 0.2339,
      "rewards/chosen": 1.194899320602417,
      "rewards/margins": 5.182594299316406,
      "rewards/rejected": -3.98769474029541,
      "step": 3044
    },
    {
      "epoch": 0.8,
      "grad_norm": 28.55518913269043,
      "kl": 0.0,
      "learning_rate": 1.0154409840355928e-07,
      "logps/chosen": -168.16722106933594,
      "logps/rejected": -226.09352111816406,
      "loss": 0.2739,
      "rewards/chosen": 0.7534551620483398,
      "rewards/margins": 3.9352455139160156,
      "rewards/rejected": -3.181790351867676,
      "step": 3045
    },
    {
      "epoch": 0.8,
      "grad_norm": 38.69511032104492,
      "kl": 0.0,
      "learning_rate": 1.0141324260664746e-07,
      "logps/chosen": -159.96542358398438,
      "logps/rejected": -276.47259521484375,
      "loss": 0.2454,
      "rewards/chosen": 0.6250556111335754,
      "rewards/margins": 4.5782928466796875,
      "rewards/rejected": -3.9532370567321777,
      "step": 3046
    },
    {
      "epoch": 0.8,
      "grad_norm": 25.491989135742188,
      "kl": 0.0,
      "learning_rate": 1.0128238680973567e-07,
      "logps/chosen": -162.81649780273438,
      "logps/rejected": -222.93453979492188,
      "loss": 0.1371,
      "rewards/chosen": 2.8242900371551514,
      "rewards/margins": 6.870565414428711,
      "rewards/rejected": -4.046275615692139,
      "step": 3047
    },
    {
      "epoch": 0.8,
      "grad_norm": 72.17855834960938,
      "kl": 0.0,
      "learning_rate": 1.0115153101282386e-07,
      "logps/chosen": -173.59512329101562,
      "logps/rejected": -192.33859252929688,
      "loss": 0.2457,
      "rewards/chosen": 2.209728240966797,
      "rewards/margins": 4.834383010864258,
      "rewards/rejected": -2.624654769897461,
      "step": 3048
    },
    {
      "epoch": 0.8,
      "grad_norm": 30.696731567382812,
      "kl": 0.0,
      "learning_rate": 1.0102067521591206e-07,
      "logps/chosen": -226.53488159179688,
      "logps/rejected": -261.52020263671875,
      "loss": 0.1878,
      "rewards/chosen": 1.8455479145050049,
      "rewards/margins": 6.685124397277832,
      "rewards/rejected": -4.839576721191406,
      "step": 3049
    },
    {
      "epoch": 0.8,
      "grad_norm": 33.9281120300293,
      "kl": 0.0,
      "learning_rate": 1.0088981941900026e-07,
      "logps/chosen": -170.5023651123047,
      "logps/rejected": -269.6197509765625,
      "loss": 0.2826,
      "rewards/chosen": -0.06187135726213455,
      "rewards/margins": 3.0018527507781982,
      "rewards/rejected": -3.0637240409851074,
      "step": 3050
    },
    {
      "epoch": 0.8,
      "grad_norm": 30.158000946044922,
      "kl": 0.0,
      "learning_rate": 1.0075896362208845e-07,
      "logps/chosen": -136.28103637695312,
      "logps/rejected": -260.2313537597656,
      "loss": 0.1391,
      "rewards/chosen": 0.9478456377983093,
      "rewards/margins": 5.134873390197754,
      "rewards/rejected": -4.187027931213379,
      "step": 3051
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.88115692138672,
      "kl": 0.0,
      "learning_rate": 1.0062810782517665e-07,
      "logps/chosen": -159.8663330078125,
      "logps/rejected": -262.3153076171875,
      "loss": 0.3343,
      "rewards/chosen": 0.8258453607559204,
      "rewards/margins": 4.084915637969971,
      "rewards/rejected": -3.25907039642334,
      "step": 3052
    },
    {
      "epoch": 0.8,
      "grad_norm": 30.788469314575195,
      "kl": 0.0,
      "learning_rate": 1.0049725202826486e-07,
      "logps/chosen": -225.1278533935547,
      "logps/rejected": -225.47254943847656,
      "loss": 0.2801,
      "rewards/chosen": 1.7009352445602417,
      "rewards/margins": 4.015796661376953,
      "rewards/rejected": -2.314861297607422,
      "step": 3053
    },
    {
      "epoch": 0.8,
      "grad_norm": 37.72449493408203,
      "kl": 0.0,
      "learning_rate": 1.0036639623135304e-07,
      "logps/chosen": -222.17906188964844,
      "logps/rejected": -288.82110595703125,
      "loss": 0.1386,
      "rewards/chosen": 2.187119245529175,
      "rewards/margins": 6.962368965148926,
      "rewards/rejected": -4.77524995803833,
      "step": 3054
    },
    {
      "epoch": 0.8,
      "grad_norm": 37.279109954833984,
      "kl": 0.0,
      "learning_rate": 1.0023554043444124e-07,
      "logps/chosen": -167.89822387695312,
      "logps/rejected": -264.6224670410156,
      "loss": 0.2888,
      "rewards/chosen": 1.1163854598999023,
      "rewards/margins": 3.8963310718536377,
      "rewards/rejected": -2.7799456119537354,
      "step": 3055
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.867244720458984,
      "kl": 0.0,
      "learning_rate": 1.0010468463752945e-07,
      "logps/chosen": -253.61399841308594,
      "logps/rejected": -304.2154235839844,
      "loss": 0.2146,
      "rewards/chosen": 1.490746021270752,
      "rewards/margins": 6.7905964851379395,
      "rewards/rejected": -5.2998504638671875,
      "step": 3056
    },
    {
      "epoch": 0.8,
      "grad_norm": 38.14105987548828,
      "kl": 0.0,
      "learning_rate": 9.997382884061764e-08,
      "logps/chosen": -143.0299530029297,
      "logps/rejected": -239.80638122558594,
      "loss": 0.2257,
      "rewards/chosen": 1.0738029479980469,
      "rewards/margins": 3.2807700634002686,
      "rewards/rejected": -2.2069671154022217,
      "step": 3057
    },
    {
      "epoch": 0.8,
      "grad_norm": 31.968135833740234,
      "kl": 0.0,
      "learning_rate": 9.984297304370583e-08,
      "logps/chosen": -222.02590942382812,
      "logps/rejected": -259.3979187011719,
      "loss": 0.18,
      "rewards/chosen": 1.1077613830566406,
      "rewards/margins": 6.395566463470459,
      "rewards/rejected": -5.287805080413818,
      "step": 3058
    },
    {
      "epoch": 0.8,
      "grad_norm": 30.759674072265625,
      "kl": 0.0,
      "learning_rate": 9.971211724679402e-08,
      "logps/chosen": -201.220458984375,
      "logps/rejected": -269.8891296386719,
      "loss": 0.1788,
      "rewards/chosen": 2.2144436836242676,
      "rewards/margins": 7.6725993156433105,
      "rewards/rejected": -5.458155632019043,
      "step": 3059
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.30893325805664,
      "kl": 0.0,
      "learning_rate": 9.958126144988223e-08,
      "logps/chosen": -235.98963928222656,
      "logps/rejected": -171.18214416503906,
      "loss": 0.3627,
      "rewards/chosen": 0.8601173162460327,
      "rewards/margins": 3.5811686515808105,
      "rewards/rejected": -2.7210514545440674,
      "step": 3060
    },
    {
      "epoch": 0.8,
      "grad_norm": 36.614986419677734,
      "kl": 0.0,
      "learning_rate": 9.945040565297043e-08,
      "logps/chosen": -98.68263244628906,
      "logps/rejected": -207.23736572265625,
      "loss": 0.2558,
      "rewards/chosen": 0.7655553221702576,
      "rewards/margins": 3.5949623584747314,
      "rewards/rejected": -2.829406976699829,
      "step": 3061
    },
    {
      "epoch": 0.8,
      "grad_norm": 32.950923919677734,
      "kl": 0.0,
      "learning_rate": 9.931954985605861e-08,
      "logps/chosen": -197.07855224609375,
      "logps/rejected": -275.22430419921875,
      "loss": 0.2385,
      "rewards/chosen": 0.7969144582748413,
      "rewards/margins": 4.643563747406006,
      "rewards/rejected": -3.846649408340454,
      "step": 3062
    },
    {
      "epoch": 0.8,
      "grad_norm": 36.137454986572266,
      "kl": 0.0,
      "learning_rate": 9.918869405914682e-08,
      "logps/chosen": -171.88201904296875,
      "logps/rejected": -267.5704040527344,
      "loss": 0.314,
      "rewards/chosen": 2.4939815998077393,
      "rewards/margins": 4.7634735107421875,
      "rewards/rejected": -2.2694921493530273,
      "step": 3063
    },
    {
      "epoch": 0.8,
      "grad_norm": 31.99434471130371,
      "kl": 0.0,
      "learning_rate": 9.905783826223502e-08,
      "logps/chosen": -248.55218505859375,
      "logps/rejected": -202.24285888671875,
      "loss": 0.3361,
      "rewards/chosen": -0.03006848692893982,
      "rewards/margins": 2.202803134918213,
      "rewards/rejected": -2.2328715324401855,
      "step": 3064
    },
    {
      "epoch": 0.8,
      "grad_norm": 43.168006896972656,
      "kl": 0.0,
      "learning_rate": 9.89269824653232e-08,
      "logps/chosen": -233.45132446289062,
      "logps/rejected": -284.4450988769531,
      "loss": 0.1988,
      "rewards/chosen": 0.9019581079483032,
      "rewards/margins": 4.239256858825684,
      "rewards/rejected": -3.33729887008667,
      "step": 3065
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.45249938964844,
      "kl": 0.0,
      "learning_rate": 9.879612666841141e-08,
      "logps/chosen": -278.5670471191406,
      "logps/rejected": -264.644775390625,
      "loss": 0.2263,
      "rewards/chosen": 2.1087911128997803,
      "rewards/margins": 5.218087196350098,
      "rewards/rejected": -3.1092960834503174,
      "step": 3066
    },
    {
      "epoch": 0.8,
      "grad_norm": 24.53069305419922,
      "kl": 0.0,
      "learning_rate": 9.86652708714996e-08,
      "logps/chosen": -190.2322540283203,
      "logps/rejected": -289.7329406738281,
      "loss": 0.1922,
      "rewards/chosen": 1.9357092380523682,
      "rewards/margins": 6.088913917541504,
      "rewards/rejected": -4.153204441070557,
      "step": 3067
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.11714553833008,
      "kl": 0.0,
      "learning_rate": 9.85344150745878e-08,
      "logps/chosen": -245.92459106445312,
      "logps/rejected": -246.92396545410156,
      "loss": 0.183,
      "rewards/chosen": 0.07540301233530045,
      "rewards/margins": 3.0833206176757812,
      "rewards/rejected": -3.007917642593384,
      "step": 3068
    },
    {
      "epoch": 0.8,
      "grad_norm": 27.566394805908203,
      "kl": 0.0,
      "learning_rate": 9.8403559277676e-08,
      "logps/chosen": -217.20533752441406,
      "logps/rejected": -254.9765167236328,
      "loss": 0.2821,
      "rewards/chosen": 0.8280597925186157,
      "rewards/margins": 5.826291561126709,
      "rewards/rejected": -4.998231887817383,
      "step": 3069
    },
    {
      "epoch": 0.8,
      "grad_norm": 35.68161392211914,
      "kl": 0.0,
      "learning_rate": 9.827270348076419e-08,
      "logps/chosen": -216.37933349609375,
      "logps/rejected": -239.38841247558594,
      "loss": 0.2848,
      "rewards/chosen": 1.006296157836914,
      "rewards/margins": 3.833810329437256,
      "rewards/rejected": -2.827514171600342,
      "step": 3070
    },
    {
      "epoch": 0.8,
      "grad_norm": 31.20342445373535,
      "kl": 0.0,
      "learning_rate": 9.814184768385239e-08,
      "logps/chosen": -115.98530578613281,
      "logps/rejected": -222.46414184570312,
      "loss": 0.2491,
      "rewards/chosen": 1.8454002141952515,
      "rewards/margins": 4.2929582595825195,
      "rewards/rejected": -2.4475579261779785,
      "step": 3071
    },
    {
      "epoch": 0.8,
      "grad_norm": 33.1669807434082,
      "kl": 0.0,
      "learning_rate": 9.80109918869406e-08,
      "logps/chosen": -305.29730224609375,
      "logps/rejected": -164.39962768554688,
      "loss": 0.3077,
      "rewards/chosen": 1.2443612813949585,
      "rewards/margins": 3.3249335289001465,
      "rewards/rejected": -2.0805721282958984,
      "step": 3072
    },
    {
      "epoch": 0.8,
      "grad_norm": 26.35659408569336,
      "kl": 0.0,
      "learning_rate": 9.788013609002878e-08,
      "logps/chosen": -216.8447265625,
      "logps/rejected": -254.26968383789062,
      "loss": 0.2024,
      "rewards/chosen": 2.339702606201172,
      "rewards/margins": 7.2026872634887695,
      "rewards/rejected": -4.862984657287598,
      "step": 3073
    },
    {
      "epoch": 0.8,
      "grad_norm": 34.57036590576172,
      "kl": 0.0,
      "learning_rate": 9.774928029311698e-08,
      "logps/chosen": -225.0780792236328,
      "logps/rejected": -183.37353515625,
      "loss": 0.1963,
      "rewards/chosen": 1.5948823690414429,
      "rewards/margins": 3.4738454818725586,
      "rewards/rejected": -1.8789631128311157,
      "step": 3074
    },
    {
      "epoch": 0.8,
      "grad_norm": 55.75965881347656,
      "kl": 0.0,
      "learning_rate": 9.761842449620519e-08,
      "logps/chosen": -236.33316040039062,
      "logps/rejected": -229.29071044921875,
      "loss": 0.3662,
      "rewards/chosen": -0.06309998035430908,
      "rewards/margins": 4.684689521789551,
      "rewards/rejected": -4.74778938293457,
      "step": 3075
    },
    {
      "epoch": 0.81,
      "grad_norm": 29.502851486206055,
      "kl": 0.0,
      "learning_rate": 9.748756869929338e-08,
      "logps/chosen": -151.8082733154297,
      "logps/rejected": -215.3000946044922,
      "loss": 0.1216,
      "rewards/chosen": 1.3713454008102417,
      "rewards/margins": 5.170321464538574,
      "rewards/rejected": -3.798976182937622,
      "step": 3076
    },
    {
      "epoch": 0.81,
      "grad_norm": 28.440292358398438,
      "kl": 0.0,
      "learning_rate": 9.735671290238156e-08,
      "logps/chosen": -153.1015167236328,
      "logps/rejected": -242.10238647460938,
      "loss": 0.1821,
      "rewards/chosen": 1.123049259185791,
      "rewards/margins": 6.207904815673828,
      "rewards/rejected": -5.084855556488037,
      "step": 3077
    },
    {
      "epoch": 0.81,
      "grad_norm": 39.055419921875,
      "kl": 0.0,
      "learning_rate": 9.722585710546976e-08,
      "logps/chosen": -214.06842041015625,
      "logps/rejected": -252.7290802001953,
      "loss": 0.2904,
      "rewards/chosen": 0.05434975028038025,
      "rewards/margins": 3.1375062465667725,
      "rewards/rejected": -3.0831565856933594,
      "step": 3078
    },
    {
      "epoch": 0.81,
      "grad_norm": 25.828947067260742,
      "kl": 0.0,
      "learning_rate": 9.709500130855797e-08,
      "logps/chosen": -201.25909423828125,
      "logps/rejected": -267.3767395019531,
      "loss": 0.1955,
      "rewards/chosen": 3.0395989418029785,
      "rewards/margins": 6.232514381408691,
      "rewards/rejected": -3.192915439605713,
      "step": 3079
    },
    {
      "epoch": 0.81,
      "grad_norm": 30.39990997314453,
      "kl": 0.0,
      "learning_rate": 9.696414551164617e-08,
      "logps/chosen": -180.36355590820312,
      "logps/rejected": -194.8758544921875,
      "loss": 0.2182,
      "rewards/chosen": 0.9802899360656738,
      "rewards/margins": 4.716524124145508,
      "rewards/rejected": -3.736234188079834,
      "step": 3080
    },
    {
      "epoch": 0.81,
      "grad_norm": 38.28114700317383,
      "kl": 0.0,
      "learning_rate": 9.683328971473435e-08,
      "logps/chosen": -188.90740966796875,
      "logps/rejected": -207.32540893554688,
      "loss": 0.2517,
      "rewards/chosen": 1.9970073699951172,
      "rewards/margins": 4.750757694244385,
      "rewards/rejected": -2.7537503242492676,
      "step": 3081
    },
    {
      "epoch": 0.81,
      "grad_norm": 29.68705177307129,
      "kl": 0.0,
      "learning_rate": 9.670243391782256e-08,
      "logps/chosen": -161.58554077148438,
      "logps/rejected": -221.57876586914062,
      "loss": 0.2469,
      "rewards/chosen": 0.7848741412162781,
      "rewards/margins": 5.099874019622803,
      "rewards/rejected": -4.315000057220459,
      "step": 3082
    },
    {
      "epoch": 0.81,
      "grad_norm": 27.6368465423584,
      "kl": 0.0,
      "learning_rate": 9.657157812091075e-08,
      "logps/chosen": -253.8367462158203,
      "logps/rejected": -281.7640380859375,
      "loss": 0.1273,
      "rewards/chosen": 2.3699400424957275,
      "rewards/margins": 6.810979843139648,
      "rewards/rejected": -4.4410400390625,
      "step": 3083
    },
    {
      "epoch": 0.81,
      "grad_norm": 21.8061466217041,
      "kl": 0.0,
      "learning_rate": 9.644072232399895e-08,
      "logps/chosen": -178.3246307373047,
      "logps/rejected": -343.13031005859375,
      "loss": 0.2539,
      "rewards/chosen": -0.23348580300807953,
      "rewards/margins": 4.196106433868408,
      "rewards/rejected": -4.429592132568359,
      "step": 3084
    },
    {
      "epoch": 0.81,
      "grad_norm": 33.39321517944336,
      "kl": 0.0,
      "learning_rate": 9.630986652708715e-08,
      "logps/chosen": -177.02105712890625,
      "logps/rejected": -295.70208740234375,
      "loss": 0.2628,
      "rewards/chosen": -0.0624721497297287,
      "rewards/margins": 5.96860933303833,
      "rewards/rejected": -6.031081676483154,
      "step": 3085
    },
    {
      "epoch": 0.81,
      "grad_norm": 39.35239028930664,
      "kl": 0.0,
      "learning_rate": 9.617901073017534e-08,
      "logps/chosen": -204.71322631835938,
      "logps/rejected": -252.45391845703125,
      "loss": 0.2637,
      "rewards/chosen": -0.5764102935791016,
      "rewards/margins": 2.669811725616455,
      "rewards/rejected": -3.2462220191955566,
      "step": 3086
    },
    {
      "epoch": 0.81,
      "grad_norm": 29.97749900817871,
      "kl": 0.0,
      "learning_rate": 9.604815493326354e-08,
      "logps/chosen": -156.9673614501953,
      "logps/rejected": -241.162109375,
      "loss": 0.2395,
      "rewards/chosen": 1.5875059366226196,
      "rewards/margins": 5.665582656860352,
      "rewards/rejected": -4.0780768394470215,
      "step": 3087
    },
    {
      "epoch": 0.81,
      "grad_norm": 34.644081115722656,
      "kl": 0.0,
      "learning_rate": 9.591729913635175e-08,
      "logps/chosen": -209.1474609375,
      "logps/rejected": -223.16012573242188,
      "loss": 0.2186,
      "rewards/chosen": 0.9284906983375549,
      "rewards/margins": 5.58828067779541,
      "rewards/rejected": -4.6597900390625,
      "step": 3088
    },
    {
      "epoch": 0.81,
      "grad_norm": 48.65402603149414,
      "kl": 0.0,
      "learning_rate": 9.578644333943993e-08,
      "logps/chosen": -222.0018768310547,
      "logps/rejected": -224.7173309326172,
      "loss": 0.3318,
      "rewards/chosen": 0.18279579281806946,
      "rewards/margins": 2.2575557231903076,
      "rewards/rejected": -2.0747599601745605,
      "step": 3089
    },
    {
      "epoch": 0.81,
      "grad_norm": 47.96308517456055,
      "kl": 0.0,
      "learning_rate": 9.565558754252813e-08,
      "logps/chosen": -331.3664245605469,
      "logps/rejected": -217.60836791992188,
      "loss": 0.2445,
      "rewards/chosen": 0.6762347221374512,
      "rewards/margins": 5.298969745635986,
      "rewards/rejected": -4.622735023498535,
      "step": 3090
    },
    {
      "epoch": 0.81,
      "grad_norm": 31.31795310974121,
      "kl": 0.0,
      "learning_rate": 9.552473174561634e-08,
      "logps/chosen": -204.9393310546875,
      "logps/rejected": -170.1268310546875,
      "loss": 0.3128,
      "rewards/chosen": 1.282555103302002,
      "rewards/margins": 4.115183353424072,
      "rewards/rejected": -2.8326282501220703,
      "step": 3091
    },
    {
      "epoch": 0.81,
      "grad_norm": 34.949859619140625,
      "kl": 0.0,
      "learning_rate": 9.539387594870452e-08,
      "logps/chosen": -173.3063201904297,
      "logps/rejected": -293.59075927734375,
      "loss": 0.2331,
      "rewards/chosen": 1.1828242540359497,
      "rewards/margins": 6.227013111114502,
      "rewards/rejected": -5.044188976287842,
      "step": 3092
    },
    {
      "epoch": 0.81,
      "grad_norm": 42.05825424194336,
      "kl": 0.0,
      "learning_rate": 9.526302015179272e-08,
      "logps/chosen": -229.2870635986328,
      "logps/rejected": -262.56317138671875,
      "loss": 0.2754,
      "rewards/chosen": 2.785039186477661,
      "rewards/margins": 5.348743438720703,
      "rewards/rejected": -2.563704013824463,
      "step": 3093
    },
    {
      "epoch": 0.81,
      "grad_norm": 31.66534996032715,
      "kl": 0.0,
      "learning_rate": 9.513216435488091e-08,
      "logps/chosen": -170.36338806152344,
      "logps/rejected": -229.820556640625,
      "loss": 0.2134,
      "rewards/chosen": 0.22713103890419006,
      "rewards/margins": 3.0389466285705566,
      "rewards/rejected": -2.8118155002593994,
      "step": 3094
    },
    {
      "epoch": 0.81,
      "grad_norm": 40.741703033447266,
      "kl": 0.0,
      "learning_rate": 9.500130855796912e-08,
      "logps/chosen": -271.0154113769531,
      "logps/rejected": -269.4881591796875,
      "loss": 0.3646,
      "rewards/chosen": 0.5402659177780151,
      "rewards/margins": 4.598776340484619,
      "rewards/rejected": -4.0585103034973145,
      "step": 3095
    },
    {
      "epoch": 0.81,
      "grad_norm": 37.812774658203125,
      "kl": 0.0,
      "learning_rate": 9.48704527610573e-08,
      "logps/chosen": -165.02536010742188,
      "logps/rejected": -289.88604736328125,
      "loss": 0.2696,
      "rewards/chosen": -0.4038521647453308,
      "rewards/margins": 1.8678739070892334,
      "rewards/rejected": -2.271726131439209,
      "step": 3096
    },
    {
      "epoch": 0.81,
      "grad_norm": 35.640708923339844,
      "kl": 0.0,
      "learning_rate": 9.47395969641455e-08,
      "logps/chosen": -293.63604736328125,
      "logps/rejected": -247.36587524414062,
      "loss": 0.1799,
      "rewards/chosen": 3.4284310340881348,
      "rewards/margins": 6.821096420288086,
      "rewards/rejected": -3.392665386199951,
      "step": 3097
    },
    {
      "epoch": 0.81,
      "grad_norm": 27.39713478088379,
      "kl": 0.0,
      "learning_rate": 9.460874116723371e-08,
      "logps/chosen": -226.48678588867188,
      "logps/rejected": -239.13442993164062,
      "loss": 0.2349,
      "rewards/chosen": 2.1605045795440674,
      "rewards/margins": 7.0987749099731445,
      "rewards/rejected": -4.938270568847656,
      "step": 3098
    },
    {
      "epoch": 0.81,
      "grad_norm": 23.795522689819336,
      "kl": 0.0,
      "learning_rate": 9.44778853703219e-08,
      "logps/chosen": -133.2270965576172,
      "logps/rejected": -317.48931884765625,
      "loss": 0.1622,
      "rewards/chosen": 2.4678852558135986,
      "rewards/margins": 7.570594787597656,
      "rewards/rejected": -5.102709770202637,
      "step": 3099
    },
    {
      "epoch": 0.81,
      "grad_norm": 48.198516845703125,
      "kl": 0.0,
      "learning_rate": 9.434702957341009e-08,
      "logps/chosen": -268.0385437011719,
      "logps/rejected": -235.6272430419922,
      "loss": 0.2053,
      "rewards/chosen": -1.1146745681762695,
      "rewards/margins": 4.273561477661133,
      "rewards/rejected": -5.388236045837402,
      "step": 3100
    },
    {
      "epoch": 0.81,
      "grad_norm": 28.210222244262695,
      "kl": 0.0,
      "learning_rate": 9.42161737764983e-08,
      "logps/chosen": -179.96897888183594,
      "logps/rejected": -268.2703857421875,
      "loss": 0.174,
      "rewards/chosen": 1.6665213108062744,
      "rewards/margins": 7.456060409545898,
      "rewards/rejected": -5.789538860321045,
      "step": 3101
    },
    {
      "epoch": 0.81,
      "grad_norm": 43.28695297241211,
      "kl": 0.0,
      "learning_rate": 9.408531797958649e-08,
      "logps/chosen": -325.22027587890625,
      "logps/rejected": -166.30465698242188,
      "loss": 0.3118,
      "rewards/chosen": 0.15811419486999512,
      "rewards/margins": 3.1421120166778564,
      "rewards/rejected": -2.9839978218078613,
      "step": 3102
    },
    {
      "epoch": 0.81,
      "grad_norm": 36.104244232177734,
      "kl": 0.0,
      "learning_rate": 9.395446218267469e-08,
      "logps/chosen": -261.4429016113281,
      "logps/rejected": -263.02490234375,
      "loss": 0.2642,
      "rewards/chosen": 1.4801288843154907,
      "rewards/margins": 3.8422746658325195,
      "rewards/rejected": -2.3621459007263184,
      "step": 3103
    },
    {
      "epoch": 0.81,
      "grad_norm": 27.173694610595703,
      "kl": 0.0,
      "learning_rate": 9.382360638576289e-08,
      "logps/chosen": -144.93190002441406,
      "logps/rejected": -255.46478271484375,
      "loss": 0.26,
      "rewards/chosen": 1.242961049079895,
      "rewards/margins": 5.085175037384033,
      "rewards/rejected": -3.8422141075134277,
      "step": 3104
    },
    {
      "epoch": 0.81,
      "grad_norm": 46.38798904418945,
      "kl": 0.0,
      "learning_rate": 9.369275058885108e-08,
      "logps/chosen": -229.40740966796875,
      "logps/rejected": -259.7017822265625,
      "loss": 0.349,
      "rewards/chosen": -0.6156241297721863,
      "rewards/margins": 3.493295431137085,
      "rewards/rejected": -4.108919620513916,
      "step": 3105
    },
    {
      "epoch": 0.81,
      "grad_norm": 33.67306137084961,
      "kl": 0.0,
      "learning_rate": 9.356189479193928e-08,
      "logps/chosen": -202.85719299316406,
      "logps/rejected": -230.45826721191406,
      "loss": 0.2143,
      "rewards/chosen": 1.6097205877304077,
      "rewards/margins": 4.408946990966797,
      "rewards/rejected": -2.7992265224456787,
      "step": 3106
    },
    {
      "epoch": 0.81,
      "grad_norm": 28.656518936157227,
      "kl": 0.0,
      "learning_rate": 9.343103899502749e-08,
      "logps/chosen": -175.84097290039062,
      "logps/rejected": -281.7679443359375,
      "loss": 0.2748,
      "rewards/chosen": 1.2173833847045898,
      "rewards/margins": 5.868319034576416,
      "rewards/rejected": -4.650935649871826,
      "step": 3107
    },
    {
      "epoch": 0.81,
      "grad_norm": 47.980682373046875,
      "kl": 0.0,
      "learning_rate": 9.330018319811567e-08,
      "logps/chosen": -256.8486633300781,
      "logps/rejected": -277.4222412109375,
      "loss": 0.295,
      "rewards/chosen": 1.240675687789917,
      "rewards/margins": 6.917579650878906,
      "rewards/rejected": -5.676904201507568,
      "step": 3108
    },
    {
      "epoch": 0.81,
      "grad_norm": 41.02346420288086,
      "kl": 0.0,
      "learning_rate": 9.316932740120387e-08,
      "logps/chosen": -216.6624298095703,
      "logps/rejected": -325.7963562011719,
      "loss": 0.2024,
      "rewards/chosen": 0.9395179748535156,
      "rewards/margins": 3.6543073654174805,
      "rewards/rejected": -2.714789390563965,
      "step": 3109
    },
    {
      "epoch": 0.81,
      "grad_norm": 36.90616226196289,
      "kl": 0.0,
      "learning_rate": 9.303847160429206e-08,
      "logps/chosen": -180.5063018798828,
      "logps/rejected": -304.2553405761719,
      "loss": 0.2122,
      "rewards/chosen": 1.2775695323944092,
      "rewards/margins": 5.875765800476074,
      "rewards/rejected": -4.598196506500244,
      "step": 3110
    },
    {
      "epoch": 0.81,
      "grad_norm": 39.39779281616211,
      "kl": 0.0,
      "learning_rate": 9.290761580738027e-08,
      "logps/chosen": -241.53070068359375,
      "logps/rejected": -229.9088897705078,
      "loss": 0.3191,
      "rewards/chosen": 2.081984043121338,
      "rewards/margins": 6.383558750152588,
      "rewards/rejected": -4.30157470703125,
      "step": 3111
    },
    {
      "epoch": 0.81,
      "grad_norm": 32.545494079589844,
      "kl": 0.0,
      "learning_rate": 9.277676001046845e-08,
      "logps/chosen": -128.02120971679688,
      "logps/rejected": -202.7642059326172,
      "loss": 0.2003,
      "rewards/chosen": 2.5098001956939697,
      "rewards/margins": 5.570107460021973,
      "rewards/rejected": -3.060307502746582,
      "step": 3112
    },
    {
      "epoch": 0.81,
      "grad_norm": 26.97892189025879,
      "kl": 0.0,
      "learning_rate": 9.264590421355665e-08,
      "logps/chosen": -230.6981964111328,
      "logps/rejected": -196.49899291992188,
      "loss": 0.2041,
      "rewards/chosen": 2.649827241897583,
      "rewards/margins": 5.650113105773926,
      "rewards/rejected": -3.000286102294922,
      "step": 3113
    },
    {
      "epoch": 0.81,
      "grad_norm": 43.1445198059082,
      "kl": 0.0,
      "learning_rate": 9.251504841664486e-08,
      "logps/chosen": -194.0313720703125,
      "logps/rejected": -234.02328491210938,
      "loss": 0.2563,
      "rewards/chosen": 0.5206071138381958,
      "rewards/margins": 2.5637826919555664,
      "rewards/rejected": -2.043175458908081,
      "step": 3114
    },
    {
      "epoch": 0.82,
      "grad_norm": 34.82023620605469,
      "kl": 0.0,
      "learning_rate": 9.238419261973306e-08,
      "logps/chosen": -172.72352600097656,
      "logps/rejected": -207.0159912109375,
      "loss": 0.3734,
      "rewards/chosen": 0.12107408046722412,
      "rewards/margins": 3.2228050231933594,
      "rewards/rejected": -3.1017308235168457,
      "step": 3115
    },
    {
      "epoch": 0.82,
      "grad_norm": 38.627567291259766,
      "kl": 0.0,
      "learning_rate": 9.225333682282124e-08,
      "logps/chosen": -227.6630859375,
      "logps/rejected": -323.3218688964844,
      "loss": 0.2393,
      "rewards/chosen": 1.4282304048538208,
      "rewards/margins": 6.921871662139893,
      "rewards/rejected": -5.493641376495361,
      "step": 3116
    },
    {
      "epoch": 0.82,
      "grad_norm": 35.75371551513672,
      "kl": 0.0,
      "learning_rate": 9.212248102590945e-08,
      "logps/chosen": -199.0106201171875,
      "logps/rejected": -221.68994140625,
      "loss": 0.2503,
      "rewards/chosen": 1.679624319076538,
      "rewards/margins": 4.361454010009766,
      "rewards/rejected": -2.6818299293518066,
      "step": 3117
    },
    {
      "epoch": 0.82,
      "grad_norm": 24.521865844726562,
      "kl": 0.0,
      "learning_rate": 9.199162522899764e-08,
      "logps/chosen": -169.25497436523438,
      "logps/rejected": -308.73388671875,
      "loss": 0.1887,
      "rewards/chosen": 1.4655717611312866,
      "rewards/margins": 6.660384178161621,
      "rewards/rejected": -5.194812297821045,
      "step": 3118
    },
    {
      "epoch": 0.82,
      "grad_norm": 30.80224609375,
      "kl": 0.0,
      "learning_rate": 9.186076943208584e-08,
      "logps/chosen": -221.1312713623047,
      "logps/rejected": -227.04100036621094,
      "loss": 0.2005,
      "rewards/chosen": 1.6574556827545166,
      "rewards/margins": 5.227529525756836,
      "rewards/rejected": -3.5700738430023193,
      "step": 3119
    },
    {
      "epoch": 0.82,
      "grad_norm": 29.5427188873291,
      "kl": 0.0,
      "learning_rate": 9.172991363517404e-08,
      "logps/chosen": -170.79373168945312,
      "logps/rejected": -238.59603881835938,
      "loss": 0.1312,
      "rewards/chosen": 1.5359221696853638,
      "rewards/margins": 7.736850738525391,
      "rewards/rejected": -6.200928688049316,
      "step": 3120
    },
    {
      "epoch": 0.82,
      "grad_norm": 32.508872985839844,
      "kl": 0.0,
      "learning_rate": 9.159905783826223e-08,
      "logps/chosen": -310.5272521972656,
      "logps/rejected": -301.6024475097656,
      "loss": 0.1886,
      "rewards/chosen": 1.9752588272094727,
      "rewards/margins": 6.883115291595459,
      "rewards/rejected": -4.907856464385986,
      "step": 3121
    },
    {
      "epoch": 0.82,
      "grad_norm": 38.51951217651367,
      "kl": 0.0,
      "learning_rate": 9.146820204135043e-08,
      "logps/chosen": -228.05850219726562,
      "logps/rejected": -237.06768798828125,
      "loss": 0.3218,
      "rewards/chosen": -1.9714559316635132,
      "rewards/margins": 1.3739887475967407,
      "rewards/rejected": -3.345444679260254,
      "step": 3122
    },
    {
      "epoch": 0.82,
      "grad_norm": 31.491409301757812,
      "kl": 0.0,
      "learning_rate": 9.133734624443861e-08,
      "logps/chosen": -211.30628967285156,
      "logps/rejected": -281.8726501464844,
      "loss": 0.1737,
      "rewards/chosen": 1.4899225234985352,
      "rewards/margins": 5.676398277282715,
      "rewards/rejected": -4.18647575378418,
      "step": 3123
    },
    {
      "epoch": 0.82,
      "grad_norm": 26.910266876220703,
      "kl": 0.0,
      "learning_rate": 9.120649044752682e-08,
      "logps/chosen": -184.70957946777344,
      "logps/rejected": -263.70806884765625,
      "loss": 0.3086,
      "rewards/chosen": 0.3036360740661621,
      "rewards/margins": 4.226426124572754,
      "rewards/rejected": -3.922790288925171,
      "step": 3124
    },
    {
      "epoch": 0.82,
      "grad_norm": 39.30774688720703,
      "kl": 0.0,
      "learning_rate": 9.107563465061502e-08,
      "logps/chosen": -245.2247314453125,
      "logps/rejected": -242.0779266357422,
      "loss": 0.2454,
      "rewards/chosen": 0.7935837507247925,
      "rewards/margins": 4.356839179992676,
      "rewards/rejected": -3.563255548477173,
      "step": 3125
    },
    {
      "epoch": 0.82,
      "grad_norm": 36.49482345581055,
      "kl": 0.0,
      "learning_rate": 9.094477885370321e-08,
      "logps/chosen": -235.6526641845703,
      "logps/rejected": -206.50897216796875,
      "loss": 0.1504,
      "rewards/chosen": 2.0932252407073975,
      "rewards/margins": 6.678851127624512,
      "rewards/rejected": -4.585626125335693,
      "step": 3126
    },
    {
      "epoch": 0.82,
      "grad_norm": 35.809295654296875,
      "kl": 0.0,
      "learning_rate": 9.081392305679141e-08,
      "logps/chosen": -160.95736694335938,
      "logps/rejected": -175.06797790527344,
      "loss": 0.3326,
      "rewards/chosen": 0.812410831451416,
      "rewards/margins": 2.593329906463623,
      "rewards/rejected": -1.7809189558029175,
      "step": 3127
    },
    {
      "epoch": 0.82,
      "grad_norm": 38.98195266723633,
      "kl": 0.0,
      "learning_rate": 9.06830672598796e-08,
      "logps/chosen": -209.9936065673828,
      "logps/rejected": -307.116943359375,
      "loss": 0.2013,
      "rewards/chosen": 2.0579047203063965,
      "rewards/margins": 6.153413772583008,
      "rewards/rejected": -4.095509052276611,
      "step": 3128
    },
    {
      "epoch": 0.82,
      "grad_norm": 21.438058853149414,
      "kl": 0.0,
      "learning_rate": 9.05522114629678e-08,
      "logps/chosen": -157.38998413085938,
      "logps/rejected": -233.13430786132812,
      "loss": 0.3075,
      "rewards/chosen": -0.4898746609687805,
      "rewards/margins": 3.3792076110839844,
      "rewards/rejected": -3.86908221244812,
      "step": 3129
    },
    {
      "epoch": 0.82,
      "grad_norm": 29.17632293701172,
      "kl": 0.0,
      "learning_rate": 9.042135566605601e-08,
      "logps/chosen": -199.85702514648438,
      "logps/rejected": -213.35757446289062,
      "loss": 0.2223,
      "rewards/chosen": 2.3872687816619873,
      "rewards/margins": 5.311424255371094,
      "rewards/rejected": -2.9241557121276855,
      "step": 3130
    },
    {
      "epoch": 0.82,
      "grad_norm": 29.56111717224121,
      "kl": 0.0,
      "learning_rate": 9.029049986914419e-08,
      "logps/chosen": -213.77813720703125,
      "logps/rejected": -226.4279327392578,
      "loss": 0.2771,
      "rewards/chosen": 1.5997400283813477,
      "rewards/margins": 5.683679580688477,
      "rewards/rejected": -4.083939552307129,
      "step": 3131
    },
    {
      "epoch": 0.82,
      "grad_norm": 33.34666442871094,
      "kl": 0.0,
      "learning_rate": 9.015964407223239e-08,
      "logps/chosen": -209.387451171875,
      "logps/rejected": -244.4696044921875,
      "loss": 0.2269,
      "rewards/chosen": 0.500552237033844,
      "rewards/margins": 4.081018447875977,
      "rewards/rejected": -3.5804662704467773,
      "step": 3132
    },
    {
      "epoch": 0.82,
      "grad_norm": 30.667217254638672,
      "kl": 0.0,
      "learning_rate": 9.00287882753206e-08,
      "logps/chosen": -215.02833557128906,
      "logps/rejected": -169.5201873779297,
      "loss": 0.2921,
      "rewards/chosen": 0.9950622320175171,
      "rewards/margins": 3.234675407409668,
      "rewards/rejected": -2.2396130561828613,
      "step": 3133
    },
    {
      "epoch": 0.82,
      "grad_norm": 34.83656692504883,
      "kl": 0.0,
      "learning_rate": 8.98979324784088e-08,
      "logps/chosen": -212.655517578125,
      "logps/rejected": -229.4166259765625,
      "loss": 0.1796,
      "rewards/chosen": 1.0849932432174683,
      "rewards/margins": 5.354401111602783,
      "rewards/rejected": -4.269407749176025,
      "step": 3134
    },
    {
      "epoch": 0.82,
      "grad_norm": 25.507356643676758,
      "kl": 0.0,
      "learning_rate": 8.976707668149698e-08,
      "logps/chosen": -231.75711059570312,
      "logps/rejected": -250.7727813720703,
      "loss": 0.2336,
      "rewards/chosen": 1.0825836658477783,
      "rewards/margins": 4.7187910079956055,
      "rewards/rejected": -3.636207103729248,
      "step": 3135
    },
    {
      "epoch": 0.82,
      "grad_norm": 33.45425796508789,
      "kl": 0.0,
      "learning_rate": 8.963622088458519e-08,
      "logps/chosen": -207.46600341796875,
      "logps/rejected": -223.568359375,
      "loss": 0.3145,
      "rewards/chosen": 1.58919095993042,
      "rewards/margins": 3.960312843322754,
      "rewards/rejected": -2.371121883392334,
      "step": 3136
    },
    {
      "epoch": 0.82,
      "grad_norm": 30.995628356933594,
      "kl": 0.0,
      "learning_rate": 8.950536508767338e-08,
      "logps/chosen": -165.9071502685547,
      "logps/rejected": -265.2596130371094,
      "loss": 0.183,
      "rewards/chosen": 1.7723157405853271,
      "rewards/margins": 5.585575580596924,
      "rewards/rejected": -3.8132598400115967,
      "step": 3137
    },
    {
      "epoch": 0.82,
      "grad_norm": 44.489688873291016,
      "kl": 0.0,
      "learning_rate": 8.937450929076158e-08,
      "logps/chosen": -276.55706787109375,
      "logps/rejected": -195.18429565429688,
      "loss": 0.2868,
      "rewards/chosen": 2.160369396209717,
      "rewards/margins": 3.251467227935791,
      "rewards/rejected": -1.0910977125167847,
      "step": 3138
    },
    {
      "epoch": 0.82,
      "grad_norm": 30.777244567871094,
      "kl": 0.0,
      "learning_rate": 8.924365349384978e-08,
      "logps/chosen": -179.30703735351562,
      "logps/rejected": -178.9294891357422,
      "loss": 0.2333,
      "rewards/chosen": 1.8039159774780273,
      "rewards/margins": 4.568762302398682,
      "rewards/rejected": -2.7648463249206543,
      "step": 3139
    },
    {
      "epoch": 0.82,
      "grad_norm": 92.86982727050781,
      "kl": 0.0,
      "learning_rate": 8.911279769693797e-08,
      "logps/chosen": -151.19696044921875,
      "logps/rejected": -204.7473602294922,
      "loss": 0.2595,
      "rewards/chosen": 1.152949571609497,
      "rewards/margins": 5.322902679443359,
      "rewards/rejected": -4.169952869415283,
      "step": 3140
    },
    {
      "epoch": 0.82,
      "grad_norm": 36.63436508178711,
      "kl": 0.0,
      "learning_rate": 8.898194190002617e-08,
      "logps/chosen": -215.37210083007812,
      "logps/rejected": -314.5213317871094,
      "loss": 0.2001,
      "rewards/chosen": 2.3977222442626953,
      "rewards/margins": 5.928170204162598,
      "rewards/rejected": -3.5304479598999023,
      "step": 3141
    },
    {
      "epoch": 0.82,
      "grad_norm": 35.54846954345703,
      "kl": 0.0,
      "learning_rate": 8.885108610311436e-08,
      "logps/chosen": -189.39260864257812,
      "logps/rejected": -171.74534606933594,
      "loss": 0.2853,
      "rewards/chosen": 0.25655680894851685,
      "rewards/margins": 3.7083852291107178,
      "rewards/rejected": -3.4518284797668457,
      "step": 3142
    },
    {
      "epoch": 0.82,
      "grad_norm": 39.20547103881836,
      "kl": 0.0,
      "learning_rate": 8.872023030620256e-08,
      "logps/chosen": -197.0155029296875,
      "logps/rejected": -288.4905700683594,
      "loss": 0.2787,
      "rewards/chosen": 0.012832609005272388,
      "rewards/margins": 3.237461805343628,
      "rewards/rejected": -3.2246291637420654,
      "step": 3143
    },
    {
      "epoch": 0.82,
      "grad_norm": 38.3444938659668,
      "kl": 0.0,
      "learning_rate": 8.858937450929076e-08,
      "logps/chosen": -202.22515869140625,
      "logps/rejected": -263.79852294921875,
      "loss": 0.2313,
      "rewards/chosen": 2.373213529586792,
      "rewards/margins": 5.190075874328613,
      "rewards/rejected": -2.8168625831604004,
      "step": 3144
    },
    {
      "epoch": 0.82,
      "grad_norm": 29.71002769470215,
      "kl": 0.0,
      "learning_rate": 8.845851871237895e-08,
      "logps/chosen": -299.419921875,
      "logps/rejected": -257.6999206542969,
      "loss": 0.0992,
      "rewards/chosen": 2.6316373348236084,
      "rewards/margins": 6.821929931640625,
      "rewards/rejected": -4.190292835235596,
      "step": 3145
    },
    {
      "epoch": 0.82,
      "grad_norm": 26.98137092590332,
      "kl": 0.0,
      "learning_rate": 8.832766291546716e-08,
      "logps/chosen": -159.2989959716797,
      "logps/rejected": -230.6924591064453,
      "loss": 0.2329,
      "rewards/chosen": 0.03763216733932495,
      "rewards/margins": 4.52135705947876,
      "rewards/rejected": -4.483725070953369,
      "step": 3146
    },
    {
      "epoch": 0.82,
      "grad_norm": 33.412445068359375,
      "kl": 0.0,
      "learning_rate": 8.819680711855534e-08,
      "logps/chosen": -233.58352661132812,
      "logps/rejected": -160.78826904296875,
      "loss": 0.2634,
      "rewards/chosen": 0.21745315194129944,
      "rewards/margins": 3.5812926292419434,
      "rewards/rejected": -3.3638393878936768,
      "step": 3147
    },
    {
      "epoch": 0.82,
      "grad_norm": 36.100624084472656,
      "kl": 0.0,
      "learning_rate": 8.806595132164354e-08,
      "logps/chosen": -197.14633178710938,
      "logps/rejected": -150.0326385498047,
      "loss": 0.3934,
      "rewards/chosen": 0.8015373349189758,
      "rewards/margins": 2.3115899562835693,
      "rewards/rejected": -1.5100525617599487,
      "step": 3148
    },
    {
      "epoch": 0.82,
      "grad_norm": 36.14215850830078,
      "kl": 0.0,
      "learning_rate": 8.793509552473175e-08,
      "logps/chosen": -221.76010131835938,
      "logps/rejected": -222.3566131591797,
      "loss": 0.1804,
      "rewards/chosen": 0.9301916360855103,
      "rewards/margins": 5.47138786315918,
      "rewards/rejected": -4.541196346282959,
      "step": 3149
    },
    {
      "epoch": 0.82,
      "grad_norm": 33.10061264038086,
      "kl": 0.0,
      "learning_rate": 8.780423972781995e-08,
      "logps/chosen": -146.35745239257812,
      "logps/rejected": -314.1618347167969,
      "loss": 0.2803,
      "rewards/chosen": 0.9166402220726013,
      "rewards/margins": 2.8424408435821533,
      "rewards/rejected": -1.9258005619049072,
      "step": 3150
    },
    {
      "epoch": 0.82,
      "grad_norm": 25.759798049926758,
      "kl": 0.0,
      "learning_rate": 8.767338393090813e-08,
      "logps/chosen": -185.44004821777344,
      "logps/rejected": -284.9614562988281,
      "loss": 0.1916,
      "rewards/chosen": 1.393906593322754,
      "rewards/margins": 7.029627799987793,
      "rewards/rejected": -5.635721206665039,
      "step": 3151
    },
    {
      "epoch": 0.82,
      "grad_norm": 25.99704933166504,
      "kl": 0.0,
      "learning_rate": 8.754252813399634e-08,
      "logps/chosen": -290.52667236328125,
      "logps/rejected": -226.89105224609375,
      "loss": 0.1421,
      "rewards/chosen": 1.078086018562317,
      "rewards/margins": 5.805100917816162,
      "rewards/rejected": -4.727015018463135,
      "step": 3152
    },
    {
      "epoch": 0.83,
      "grad_norm": 33.87999725341797,
      "kl": 0.0,
      "learning_rate": 8.741167233708453e-08,
      "logps/chosen": -184.73353576660156,
      "logps/rejected": -241.3988037109375,
      "loss": 0.2319,
      "rewards/chosen": 1.0597847700119019,
      "rewards/margins": 4.234382152557373,
      "rewards/rejected": -3.1745972633361816,
      "step": 3153
    },
    {
      "epoch": 0.83,
      "grad_norm": 42.77838897705078,
      "kl": 0.0,
      "learning_rate": 8.728081654017272e-08,
      "logps/chosen": -162.68724060058594,
      "logps/rejected": -234.38644409179688,
      "loss": 0.2463,
      "rewards/chosen": 1.4074424505233765,
      "rewards/margins": 3.403412342071533,
      "rewards/rejected": -1.9959698915481567,
      "step": 3154
    },
    {
      "epoch": 0.83,
      "grad_norm": 29.633657455444336,
      "kl": 0.0,
      "learning_rate": 8.714996074326093e-08,
      "logps/chosen": -206.44908142089844,
      "logps/rejected": -217.1102752685547,
      "loss": 0.1632,
      "rewards/chosen": 1.5870789289474487,
      "rewards/margins": 4.748828887939453,
      "rewards/rejected": -3.161750078201294,
      "step": 3155
    },
    {
      "epoch": 0.83,
      "grad_norm": 33.112728118896484,
      "kl": 0.0,
      "learning_rate": 8.701910494634912e-08,
      "logps/chosen": -136.0078582763672,
      "logps/rejected": -235.0076904296875,
      "loss": 0.2412,
      "rewards/chosen": 1.6538118124008179,
      "rewards/margins": 4.994295120239258,
      "rewards/rejected": -3.3404834270477295,
      "step": 3156
    },
    {
      "epoch": 0.83,
      "grad_norm": 37.93458557128906,
      "kl": 0.0,
      "learning_rate": 8.688824914943732e-08,
      "logps/chosen": -231.58319091796875,
      "logps/rejected": -197.50265502929688,
      "loss": 0.2361,
      "rewards/chosen": 1.7503066062927246,
      "rewards/margins": 4.487408638000488,
      "rewards/rejected": -2.7371020317077637,
      "step": 3157
    },
    {
      "epoch": 0.83,
      "grad_norm": 45.65302276611328,
      "kl": 0.0,
      "learning_rate": 8.67573933525255e-08,
      "logps/chosen": -142.3944854736328,
      "logps/rejected": -182.17701721191406,
      "loss": 0.2098,
      "rewards/chosen": 1.1196633577346802,
      "rewards/margins": 4.7605438232421875,
      "rewards/rejected": -3.640880584716797,
      "step": 3158
    },
    {
      "epoch": 0.83,
      "grad_norm": 38.333126068115234,
      "kl": 0.0,
      "learning_rate": 8.662653755561371e-08,
      "logps/chosen": -209.46014404296875,
      "logps/rejected": -217.16123962402344,
      "loss": 0.2337,
      "rewards/chosen": -0.3171456456184387,
      "rewards/margins": 2.737488031387329,
      "rewards/rejected": -3.054633617401123,
      "step": 3159
    },
    {
      "epoch": 0.83,
      "grad_norm": 28.92974853515625,
      "kl": 0.0,
      "learning_rate": 8.64956817587019e-08,
      "logps/chosen": -213.71473693847656,
      "logps/rejected": -350.8567199707031,
      "loss": 0.2021,
      "rewards/chosen": 1.1655019521713257,
      "rewards/margins": 4.887118339538574,
      "rewards/rejected": -3.721616506576538,
      "step": 3160
    },
    {
      "epoch": 0.83,
      "grad_norm": 34.88011169433594,
      "kl": 0.0,
      "learning_rate": 8.63648259617901e-08,
      "logps/chosen": -208.52581787109375,
      "logps/rejected": -267.2923889160156,
      "loss": 0.2536,
      "rewards/chosen": 1.8001902103424072,
      "rewards/margins": 5.021884918212891,
      "rewards/rejected": -3.2216944694519043,
      "step": 3161
    },
    {
      "epoch": 0.83,
      "grad_norm": 37.72634506225586,
      "kl": 0.0,
      "learning_rate": 8.62339701648783e-08,
      "logps/chosen": -271.94342041015625,
      "logps/rejected": -235.6877899169922,
      "loss": 0.2743,
      "rewards/chosen": 1.3427650928497314,
      "rewards/margins": 5.064581871032715,
      "rewards/rejected": -3.7218167781829834,
      "step": 3162
    },
    {
      "epoch": 0.83,
      "grad_norm": 31.424816131591797,
      "kl": 0.0,
      "learning_rate": 8.61031143679665e-08,
      "logps/chosen": -225.39549255371094,
      "logps/rejected": -277.0132141113281,
      "loss": 0.2178,
      "rewards/chosen": 1.022281527519226,
      "rewards/margins": 4.791353702545166,
      "rewards/rejected": -3.7690722942352295,
      "step": 3163
    },
    {
      "epoch": 0.83,
      "grad_norm": 47.377498626708984,
      "kl": 0.0,
      "learning_rate": 8.597225857105469e-08,
      "logps/chosen": -183.3324737548828,
      "logps/rejected": -308.13092041015625,
      "loss": 0.2461,
      "rewards/chosen": 1.7639703750610352,
      "rewards/margins": 6.307750225067139,
      "rewards/rejected": -4.5437798500061035,
      "step": 3164
    },
    {
      "epoch": 0.83,
      "grad_norm": 26.83221435546875,
      "kl": 0.0,
      "learning_rate": 8.58414027741429e-08,
      "logps/chosen": -236.23025512695312,
      "logps/rejected": -288.96246337890625,
      "loss": 0.1597,
      "rewards/chosen": 1.581776738166809,
      "rewards/margins": 7.115726470947266,
      "rewards/rejected": -5.533949851989746,
      "step": 3165
    },
    {
      "epoch": 0.83,
      "grad_norm": 33.347965240478516,
      "kl": 0.0,
      "learning_rate": 8.571054697723108e-08,
      "logps/chosen": -196.5367889404297,
      "logps/rejected": -342.7577819824219,
      "loss": 0.1677,
      "rewards/chosen": 1.9373396635055542,
      "rewards/margins": 6.118312358856201,
      "rewards/rejected": -4.180972576141357,
      "step": 3166
    },
    {
      "epoch": 0.83,
      "grad_norm": 49.83609390258789,
      "kl": 0.0,
      "learning_rate": 8.557969118031928e-08,
      "logps/chosen": -233.40382385253906,
      "logps/rejected": -167.3038330078125,
      "loss": 0.2879,
      "rewards/chosen": 2.240593433380127,
      "rewards/margins": 5.706521034240723,
      "rewards/rejected": -3.4659276008605957,
      "step": 3167
    },
    {
      "epoch": 0.83,
      "grad_norm": 33.22333526611328,
      "kl": 0.0,
      "learning_rate": 8.544883538340749e-08,
      "logps/chosen": -177.73777770996094,
      "logps/rejected": -172.5456085205078,
      "loss": 0.1696,
      "rewards/chosen": 2.7628121376037598,
      "rewards/margins": 4.378305912017822,
      "rewards/rejected": -1.6154937744140625,
      "step": 3168
    },
    {
      "epoch": 0.83,
      "grad_norm": 35.4307975769043,
      "kl": 0.0,
      "learning_rate": 8.531797958649568e-08,
      "logps/chosen": -221.61648559570312,
      "logps/rejected": -269.2848815917969,
      "loss": 0.1578,
      "rewards/chosen": 1.386556625366211,
      "rewards/margins": 5.333159923553467,
      "rewards/rejected": -3.946603298187256,
      "step": 3169
    },
    {
      "epoch": 0.83,
      "grad_norm": 28.6766357421875,
      "kl": 0.0,
      "learning_rate": 8.518712378958387e-08,
      "logps/chosen": -216.0562286376953,
      "logps/rejected": -271.5375671386719,
      "loss": 0.2548,
      "rewards/chosen": -1.8031291961669922,
      "rewards/margins": 2.5379562377929688,
      "rewards/rejected": -4.341085433959961,
      "step": 3170
    },
    {
      "epoch": 0.83,
      "grad_norm": 28.462066650390625,
      "kl": 0.0,
      "learning_rate": 8.505626799267208e-08,
      "logps/chosen": -202.88999938964844,
      "logps/rejected": -323.1724853515625,
      "loss": 0.24,
      "rewards/chosen": 2.3474764823913574,
      "rewards/margins": 6.388828277587891,
      "rewards/rejected": -4.041351795196533,
      "step": 3171
    },
    {
      "epoch": 0.83,
      "grad_norm": 47.872802734375,
      "kl": 0.0,
      "learning_rate": 8.492541219576027e-08,
      "logps/chosen": -279.97625732421875,
      "logps/rejected": -255.40150451660156,
      "loss": 0.1864,
      "rewards/chosen": 2.105032444000244,
      "rewards/margins": 5.517726898193359,
      "rewards/rejected": -3.4126944541931152,
      "step": 3172
    },
    {
      "epoch": 0.83,
      "grad_norm": 36.41221618652344,
      "kl": 0.0,
      "learning_rate": 8.479455639884847e-08,
      "logps/chosen": -246.18795776367188,
      "logps/rejected": -206.72479248046875,
      "loss": 0.3049,
      "rewards/chosen": 1.8870315551757812,
      "rewards/margins": 4.641598224639893,
      "rewards/rejected": -2.7545666694641113,
      "step": 3173
    },
    {
      "epoch": 0.83,
      "grad_norm": 29.25550651550293,
      "kl": 0.0,
      "learning_rate": 8.466370060193665e-08,
      "logps/chosen": -235.82455444335938,
      "logps/rejected": -185.5269775390625,
      "loss": 0.2105,
      "rewards/chosen": 2.9374399185180664,
      "rewards/margins": 4.32605504989624,
      "rewards/rejected": -1.3886151313781738,
      "step": 3174
    },
    {
      "epoch": 0.83,
      "grad_norm": 32.950496673583984,
      "kl": 0.0,
      "learning_rate": 8.453284480502486e-08,
      "logps/chosen": -181.21908569335938,
      "logps/rejected": -235.7977752685547,
      "loss": 0.2509,
      "rewards/chosen": 1.955785870552063,
      "rewards/margins": 4.768850803375244,
      "rewards/rejected": -2.8130650520324707,
      "step": 3175
    },
    {
      "epoch": 0.83,
      "grad_norm": 38.27532958984375,
      "kl": 0.0,
      "learning_rate": 8.440198900811306e-08,
      "logps/chosen": -173.7947998046875,
      "logps/rejected": -211.60086059570312,
      "loss": 0.3025,
      "rewards/chosen": 0.642396867275238,
      "rewards/margins": 2.6172077655792236,
      "rewards/rejected": -1.9748109579086304,
      "step": 3176
    },
    {
      "epoch": 0.83,
      "grad_norm": 28.526336669921875,
      "kl": 0.0,
      "learning_rate": 8.427113321120125e-08,
      "logps/chosen": -205.12777709960938,
      "logps/rejected": -226.22775268554688,
      "loss": 0.2181,
      "rewards/chosen": 2.4885826110839844,
      "rewards/margins": 5.492410659790039,
      "rewards/rejected": -3.0038280487060547,
      "step": 3177
    },
    {
      "epoch": 0.83,
      "grad_norm": 30.261062622070312,
      "kl": 0.0,
      "learning_rate": 8.414027741428945e-08,
      "logps/chosen": -155.34446716308594,
      "logps/rejected": -308.4963073730469,
      "loss": 0.1552,
      "rewards/chosen": 1.2393455505371094,
      "rewards/margins": 6.379164218902588,
      "rewards/rejected": -5.1398186683654785,
      "step": 3178
    },
    {
      "epoch": 0.83,
      "grad_norm": 51.739192962646484,
      "kl": 0.0,
      "learning_rate": 8.400942161737765e-08,
      "logps/chosen": -218.99700927734375,
      "logps/rejected": -197.27005004882812,
      "loss": 0.2523,
      "rewards/chosen": 2.2029366493225098,
      "rewards/margins": 4.013391017913818,
      "rewards/rejected": -1.8104544878005981,
      "step": 3179
    },
    {
      "epoch": 0.83,
      "grad_norm": 37.45707702636719,
      "kl": 0.0,
      "learning_rate": 8.387856582046584e-08,
      "logps/chosen": -247.09036254882812,
      "logps/rejected": -280.1000061035156,
      "loss": 0.2628,
      "rewards/chosen": -0.34871169924736023,
      "rewards/margins": 3.9044902324676514,
      "rewards/rejected": -4.253201961517334,
      "step": 3180
    },
    {
      "epoch": 0.83,
      "grad_norm": 35.54933547973633,
      "kl": 0.0,
      "learning_rate": 8.374771002355404e-08,
      "logps/chosen": -289.2021179199219,
      "logps/rejected": -234.57896423339844,
      "loss": 0.1353,
      "rewards/chosen": 2.026050329208374,
      "rewards/margins": 6.521479606628418,
      "rewards/rejected": -4.495429039001465,
      "step": 3181
    },
    {
      "epoch": 0.83,
      "grad_norm": 36.762245178222656,
      "kl": 0.0,
      "learning_rate": 8.361685422664223e-08,
      "logps/chosen": -194.93182373046875,
      "logps/rejected": -270.8766174316406,
      "loss": 0.2034,
      "rewards/chosen": -0.8206627368927002,
      "rewards/margins": 2.1558444499969482,
      "rewards/rejected": -2.9765071868896484,
      "step": 3182
    },
    {
      "epoch": 0.83,
      "grad_norm": 26.107078552246094,
      "kl": 0.0,
      "learning_rate": 8.348599842973043e-08,
      "logps/chosen": -207.82778930664062,
      "logps/rejected": -352.070068359375,
      "loss": 0.125,
      "rewards/chosen": 1.0949733257293701,
      "rewards/margins": 7.711993217468262,
      "rewards/rejected": -6.6170196533203125,
      "step": 3183
    },
    {
      "epoch": 0.83,
      "grad_norm": 21.953126907348633,
      "kl": 0.0,
      "learning_rate": 8.335514263281864e-08,
      "logps/chosen": -177.77029418945312,
      "logps/rejected": -306.3573913574219,
      "loss": 0.1084,
      "rewards/chosen": 2.5120620727539062,
      "rewards/margins": 7.031912803649902,
      "rewards/rejected": -4.519850730895996,
      "step": 3184
    },
    {
      "epoch": 0.83,
      "grad_norm": 30.031909942626953,
      "kl": 0.0,
      "learning_rate": 8.322428683590682e-08,
      "logps/chosen": -233.8975372314453,
      "logps/rejected": -269.39111328125,
      "loss": 0.1616,
      "rewards/chosen": 1.1202813386917114,
      "rewards/margins": 6.912496566772461,
      "rewards/rejected": -5.792215347290039,
      "step": 3185
    },
    {
      "epoch": 0.83,
      "grad_norm": 33.76377868652344,
      "kl": 0.0,
      "learning_rate": 8.309343103899502e-08,
      "logps/chosen": -234.36184692382812,
      "logps/rejected": -317.62469482421875,
      "loss": 0.0828,
      "rewards/chosen": 0.8755607604980469,
      "rewards/margins": 5.862431049346924,
      "rewards/rejected": -4.986870288848877,
      "step": 3186
    },
    {
      "epoch": 0.83,
      "grad_norm": 25.309988021850586,
      "kl": 0.0,
      "learning_rate": 8.296257524208323e-08,
      "logps/chosen": -219.3397674560547,
      "logps/rejected": -196.34658813476562,
      "loss": 0.1625,
      "rewards/chosen": 0.9604638814926147,
      "rewards/margins": 5.255552768707275,
      "rewards/rejected": -4.295088768005371,
      "step": 3187
    },
    {
      "epoch": 0.83,
      "grad_norm": 24.442615509033203,
      "kl": 0.0,
      "learning_rate": 8.283171944517142e-08,
      "logps/chosen": -199.65635681152344,
      "logps/rejected": -239.56736755371094,
      "loss": 0.2406,
      "rewards/chosen": 1.292781949043274,
      "rewards/margins": 4.967615604400635,
      "rewards/rejected": -3.6748335361480713,
      "step": 3188
    },
    {
      "epoch": 0.83,
      "grad_norm": 27.33976173400879,
      "kl": 0.0,
      "learning_rate": 8.27008636482596e-08,
      "logps/chosen": -217.6470947265625,
      "logps/rejected": -220.995849609375,
      "loss": 0.1736,
      "rewards/chosen": 1.8959113359451294,
      "rewards/margins": 5.904285907745361,
      "rewards/rejected": -4.0083746910095215,
      "step": 3189
    },
    {
      "epoch": 0.83,
      "grad_norm": 29.29962158203125,
      "kl": 0.0,
      "learning_rate": 8.25700078513478e-08,
      "logps/chosen": -264.2421875,
      "logps/rejected": -306.50390625,
      "loss": 0.2428,
      "rewards/chosen": 1.8079992532730103,
      "rewards/margins": 7.141074180603027,
      "rewards/rejected": -5.333075046539307,
      "step": 3190
    },
    {
      "epoch": 0.84,
      "grad_norm": 39.233707427978516,
      "kl": 0.0,
      "learning_rate": 8.243915205443601e-08,
      "logps/chosen": -252.67047119140625,
      "logps/rejected": -298.96563720703125,
      "loss": 0.2574,
      "rewards/chosen": 2.291105270385742,
      "rewards/margins": 5.163873195648193,
      "rewards/rejected": -2.872767925262451,
      "step": 3191
    },
    {
      "epoch": 0.84,
      "grad_norm": 28.222354888916016,
      "kl": 0.0,
      "learning_rate": 8.230829625752421e-08,
      "logps/chosen": -224.46841430664062,
      "logps/rejected": -273.3043518066406,
      "loss": 0.1879,
      "rewards/chosen": 1.3577877283096313,
      "rewards/margins": 4.819271087646484,
      "rewards/rejected": -3.4614834785461426,
      "step": 3192
    },
    {
      "epoch": 0.84,
      "grad_norm": 35.241722106933594,
      "kl": 0.0,
      "learning_rate": 8.217744046061239e-08,
      "logps/chosen": -227.81884765625,
      "logps/rejected": -307.7884826660156,
      "loss": 0.2358,
      "rewards/chosen": 1.4391993284225464,
      "rewards/margins": 5.772876739501953,
      "rewards/rejected": -4.333677291870117,
      "step": 3193
    },
    {
      "epoch": 0.84,
      "grad_norm": 23.206933975219727,
      "kl": 0.0,
      "learning_rate": 8.20465846637006e-08,
      "logps/chosen": -206.38482666015625,
      "logps/rejected": -225.6373291015625,
      "loss": 0.1811,
      "rewards/chosen": 1.8850288391113281,
      "rewards/margins": 5.365200519561768,
      "rewards/rejected": -3.4801716804504395,
      "step": 3194
    },
    {
      "epoch": 0.84,
      "grad_norm": 25.119455337524414,
      "kl": 0.0,
      "learning_rate": 8.19157288667888e-08,
      "logps/chosen": -173.25401306152344,
      "logps/rejected": -264.8511657714844,
      "loss": 0.2923,
      "rewards/chosen": -0.01973581314086914,
      "rewards/margins": 3.4208624362945557,
      "rewards/rejected": -3.440598249435425,
      "step": 3195
    },
    {
      "epoch": 0.84,
      "grad_norm": 31.192039489746094,
      "kl": 0.0,
      "learning_rate": 8.178487306987699e-08,
      "logps/chosen": -157.18247985839844,
      "logps/rejected": -212.47877502441406,
      "loss": 0.3075,
      "rewards/chosen": 1.0312573909759521,
      "rewards/margins": 3.534039258956909,
      "rewards/rejected": -2.502781867980957,
      "step": 3196
    },
    {
      "epoch": 0.84,
      "grad_norm": 24.054067611694336,
      "kl": 0.0,
      "learning_rate": 8.165401727296519e-08,
      "logps/chosen": -145.00535583496094,
      "logps/rejected": -198.6532745361328,
      "loss": 0.2582,
      "rewards/chosen": -0.1868196278810501,
      "rewards/margins": 3.4146223068237305,
      "rewards/rejected": -3.6014418601989746,
      "step": 3197
    },
    {
      "epoch": 0.84,
      "grad_norm": 28.315120697021484,
      "kl": 0.0,
      "learning_rate": 8.152316147605338e-08,
      "logps/chosen": -310.1528625488281,
      "logps/rejected": -249.92959594726562,
      "loss": 0.2273,
      "rewards/chosen": -0.11365220695734024,
      "rewards/margins": 4.29476261138916,
      "rewards/rejected": -4.408414840698242,
      "step": 3198
    },
    {
      "epoch": 0.84,
      "grad_norm": 30.878515243530273,
      "kl": 0.0,
      "learning_rate": 8.139230567914158e-08,
      "logps/chosen": -269.9025573730469,
      "logps/rejected": -238.60348510742188,
      "loss": 0.1857,
      "rewards/chosen": 1.1058082580566406,
      "rewards/margins": 5.0078125,
      "rewards/rejected": -3.9020044803619385,
      "step": 3199
    },
    {
      "epoch": 0.84,
      "grad_norm": 36.15157699584961,
      "kl": 0.0,
      "learning_rate": 8.126144988222979e-08,
      "logps/chosen": -282.2198791503906,
      "logps/rejected": -272.2900085449219,
      "loss": 0.2517,
      "rewards/chosen": 0.5260896682739258,
      "rewards/margins": 5.039384841918945,
      "rewards/rejected": -4.5132951736450195,
      "step": 3200
    },
    {
      "epoch": 0.84,
      "grad_norm": 27.950164794921875,
      "kl": 0.0,
      "learning_rate": 8.113059408531797e-08,
      "logps/chosen": -236.32823181152344,
      "logps/rejected": -230.14259338378906,
      "loss": 0.1538,
      "rewards/chosen": 3.0094211101531982,
      "rewards/margins": 7.373923301696777,
      "rewards/rejected": -4.364501953125,
      "step": 3201
    },
    {
      "epoch": 0.84,
      "grad_norm": 38.57584762573242,
      "kl": 0.0,
      "learning_rate": 8.099973828840617e-08,
      "logps/chosen": -192.27294921875,
      "logps/rejected": -292.67828369140625,
      "loss": 0.1894,
      "rewards/chosen": 1.4443457126617432,
      "rewards/margins": 6.036561965942383,
      "rewards/rejected": -4.5922160148620605,
      "step": 3202
    },
    {
      "epoch": 0.84,
      "grad_norm": 38.501190185546875,
      "kl": 0.0,
      "learning_rate": 8.086888249149438e-08,
      "logps/chosen": -198.6505126953125,
      "logps/rejected": -278.82122802734375,
      "loss": 0.231,
      "rewards/chosen": 1.5485109090805054,
      "rewards/margins": 4.2544755935668945,
      "rewards/rejected": -2.7059645652770996,
      "step": 3203
    },
    {
      "epoch": 0.84,
      "grad_norm": 31.10076141357422,
      "kl": 0.0,
      "learning_rate": 8.073802669458257e-08,
      "logps/chosen": -126.0052490234375,
      "logps/rejected": -273.44488525390625,
      "loss": 0.2894,
      "rewards/chosen": 0.5634855031967163,
      "rewards/margins": 4.985409259796143,
      "rewards/rejected": -4.421923637390137,
      "step": 3204
    },
    {
      "epoch": 0.84,
      "grad_norm": 25.549251556396484,
      "kl": 0.0,
      "learning_rate": 8.060717089767076e-08,
      "logps/chosen": -222.45462036132812,
      "logps/rejected": -254.40975952148438,
      "loss": 0.2332,
      "rewards/chosen": 0.2601969540119171,
      "rewards/margins": 3.998560905456543,
      "rewards/rejected": -3.7383639812469482,
      "step": 3205
    },
    {
      "epoch": 0.84,
      "grad_norm": 33.01713562011719,
      "kl": 0.0,
      "learning_rate": 8.047631510075895e-08,
      "logps/chosen": -177.3705596923828,
      "logps/rejected": -257.90863037109375,
      "loss": 0.2629,
      "rewards/chosen": 0.653936505317688,
      "rewards/margins": 4.622261047363281,
      "rewards/rejected": -3.9683244228363037,
      "step": 3206
    },
    {
      "epoch": 0.84,
      "grad_norm": 38.29364776611328,
      "kl": 0.0,
      "learning_rate": 8.034545930384716e-08,
      "logps/chosen": -144.2410125732422,
      "logps/rejected": -287.2203674316406,
      "loss": 0.2483,
      "rewards/chosen": 0.690515398979187,
      "rewards/margins": 6.036245822906494,
      "rewards/rejected": -5.345730304718018,
      "step": 3207
    },
    {
      "epoch": 0.84,
      "grad_norm": 35.30019760131836,
      "kl": 0.0,
      "learning_rate": 8.021460350693536e-08,
      "logps/chosen": -192.75279235839844,
      "logps/rejected": -280.9093017578125,
      "loss": 0.2415,
      "rewards/chosen": 1.6136605739593506,
      "rewards/margins": 4.508999824523926,
      "rewards/rejected": -2.8953394889831543,
      "step": 3208
    },
    {
      "epoch": 0.84,
      "grad_norm": 30.80791473388672,
      "kl": 0.0,
      "learning_rate": 8.008374771002354e-08,
      "logps/chosen": -219.2487335205078,
      "logps/rejected": -264.60992431640625,
      "loss": 0.3327,
      "rewards/chosen": 0.5078659057617188,
      "rewards/margins": 4.614754676818848,
      "rewards/rejected": -4.106888771057129,
      "step": 3209
    },
    {
      "epoch": 0.84,
      "grad_norm": 28.3983154296875,
      "kl": 0.0,
      "learning_rate": 7.995289191311175e-08,
      "logps/chosen": -202.18222045898438,
      "logps/rejected": -248.34375,
      "loss": 0.153,
      "rewards/chosen": 2.136230707168579,
      "rewards/margins": 6.461572647094727,
      "rewards/rejected": -4.325342178344727,
      "step": 3210
    },
    {
      "epoch": 0.84,
      "grad_norm": 32.261634826660156,
      "kl": 0.0,
      "learning_rate": 7.982203611619995e-08,
      "logps/chosen": -181.50257873535156,
      "logps/rejected": -274.13519287109375,
      "loss": 0.2007,
      "rewards/chosen": 1.8369464874267578,
      "rewards/margins": 4.2242255210876465,
      "rewards/rejected": -2.3872790336608887,
      "step": 3211
    },
    {
      "epoch": 0.84,
      "grad_norm": 31.96755599975586,
      "kl": 0.0,
      "learning_rate": 7.969118031928813e-08,
      "logps/chosen": -151.80557250976562,
      "logps/rejected": -224.90036010742188,
      "loss": 0.4,
      "rewards/chosen": 0.3806571364402771,
      "rewards/margins": 2.918895721435547,
      "rewards/rejected": -2.538238525390625,
      "step": 3212
    },
    {
      "epoch": 0.84,
      "grad_norm": 27.303483963012695,
      "kl": 0.0,
      "learning_rate": 7.956032452237634e-08,
      "logps/chosen": -276.4804992675781,
      "logps/rejected": -318.6486511230469,
      "loss": 0.1806,
      "rewards/chosen": 0.7546684145927429,
      "rewards/margins": 6.069582462310791,
      "rewards/rejected": -5.314914226531982,
      "step": 3213
    },
    {
      "epoch": 0.84,
      "grad_norm": 51.559303283691406,
      "kl": 0.0,
      "learning_rate": 7.942946872546454e-08,
      "logps/chosen": -243.2832794189453,
      "logps/rejected": -192.78231811523438,
      "loss": 0.3287,
      "rewards/chosen": 1.5706136226654053,
      "rewards/margins": 4.6659345626831055,
      "rewards/rejected": -3.0953211784362793,
      "step": 3214
    },
    {
      "epoch": 0.84,
      "grad_norm": 38.36349868774414,
      "kl": 0.0,
      "learning_rate": 7.929861292855273e-08,
      "logps/chosen": -134.96987915039062,
      "logps/rejected": -270.34259033203125,
      "loss": 0.1782,
      "rewards/chosen": 2.149251937866211,
      "rewards/margins": 4.776583194732666,
      "rewards/rejected": -2.627331256866455,
      "step": 3215
    },
    {
      "epoch": 0.84,
      "grad_norm": 29.82472038269043,
      "kl": 0.0,
      "learning_rate": 7.916775713164093e-08,
      "logps/chosen": -255.9649200439453,
      "logps/rejected": -271.6165771484375,
      "loss": 0.2529,
      "rewards/chosen": 1.3246444463729858,
      "rewards/margins": 6.542586803436279,
      "rewards/rejected": -5.217942237854004,
      "step": 3216
    },
    {
      "epoch": 0.84,
      "grad_norm": 37.64690399169922,
      "kl": 0.0,
      "learning_rate": 7.903690133472912e-08,
      "logps/chosen": -192.82557678222656,
      "logps/rejected": -246.595947265625,
      "loss": 0.3331,
      "rewards/chosen": 1.6345527172088623,
      "rewards/margins": 5.916933059692383,
      "rewards/rejected": -4.282380104064941,
      "step": 3217
    },
    {
      "epoch": 0.84,
      "grad_norm": 28.811220169067383,
      "kl": 0.0,
      "learning_rate": 7.890604553781732e-08,
      "logps/chosen": -160.36233520507812,
      "logps/rejected": -200.77005004882812,
      "loss": 0.235,
      "rewards/chosen": 1.1709307432174683,
      "rewards/margins": 4.160086631774902,
      "rewards/rejected": -2.9891560077667236,
      "step": 3218
    },
    {
      "epoch": 0.84,
      "grad_norm": 44.311737060546875,
      "kl": 0.0,
      "learning_rate": 7.877518974090553e-08,
      "logps/chosen": -139.6275634765625,
      "logps/rejected": -238.56863403320312,
      "loss": 0.3518,
      "rewards/chosen": 0.10801827907562256,
      "rewards/margins": 2.9018564224243164,
      "rewards/rejected": -2.7938380241394043,
      "step": 3219
    },
    {
      "epoch": 0.84,
      "grad_norm": 31.64383316040039,
      "kl": 0.0,
      "learning_rate": 7.864433394399371e-08,
      "logps/chosen": -140.96168518066406,
      "logps/rejected": -304.3995666503906,
      "loss": 0.2882,
      "rewards/chosen": 0.13699030876159668,
      "rewards/margins": 8.668340682983398,
      "rewards/rejected": -8.531350135803223,
      "step": 3220
    },
    {
      "epoch": 0.84,
      "grad_norm": 35.97218322753906,
      "kl": 0.0,
      "learning_rate": 7.851347814708191e-08,
      "logps/chosen": -193.40135192871094,
      "logps/rejected": -165.84324645996094,
      "loss": 0.292,
      "rewards/chosen": 2.2008094787597656,
      "rewards/margins": 4.323521614074707,
      "rewards/rejected": -2.1227123737335205,
      "step": 3221
    },
    {
      "epoch": 0.84,
      "grad_norm": 34.96503829956055,
      "kl": 0.0,
      "learning_rate": 7.838262235017012e-08,
      "logps/chosen": -293.71673583984375,
      "logps/rejected": -264.406982421875,
      "loss": 0.2792,
      "rewards/chosen": 1.4762459993362427,
      "rewards/margins": 5.221151828765869,
      "rewards/rejected": -3.744905948638916,
      "step": 3222
    },
    {
      "epoch": 0.84,
      "grad_norm": 32.15884017944336,
      "kl": 0.0,
      "learning_rate": 7.825176655325831e-08,
      "logps/chosen": -148.90626525878906,
      "logps/rejected": -291.37860107421875,
      "loss": 0.2933,
      "rewards/chosen": -0.11567753553390503,
      "rewards/margins": 3.3869001865386963,
      "rewards/rejected": -3.502577781677246,
      "step": 3223
    },
    {
      "epoch": 0.84,
      "grad_norm": 27.66660499572754,
      "kl": 0.0,
      "learning_rate": 7.81209107563465e-08,
      "logps/chosen": -199.25677490234375,
      "logps/rejected": -200.94052124023438,
      "loss": 0.2217,
      "rewards/chosen": 1.435728907585144,
      "rewards/margins": 5.693033218383789,
      "rewards/rejected": -4.2573041915893555,
      "step": 3224
    },
    {
      "epoch": 0.84,
      "grad_norm": 27.243793487548828,
      "kl": 0.0,
      "learning_rate": 7.799005495943469e-08,
      "logps/chosen": -270.1687927246094,
      "logps/rejected": -238.72988891601562,
      "loss": 0.1522,
      "rewards/chosen": 2.963019609451294,
      "rewards/margins": 6.563299655914307,
      "rewards/rejected": -3.6002800464630127,
      "step": 3225
    },
    {
      "epoch": 0.84,
      "grad_norm": 24.84229850769043,
      "kl": 0.0,
      "learning_rate": 7.78591991625229e-08,
      "logps/chosen": -196.7878875732422,
      "logps/rejected": -330.93072509765625,
      "loss": 0.233,
      "rewards/chosen": 2.8932948112487793,
      "rewards/margins": 6.86652946472168,
      "rewards/rejected": -3.9732346534729004,
      "step": 3226
    },
    {
      "epoch": 0.84,
      "grad_norm": 31.127796173095703,
      "kl": 0.0,
      "learning_rate": 7.77283433656111e-08,
      "logps/chosen": -198.16998291015625,
      "logps/rejected": -208.06605529785156,
      "loss": 0.2182,
      "rewards/chosen": -1.5927191972732544,
      "rewards/margins": 2.7119503021240234,
      "rewards/rejected": -4.304669380187988,
      "step": 3227
    },
    {
      "epoch": 0.84,
      "grad_norm": 36.46774673461914,
      "kl": 0.0,
      "learning_rate": 7.759748756869928e-08,
      "logps/chosen": -239.41961669921875,
      "logps/rejected": -234.93087768554688,
      "loss": 0.223,
      "rewards/chosen": 2.2043843269348145,
      "rewards/margins": 4.640742301940918,
      "rewards/rejected": -2.4363577365875244,
      "step": 3228
    },
    {
      "epoch": 0.85,
      "grad_norm": 35.70278549194336,
      "kl": 0.0,
      "learning_rate": 7.746663177178749e-08,
      "logps/chosen": -233.97518920898438,
      "logps/rejected": -235.1247100830078,
      "loss": 0.1729,
      "rewards/chosen": 2.1642262935638428,
      "rewards/margins": 4.9302520751953125,
      "rewards/rejected": -2.7660255432128906,
      "step": 3229
    },
    {
      "epoch": 0.85,
      "grad_norm": 33.79775619506836,
      "kl": 0.0,
      "learning_rate": 7.733577597487569e-08,
      "logps/chosen": -186.15023803710938,
      "logps/rejected": -248.98333740234375,
      "loss": 0.2755,
      "rewards/chosen": 0.7999600172042847,
      "rewards/margins": 5.275784969329834,
      "rewards/rejected": -4.47582483291626,
      "step": 3230
    },
    {
      "epoch": 0.85,
      "grad_norm": 30.210243225097656,
      "kl": 0.0,
      "learning_rate": 7.720492017796388e-08,
      "logps/chosen": -284.4582824707031,
      "logps/rejected": -281.6883544921875,
      "loss": 0.2504,
      "rewards/chosen": 1.0357283353805542,
      "rewards/margins": 5.805393218994141,
      "rewards/rejected": -4.769664764404297,
      "step": 3231
    },
    {
      "epoch": 0.85,
      "grad_norm": 27.326990127563477,
      "kl": 0.0,
      "learning_rate": 7.707406438105208e-08,
      "logps/chosen": -143.6712188720703,
      "logps/rejected": -282.7164306640625,
      "loss": 0.317,
      "rewards/chosen": 0.5529003739356995,
      "rewards/margins": 4.33873176574707,
      "rewards/rejected": -3.7858314514160156,
      "step": 3232
    },
    {
      "epoch": 0.85,
      "grad_norm": 35.79087448120117,
      "kl": 0.0,
      "learning_rate": 7.694320858414027e-08,
      "logps/chosen": -196.32203674316406,
      "logps/rejected": -246.3926544189453,
      "loss": 0.2145,
      "rewards/chosen": 1.676134467124939,
      "rewards/margins": 6.369762897491455,
      "rewards/rejected": -4.693628311157227,
      "step": 3233
    },
    {
      "epoch": 0.85,
      "grad_norm": 31.284025192260742,
      "kl": 0.0,
      "learning_rate": 7.681235278722847e-08,
      "logps/chosen": -155.36402893066406,
      "logps/rejected": -301.85418701171875,
      "loss": 0.3093,
      "rewards/chosen": 0.6385982036590576,
      "rewards/margins": 3.672419309616089,
      "rewards/rejected": -3.0338211059570312,
      "step": 3234
    },
    {
      "epoch": 0.85,
      "grad_norm": 24.24800682067871,
      "kl": 0.0,
      "learning_rate": 7.668149699031668e-08,
      "logps/chosen": -172.85191345214844,
      "logps/rejected": -227.68849182128906,
      "loss": 0.1207,
      "rewards/chosen": 3.319450616836548,
      "rewards/margins": 6.2267866134643555,
      "rewards/rejected": -2.9073362350463867,
      "step": 3235
    },
    {
      "epoch": 0.85,
      "grad_norm": 34.73143005371094,
      "kl": 0.0,
      "learning_rate": 7.655064119340486e-08,
      "logps/chosen": -130.93258666992188,
      "logps/rejected": -256.797119140625,
      "loss": 0.3582,
      "rewards/chosen": -0.7385777235031128,
      "rewards/margins": 4.073555946350098,
      "rewards/rejected": -4.8121337890625,
      "step": 3236
    },
    {
      "epoch": 0.85,
      "grad_norm": 38.271507263183594,
      "kl": 0.0,
      "learning_rate": 7.641978539649306e-08,
      "logps/chosen": -225.26898193359375,
      "logps/rejected": -152.4866180419922,
      "loss": 0.3053,
      "rewards/chosen": 2.025780439376831,
      "rewards/margins": 4.564108371734619,
      "rewards/rejected": -2.538327932357788,
      "step": 3237
    },
    {
      "epoch": 0.85,
      "grad_norm": 32.1751708984375,
      "kl": 0.0,
      "learning_rate": 7.628892959958127e-08,
      "logps/chosen": -260.02423095703125,
      "logps/rejected": -164.86642456054688,
      "loss": 0.2417,
      "rewards/chosen": -0.08353383094072342,
      "rewards/margins": 2.7393460273742676,
      "rewards/rejected": -2.8228797912597656,
      "step": 3238
    },
    {
      "epoch": 0.85,
      "grad_norm": 29.464027404785156,
      "kl": 0.0,
      "learning_rate": 7.615807380266945e-08,
      "logps/chosen": -215.1149139404297,
      "logps/rejected": -328.502197265625,
      "loss": 0.3162,
      "rewards/chosen": 1.711369514465332,
      "rewards/margins": 7.5639872550964355,
      "rewards/rejected": -5.8526177406311035,
      "step": 3239
    },
    {
      "epoch": 0.85,
      "grad_norm": 29.200777053833008,
      "kl": 0.0,
      "learning_rate": 7.602721800575765e-08,
      "logps/chosen": -279.73236083984375,
      "logps/rejected": -186.07730102539062,
      "loss": 0.2846,
      "rewards/chosen": -0.08167314529418945,
      "rewards/margins": 3.102032423019409,
      "rewards/rejected": -3.1837055683135986,
      "step": 3240
    },
    {
      "epoch": 0.85,
      "grad_norm": 20.829153060913086,
      "kl": 0.0,
      "learning_rate": 7.589636220884584e-08,
      "logps/chosen": -168.58822631835938,
      "logps/rejected": -271.2360534667969,
      "loss": 0.1802,
      "rewards/chosen": 2.281903028488159,
      "rewards/margins": 4.783234596252441,
      "rewards/rejected": -2.501331329345703,
      "step": 3241
    },
    {
      "epoch": 0.85,
      "grad_norm": 28.79102325439453,
      "kl": 0.0,
      "learning_rate": 7.576550641193405e-08,
      "logps/chosen": -145.00428771972656,
      "logps/rejected": -232.7147979736328,
      "loss": 0.1955,
      "rewards/chosen": 1.831176996231079,
      "rewards/margins": 5.964174270629883,
      "rewards/rejected": -4.132997035980225,
      "step": 3242
    },
    {
      "epoch": 0.85,
      "grad_norm": 45.27934265136719,
      "kl": 0.0,
      "learning_rate": 7.563465061502223e-08,
      "logps/chosen": -227.49392700195312,
      "logps/rejected": -241.43299865722656,
      "loss": 0.3037,
      "rewards/chosen": 0.29154136776924133,
      "rewards/margins": 2.1040217876434326,
      "rewards/rejected": -1.8124803304672241,
      "step": 3243
    },
    {
      "epoch": 0.85,
      "grad_norm": 38.81378936767578,
      "kl": 0.0,
      "learning_rate": 7.550379481811043e-08,
      "logps/chosen": -193.5105438232422,
      "logps/rejected": -267.61676025390625,
      "loss": 0.2176,
      "rewards/chosen": 1.1929868459701538,
      "rewards/margins": 4.8053669929504395,
      "rewards/rejected": -3.612380266189575,
      "step": 3244
    },
    {
      "epoch": 0.85,
      "grad_norm": 31.02093505859375,
      "kl": 0.0,
      "learning_rate": 7.537293902119864e-08,
      "logps/chosen": -164.437255859375,
      "logps/rejected": -237.98223876953125,
      "loss": 0.2863,
      "rewards/chosen": 0.6644800901412964,
      "rewards/margins": 3.926863670349121,
      "rewards/rejected": -3.262383460998535,
      "step": 3245
    },
    {
      "epoch": 0.85,
      "grad_norm": 31.176836013793945,
      "kl": 0.0,
      "learning_rate": 7.524208322428684e-08,
      "logps/chosen": -250.9615936279297,
      "logps/rejected": -214.7868194580078,
      "loss": 0.1822,
      "rewards/chosen": 1.7779932022094727,
      "rewards/margins": 6.728030681610107,
      "rewards/rejected": -4.950037479400635,
      "step": 3246
    },
    {
      "epoch": 0.85,
      "grad_norm": 26.141632080078125,
      "kl": 0.0,
      "learning_rate": 7.511122742737502e-08,
      "logps/chosen": -206.78175354003906,
      "logps/rejected": -313.44171142578125,
      "loss": 0.2266,
      "rewards/chosen": 1.8484094142913818,
      "rewards/margins": 7.116808891296387,
      "rewards/rejected": -5.268399238586426,
      "step": 3247
    },
    {
      "epoch": 0.85,
      "grad_norm": 40.501155853271484,
      "kl": 0.0,
      "learning_rate": 7.498037163046323e-08,
      "logps/chosen": -258.9842529296875,
      "logps/rejected": -315.1947021484375,
      "loss": 0.2318,
      "rewards/chosen": 1.151440978050232,
      "rewards/margins": 7.068854331970215,
      "rewards/rejected": -5.917413234710693,
      "step": 3248
    },
    {
      "epoch": 0.85,
      "grad_norm": 37.80188751220703,
      "kl": 0.0,
      "learning_rate": 7.484951583355142e-08,
      "logps/chosen": -222.5956268310547,
      "logps/rejected": -309.52630615234375,
      "loss": 0.2064,
      "rewards/chosen": 1.6434623003005981,
      "rewards/margins": 4.907730579376221,
      "rewards/rejected": -3.264268398284912,
      "step": 3249
    },
    {
      "epoch": 0.85,
      "grad_norm": 43.29213333129883,
      "kl": 0.0,
      "learning_rate": 7.471866003663962e-08,
      "logps/chosen": -209.0088348388672,
      "logps/rejected": -259.30157470703125,
      "loss": 0.2872,
      "rewards/chosen": 1.5399799346923828,
      "rewards/margins": 4.152887344360352,
      "rewards/rejected": -2.6129074096679688,
      "step": 3250
    },
    {
      "epoch": 0.85,
      "grad_norm": 35.9322624206543,
      "kl": 0.0,
      "learning_rate": 7.458780423972782e-08,
      "logps/chosen": -174.74900817871094,
      "logps/rejected": -288.7847900390625,
      "loss": 0.2254,
      "rewards/chosen": 1.1389139890670776,
      "rewards/margins": 5.006952285766602,
      "rewards/rejected": -3.8680381774902344,
      "step": 3251
    },
    {
      "epoch": 0.85,
      "grad_norm": 35.05708694458008,
      "kl": 0.0,
      "learning_rate": 7.445694844281601e-08,
      "logps/chosen": -212.56893920898438,
      "logps/rejected": -207.61000061035156,
      "loss": 0.2539,
      "rewards/chosen": 1.387547492980957,
      "rewards/margins": 4.24867057800293,
      "rewards/rejected": -2.8611228466033936,
      "step": 3252
    },
    {
      "epoch": 0.85,
      "grad_norm": 28.2816104888916,
      "kl": 0.0,
      "learning_rate": 7.432609264590421e-08,
      "logps/chosen": -194.02349853515625,
      "logps/rejected": -223.91067504882812,
      "loss": 0.2686,
      "rewards/chosen": 1.438894510269165,
      "rewards/margins": 4.563453197479248,
      "rewards/rejected": -3.124558687210083,
      "step": 3253
    },
    {
      "epoch": 0.85,
      "grad_norm": 38.05796432495117,
      "kl": 0.0,
      "learning_rate": 7.419523684899242e-08,
      "logps/chosen": -141.80540466308594,
      "logps/rejected": -262.2084045410156,
      "loss": 0.2656,
      "rewards/chosen": 0.646595299243927,
      "rewards/margins": 3.703841209411621,
      "rewards/rejected": -3.057245969772339,
      "step": 3254
    },
    {
      "epoch": 0.85,
      "grad_norm": 29.759531021118164,
      "kl": 0.0,
      "learning_rate": 7.40643810520806e-08,
      "logps/chosen": -231.3519744873047,
      "logps/rejected": -239.27000427246094,
      "loss": 0.1509,
      "rewards/chosen": 1.9064635038375854,
      "rewards/margins": 5.785111427307129,
      "rewards/rejected": -3.878648042678833,
      "step": 3255
    },
    {
      "epoch": 0.85,
      "grad_norm": 31.95449447631836,
      "kl": 0.0,
      "learning_rate": 7.39335252551688e-08,
      "logps/chosen": -179.57452392578125,
      "logps/rejected": -195.614501953125,
      "loss": 0.2422,
      "rewards/chosen": 0.5285756587982178,
      "rewards/margins": 3.189023733139038,
      "rewards/rejected": -2.6604480743408203,
      "step": 3256
    },
    {
      "epoch": 0.85,
      "grad_norm": 33.531883239746094,
      "kl": 0.0,
      "learning_rate": 7.3802669458257e-08,
      "logps/chosen": -145.7747039794922,
      "logps/rejected": -197.13243103027344,
      "loss": 0.1107,
      "rewards/chosen": 2.2670633792877197,
      "rewards/margins": 5.13998556137085,
      "rewards/rejected": -2.87292218208313,
      "step": 3257
    },
    {
      "epoch": 0.85,
      "grad_norm": 28.455995559692383,
      "kl": 0.0,
      "learning_rate": 7.36718136613452e-08,
      "logps/chosen": -203.78224182128906,
      "logps/rejected": -243.5115203857422,
      "loss": 0.2562,
      "rewards/chosen": 1.287809133529663,
      "rewards/margins": 4.950334548950195,
      "rewards/rejected": -3.6625256538391113,
      "step": 3258
    },
    {
      "epoch": 0.85,
      "grad_norm": 37.13747787475586,
      "kl": 0.0,
      "learning_rate": 7.354095786443339e-08,
      "logps/chosen": -271.0595397949219,
      "logps/rejected": -266.4194030761719,
      "loss": 0.2399,
      "rewards/chosen": 0.8473371863365173,
      "rewards/margins": 4.8914594650268555,
      "rewards/rejected": -4.044122219085693,
      "step": 3259
    },
    {
      "epoch": 0.85,
      "grad_norm": 40.32930374145508,
      "kl": 0.0,
      "learning_rate": 7.341010206752158e-08,
      "logps/chosen": -184.6184844970703,
      "logps/rejected": -306.64208984375,
      "loss": 0.204,
      "rewards/chosen": 2.8387949466705322,
      "rewards/margins": 6.3892598152160645,
      "rewards/rejected": -3.5504648685455322,
      "step": 3260
    },
    {
      "epoch": 0.85,
      "grad_norm": 33.87854766845703,
      "kl": 0.0,
      "learning_rate": 7.327924627060979e-08,
      "logps/chosen": -162.88327026367188,
      "logps/rejected": -258.2253723144531,
      "loss": 0.2213,
      "rewards/chosen": -1.2969216108322144,
      "rewards/margins": 1.792932391166687,
      "rewards/rejected": -3.0898540019989014,
      "step": 3261
    },
    {
      "epoch": 0.85,
      "grad_norm": 34.421268463134766,
      "kl": 0.0,
      "learning_rate": 7.314839047369799e-08,
      "logps/chosen": -165.06387329101562,
      "logps/rejected": -290.7898254394531,
      "loss": 0.26,
      "rewards/chosen": 0.41494685411453247,
      "rewards/margins": 3.6931369304656982,
      "rewards/rejected": -3.2781901359558105,
      "step": 3262
    },
    {
      "epoch": 0.85,
      "grad_norm": 26.321603775024414,
      "kl": 0.0,
      "learning_rate": 7.301753467678617e-08,
      "logps/chosen": -157.8191680908203,
      "logps/rejected": -317.366943359375,
      "loss": 0.2229,
      "rewards/chosen": 1.980485439300537,
      "rewards/margins": 7.327960968017578,
      "rewards/rejected": -5.347475528717041,
      "step": 3263
    },
    {
      "epoch": 0.85,
      "grad_norm": 34.1356086730957,
      "kl": 0.0,
      "learning_rate": 7.288667887987438e-08,
      "logps/chosen": -121.9466552734375,
      "logps/rejected": -367.5460510253906,
      "loss": 0.195,
      "rewards/chosen": 0.6263151168823242,
      "rewards/margins": 9.286596298217773,
      "rewards/rejected": -8.66028118133545,
      "step": 3264
    },
    {
      "epoch": 0.85,
      "grad_norm": 29.905941009521484,
      "kl": 0.0,
      "learning_rate": 7.275582308296258e-08,
      "logps/chosen": -166.04644775390625,
      "logps/rejected": -160.56764221191406,
      "loss": 0.1754,
      "rewards/chosen": 1.7765103578567505,
      "rewards/margins": 5.152238368988037,
      "rewards/rejected": -3.375727891921997,
      "step": 3265
    },
    {
      "epoch": 0.85,
      "grad_norm": 31.1649169921875,
      "kl": 0.0,
      "learning_rate": 7.262496728605077e-08,
      "logps/chosen": -232.14187622070312,
      "logps/rejected": -367.91448974609375,
      "loss": 0.2476,
      "rewards/chosen": 0.11659705638885498,
      "rewards/margins": 4.91973876953125,
      "rewards/rejected": -4.8031415939331055,
      "step": 3266
    },
    {
      "epoch": 0.86,
      "grad_norm": 30.761837005615234,
      "kl": 0.0,
      "learning_rate": 7.249411148913897e-08,
      "logps/chosen": -301.6856994628906,
      "logps/rejected": -214.73681640625,
      "loss": 0.1979,
      "rewards/chosen": 1.8630826473236084,
      "rewards/margins": 4.209121227264404,
      "rewards/rejected": -2.346038579940796,
      "step": 3267
    },
    {
      "epoch": 0.86,
      "grad_norm": 33.79634094238281,
      "kl": 0.0,
      "learning_rate": 7.236325569222716e-08,
      "logps/chosen": -221.21762084960938,
      "logps/rejected": -207.97463989257812,
      "loss": 0.2969,
      "rewards/chosen": 0.7912971377372742,
      "rewards/margins": 6.043848514556885,
      "rewards/rejected": -5.252551555633545,
      "step": 3268
    },
    {
      "epoch": 0.86,
      "grad_norm": 26.53761863708496,
      "kl": 0.0,
      "learning_rate": 7.223239989531536e-08,
      "logps/chosen": -295.77362060546875,
      "logps/rejected": -300.87371826171875,
      "loss": 0.1873,
      "rewards/chosen": 4.822590351104736,
      "rewards/margins": 8.884696960449219,
      "rewards/rejected": -4.062106609344482,
      "step": 3269
    },
    {
      "epoch": 0.86,
      "grad_norm": 30.040515899658203,
      "kl": 0.0,
      "learning_rate": 7.210154409840356e-08,
      "logps/chosen": -157.37391662597656,
      "logps/rejected": -169.13473510742188,
      "loss": 0.2455,
      "rewards/chosen": 0.8005948066711426,
      "rewards/margins": 3.58113694190979,
      "rewards/rejected": -2.7805421352386475,
      "step": 3270
    },
    {
      "epoch": 0.86,
      "grad_norm": 39.841590881347656,
      "kl": 0.0,
      "learning_rate": 7.197068830149175e-08,
      "logps/chosen": -207.9867706298828,
      "logps/rejected": -267.37017822265625,
      "loss": 0.3974,
      "rewards/chosen": 0.5954592227935791,
      "rewards/margins": 2.9090476036071777,
      "rewards/rejected": -2.3135883808135986,
      "step": 3271
    },
    {
      "epoch": 0.86,
      "grad_norm": 26.90143585205078,
      "kl": 0.0,
      "learning_rate": 7.183983250457995e-08,
      "logps/chosen": -143.56216430664062,
      "logps/rejected": -245.6182403564453,
      "loss": 0.2192,
      "rewards/chosen": 0.7064897418022156,
      "rewards/margins": 5.181339740753174,
      "rewards/rejected": -4.474850177764893,
      "step": 3272
    },
    {
      "epoch": 0.86,
      "grad_norm": 39.29250717163086,
      "kl": 0.0,
      "learning_rate": 7.170897670766814e-08,
      "logps/chosen": -231.05426025390625,
      "logps/rejected": -326.5909118652344,
      "loss": 0.353,
      "rewards/chosen": -0.5797771215438843,
      "rewards/margins": 3.835221767425537,
      "rewards/rejected": -4.414999008178711,
      "step": 3273
    },
    {
      "epoch": 0.86,
      "grad_norm": 26.03618621826172,
      "kl": 0.0,
      "learning_rate": 7.157812091075634e-08,
      "logps/chosen": -222.4735870361328,
      "logps/rejected": -200.89833068847656,
      "loss": 0.3105,
      "rewards/chosen": -0.4272328019142151,
      "rewards/margins": 2.4853272438049316,
      "rewards/rejected": -2.912559986114502,
      "step": 3274
    },
    {
      "epoch": 0.86,
      "grad_norm": 32.362457275390625,
      "kl": 0.0,
      "learning_rate": 7.144726511384454e-08,
      "logps/chosen": -306.9471130371094,
      "logps/rejected": -284.0693359375,
      "loss": 0.23,
      "rewards/chosen": 0.10507598519325256,
      "rewards/margins": 3.6749560832977295,
      "rewards/rejected": -3.5698800086975098,
      "step": 3275
    },
    {
      "epoch": 0.86,
      "grad_norm": 22.9937744140625,
      "kl": 0.0,
      "learning_rate": 7.131640931693273e-08,
      "logps/chosen": -204.49484252929688,
      "logps/rejected": -254.06138610839844,
      "loss": 0.2437,
      "rewards/chosen": 0.7712520360946655,
      "rewards/margins": 5.286528587341309,
      "rewards/rejected": -4.5152764320373535,
      "step": 3276
    },
    {
      "epoch": 0.86,
      "grad_norm": 39.88124465942383,
      "kl": 0.0,
      "learning_rate": 7.118555352002094e-08,
      "logps/chosen": -245.06753540039062,
      "logps/rejected": -268.346923828125,
      "loss": 0.2285,
      "rewards/chosen": 2.043609142303467,
      "rewards/margins": 6.61504602432251,
      "rewards/rejected": -4.571436882019043,
      "step": 3277
    },
    {
      "epoch": 0.86,
      "grad_norm": 36.58185577392578,
      "kl": 0.0,
      "learning_rate": 7.105469772310912e-08,
      "logps/chosen": -263.1695251464844,
      "logps/rejected": -236.58145141601562,
      "loss": 0.1717,
      "rewards/chosen": 0.9724127650260925,
      "rewards/margins": 6.123079776763916,
      "rewards/rejected": -5.150667190551758,
      "step": 3278
    },
    {
      "epoch": 0.86,
      "grad_norm": 37.39385986328125,
      "kl": 0.0,
      "learning_rate": 7.092384192619732e-08,
      "logps/chosen": -216.4535369873047,
      "logps/rejected": -157.31016540527344,
      "loss": 0.1617,
      "rewards/chosen": 2.5863406658172607,
      "rewards/margins": 5.75681209564209,
      "rewards/rejected": -3.17047119140625,
      "step": 3279
    },
    {
      "epoch": 0.86,
      "grad_norm": 33.63765335083008,
      "kl": 0.0,
      "learning_rate": 7.079298612928553e-08,
      "logps/chosen": -185.89892578125,
      "logps/rejected": -231.67864990234375,
      "loss": 0.3398,
      "rewards/chosen": 0.6431490778923035,
      "rewards/margins": 4.488551616668701,
      "rewards/rejected": -3.845402717590332,
      "step": 3280
    },
    {
      "epoch": 0.86,
      "grad_norm": 37.0649528503418,
      "kl": 0.0,
      "learning_rate": 7.066213033237373e-08,
      "logps/chosen": -154.55319213867188,
      "logps/rejected": -238.2505340576172,
      "loss": 0.3235,
      "rewards/chosen": 0.45910343527793884,
      "rewards/margins": 4.324784278869629,
      "rewards/rejected": -3.8656809329986572,
      "step": 3281
    },
    {
      "epoch": 0.86,
      "grad_norm": 35.96867370605469,
      "kl": 0.0,
      "learning_rate": 7.053127453546191e-08,
      "logps/chosen": -263.711181640625,
      "logps/rejected": -323.4737243652344,
      "loss": 0.1867,
      "rewards/chosen": 1.104454755783081,
      "rewards/margins": 6.848848342895508,
      "rewards/rejected": -5.744393825531006,
      "step": 3282
    },
    {
      "epoch": 0.86,
      "grad_norm": 39.129539489746094,
      "kl": 0.0,
      "learning_rate": 7.040041873855012e-08,
      "logps/chosen": -151.68955993652344,
      "logps/rejected": -177.42762756347656,
      "loss": 0.2207,
      "rewards/chosen": 2.1430304050445557,
      "rewards/margins": 5.147785663604736,
      "rewards/rejected": -3.0047552585601807,
      "step": 3283
    },
    {
      "epoch": 0.86,
      "grad_norm": 28.86467742919922,
      "kl": 0.0,
      "learning_rate": 7.026956294163831e-08,
      "logps/chosen": -250.60304260253906,
      "logps/rejected": -249.148681640625,
      "loss": 0.1399,
      "rewards/chosen": 2.5337722301483154,
      "rewards/margins": 6.411539077758789,
      "rewards/rejected": -3.8777670860290527,
      "step": 3284
    },
    {
      "epoch": 0.86,
      "grad_norm": 32.93992233276367,
      "kl": 0.0,
      "learning_rate": 7.013870714472651e-08,
      "logps/chosen": -203.44247436523438,
      "logps/rejected": -197.58848571777344,
      "loss": 0.201,
      "rewards/chosen": 0.29153192043304443,
      "rewards/margins": 5.219209671020508,
      "rewards/rejected": -4.927677631378174,
      "step": 3285
    },
    {
      "epoch": 0.86,
      "grad_norm": 25.979219436645508,
      "kl": 0.0,
      "learning_rate": 7.000785134781471e-08,
      "logps/chosen": -167.48854064941406,
      "logps/rejected": -310.9127197265625,
      "loss": 0.1946,
      "rewards/chosen": 1.70915949344635,
      "rewards/margins": 6.76434850692749,
      "rewards/rejected": -5.05518913269043,
      "step": 3286
    },
    {
      "epoch": 0.86,
      "grad_norm": 25.11945343017578,
      "kl": 0.0,
      "learning_rate": 6.98769955509029e-08,
      "logps/chosen": -223.0096893310547,
      "logps/rejected": -205.82083129882812,
      "loss": 0.2614,
      "rewards/chosen": 1.862809419631958,
      "rewards/margins": 6.054323196411133,
      "rewards/rejected": -4.191514015197754,
      "step": 3287
    },
    {
      "epoch": 0.86,
      "grad_norm": 42.99819564819336,
      "kl": 0.0,
      "learning_rate": 6.97461397539911e-08,
      "logps/chosen": -249.3183135986328,
      "logps/rejected": -287.7580261230469,
      "loss": 0.2844,
      "rewards/chosen": -0.017606837674975395,
      "rewards/margins": 3.4193594455718994,
      "rewards/rejected": -3.4369661808013916,
      "step": 3288
    },
    {
      "epoch": 0.86,
      "grad_norm": 28.284883499145508,
      "kl": 0.0,
      "learning_rate": 6.961528395707931e-08,
      "logps/chosen": -209.82473754882812,
      "logps/rejected": -249.59402465820312,
      "loss": 0.1452,
      "rewards/chosen": 1.9604130983352661,
      "rewards/margins": 7.294702053070068,
      "rewards/rejected": -5.334289073944092,
      "step": 3289
    },
    {
      "epoch": 0.86,
      "grad_norm": 30.614185333251953,
      "kl": 0.0,
      "learning_rate": 6.948442816016749e-08,
      "logps/chosen": -171.9506072998047,
      "logps/rejected": -178.01907348632812,
      "loss": 0.2271,
      "rewards/chosen": 1.5359193086624146,
      "rewards/margins": 4.613659381866455,
      "rewards/rejected": -3.07774019241333,
      "step": 3290
    },
    {
      "epoch": 0.86,
      "grad_norm": 41.53820037841797,
      "kl": 0.0,
      "learning_rate": 6.935357236325569e-08,
      "logps/chosen": -209.81532287597656,
      "logps/rejected": -180.5158233642578,
      "loss": 0.244,
      "rewards/chosen": 1.3790901899337769,
      "rewards/margins": 4.9010844230651855,
      "rewards/rejected": -3.5219943523406982,
      "step": 3291
    },
    {
      "epoch": 0.86,
      "grad_norm": 40.008262634277344,
      "kl": 0.0,
      "learning_rate": 6.922271656634388e-08,
      "logps/chosen": -97.14312744140625,
      "logps/rejected": -326.3070373535156,
      "loss": 0.2114,
      "rewards/chosen": 1.234908938407898,
      "rewards/margins": 5.508755207061768,
      "rewards/rejected": -4.27384614944458,
      "step": 3292
    },
    {
      "epoch": 0.86,
      "grad_norm": 35.759037017822266,
      "kl": 0.0,
      "learning_rate": 6.909186076943209e-08,
      "logps/chosen": -243.43714904785156,
      "logps/rejected": -278.0218200683594,
      "loss": 0.1198,
      "rewards/chosen": 2.5602760314941406,
      "rewards/margins": 5.5889482498168945,
      "rewards/rejected": -3.028672456741333,
      "step": 3293
    },
    {
      "epoch": 0.86,
      "grad_norm": 37.18498229980469,
      "kl": 0.0,
      "learning_rate": 6.896100497252028e-08,
      "logps/chosen": -257.5518493652344,
      "logps/rejected": -204.18756103515625,
      "loss": 0.2803,
      "rewards/chosen": 0.5463675260543823,
      "rewards/margins": 4.813793182373047,
      "rewards/rejected": -4.267425537109375,
      "step": 3294
    },
    {
      "epoch": 0.86,
      "grad_norm": 25.23471450805664,
      "kl": 0.0,
      "learning_rate": 6.883014917560847e-08,
      "logps/chosen": -147.47364807128906,
      "logps/rejected": -310.4903259277344,
      "loss": 0.1438,
      "rewards/chosen": 1.7297723293304443,
      "rewards/margins": 6.705340385437012,
      "rewards/rejected": -4.9755682945251465,
      "step": 3295
    },
    {
      "epoch": 0.86,
      "grad_norm": 31.479604721069336,
      "kl": 0.0,
      "learning_rate": 6.869929337869668e-08,
      "logps/chosen": -287.01898193359375,
      "logps/rejected": -258.6340026855469,
      "loss": 0.184,
      "rewards/chosen": 1.9719114303588867,
      "rewards/margins": 6.917535781860352,
      "rewards/rejected": -4.945624351501465,
      "step": 3296
    },
    {
      "epoch": 0.86,
      "grad_norm": 39.46041488647461,
      "kl": 0.0,
      "learning_rate": 6.856843758178486e-08,
      "logps/chosen": -157.99221801757812,
      "logps/rejected": -350.6339416503906,
      "loss": 0.2945,
      "rewards/chosen": -0.05334752798080444,
      "rewards/margins": 2.8886020183563232,
      "rewards/rejected": -2.9419496059417725,
      "step": 3297
    },
    {
      "epoch": 0.86,
      "grad_norm": 27.979738235473633,
      "kl": 0.0,
      "learning_rate": 6.843758178487306e-08,
      "logps/chosen": -275.239990234375,
      "logps/rejected": -249.71923828125,
      "loss": 0.2181,
      "rewards/chosen": 1.6366877555847168,
      "rewards/margins": 5.247501850128174,
      "rewards/rejected": -3.610814094543457,
      "step": 3298
    },
    {
      "epoch": 0.86,
      "grad_norm": 29.67488670349121,
      "kl": 0.0,
      "learning_rate": 6.830672598796127e-08,
      "logps/chosen": -181.911865234375,
      "logps/rejected": -250.30233764648438,
      "loss": 0.2572,
      "rewards/chosen": 2.5218210220336914,
      "rewards/margins": 5.756906032562256,
      "rewards/rejected": -3.2350850105285645,
      "step": 3299
    },
    {
      "epoch": 0.86,
      "grad_norm": 42.35227966308594,
      "kl": 0.0,
      "learning_rate": 6.817587019104947e-08,
      "logps/chosen": -224.5271453857422,
      "logps/rejected": -224.19619750976562,
      "loss": 0.3821,
      "rewards/chosen": 0.25506705045700073,
      "rewards/margins": 3.4965734481811523,
      "rewards/rejected": -3.241506338119507,
      "step": 3300
    },
    {
      "epoch": 0.86,
      "grad_norm": 38.902870178222656,
      "kl": 0.0,
      "learning_rate": 6.804501439413765e-08,
      "logps/chosen": -217.0912322998047,
      "logps/rejected": -236.94139099121094,
      "loss": 0.3333,
      "rewards/chosen": 0.8560259938240051,
      "rewards/margins": 3.4678268432617188,
      "rewards/rejected": -2.6118009090423584,
      "step": 3301
    },
    {
      "epoch": 0.86,
      "grad_norm": 29.101722717285156,
      "kl": 0.0,
      "learning_rate": 6.791415859722586e-08,
      "logps/chosen": -188.09799194335938,
      "logps/rejected": -235.2723846435547,
      "loss": 0.1716,
      "rewards/chosen": 2.8118784427642822,
      "rewards/margins": 6.987157821655273,
      "rewards/rejected": -4.175279140472412,
      "step": 3302
    },
    {
      "epoch": 0.86,
      "grad_norm": 53.44160079956055,
      "kl": 0.0,
      "learning_rate": 6.778330280031405e-08,
      "logps/chosen": -146.15121459960938,
      "logps/rejected": -212.4782257080078,
      "loss": 0.2992,
      "rewards/chosen": 0.34400415420532227,
      "rewards/margins": 2.6334946155548096,
      "rewards/rejected": -2.2894904613494873,
      "step": 3303
    },
    {
      "epoch": 0.86,
      "grad_norm": 33.913021087646484,
      "kl": 0.0,
      "learning_rate": 6.765244700340225e-08,
      "logps/chosen": -233.7433624267578,
      "logps/rejected": -230.5489959716797,
      "loss": 0.1883,
      "rewards/chosen": 1.3600634336471558,
      "rewards/margins": 5.133440971374512,
      "rewards/rejected": -3.7733774185180664,
      "step": 3304
    },
    {
      "epoch": 0.86,
      "grad_norm": 37.881309509277344,
      "kl": 0.0,
      "learning_rate": 6.752159120649043e-08,
      "logps/chosen": -187.03089904785156,
      "logps/rejected": -321.6445617675781,
      "loss": 0.3124,
      "rewards/chosen": 0.576844334602356,
      "rewards/margins": 4.5928730964660645,
      "rewards/rejected": -4.016028881072998,
      "step": 3305
    },
    {
      "epoch": 0.87,
      "grad_norm": 28.14802360534668,
      "kl": 0.0,
      "learning_rate": 6.739073540957864e-08,
      "logps/chosen": -179.93328857421875,
      "logps/rejected": -330.87994384765625,
      "loss": 0.2017,
      "rewards/chosen": 1.1859331130981445,
      "rewards/margins": 6.022317886352539,
      "rewards/rejected": -4.8363847732543945,
      "step": 3306
    },
    {
      "epoch": 0.87,
      "grad_norm": 30.64641761779785,
      "kl": 0.0,
      "learning_rate": 6.725987961266684e-08,
      "logps/chosen": -189.5594024658203,
      "logps/rejected": -235.72720336914062,
      "loss": 0.1842,
      "rewards/chosen": -0.34744271636009216,
      "rewards/margins": 3.385176420211792,
      "rewards/rejected": -3.732619047164917,
      "step": 3307
    },
    {
      "epoch": 0.87,
      "grad_norm": 35.5704460144043,
      "kl": 0.0,
      "learning_rate": 6.712902381575503e-08,
      "logps/chosen": -184.53591918945312,
      "logps/rejected": -285.756591796875,
      "loss": 0.235,
      "rewards/chosen": 0.026567867025732994,
      "rewards/margins": 5.050618648529053,
      "rewards/rejected": -5.024050712585449,
      "step": 3308
    },
    {
      "epoch": 0.87,
      "grad_norm": 25.643348693847656,
      "kl": 0.0,
      "learning_rate": 6.699816801884323e-08,
      "logps/chosen": -160.59278869628906,
      "logps/rejected": -255.24618530273438,
      "loss": 0.1446,
      "rewards/chosen": 1.3220984935760498,
      "rewards/margins": 6.726505279541016,
      "rewards/rejected": -5.404406547546387,
      "step": 3309
    },
    {
      "epoch": 0.87,
      "grad_norm": 25.281702041625977,
      "kl": 0.0,
      "learning_rate": 6.686731222193143e-08,
      "logps/chosen": -208.46261596679688,
      "logps/rejected": -263.46209716796875,
      "loss": 0.2685,
      "rewards/chosen": 0.5003941059112549,
      "rewards/margins": 3.899710178375244,
      "rewards/rejected": -3.3993160724639893,
      "step": 3310
    },
    {
      "epoch": 0.87,
      "grad_norm": 37.242706298828125,
      "kl": 0.0,
      "learning_rate": 6.673645642501962e-08,
      "logps/chosen": -240.91317749023438,
      "logps/rejected": -254.6995849609375,
      "loss": 0.2309,
      "rewards/chosen": 2.887584924697876,
      "rewards/margins": 5.6153669357299805,
      "rewards/rejected": -2.7277822494506836,
      "step": 3311
    },
    {
      "epoch": 0.87,
      "grad_norm": 58.118492126464844,
      "kl": 0.0,
      "learning_rate": 6.660560062810783e-08,
      "logps/chosen": -233.1113739013672,
      "logps/rejected": -321.74786376953125,
      "loss": 0.2546,
      "rewards/chosen": 1.2985846996307373,
      "rewards/margins": 5.554841995239258,
      "rewards/rejected": -4.256257057189941,
      "step": 3312
    },
    {
      "epoch": 0.87,
      "grad_norm": 41.69536590576172,
      "kl": 0.0,
      "learning_rate": 6.647474483119601e-08,
      "logps/chosen": -250.2501678466797,
      "logps/rejected": -202.795654296875,
      "loss": 0.306,
      "rewards/chosen": 0.21787656843662262,
      "rewards/margins": 3.9046685695648193,
      "rewards/rejected": -3.6867918968200684,
      "step": 3313
    },
    {
      "epoch": 0.87,
      "grad_norm": 29.167028427124023,
      "kl": 0.0,
      "learning_rate": 6.634388903428421e-08,
      "logps/chosen": -247.29339599609375,
      "logps/rejected": -244.97125244140625,
      "loss": 0.2146,
      "rewards/chosen": 1.6185340881347656,
      "rewards/margins": 4.635621070861816,
      "rewards/rejected": -3.017086982727051,
      "step": 3314
    },
    {
      "epoch": 0.87,
      "grad_norm": 31.391908645629883,
      "kl": 0.0,
      "learning_rate": 6.621303323737242e-08,
      "logps/chosen": -205.63795471191406,
      "logps/rejected": -143.65188598632812,
      "loss": 0.1912,
      "rewards/chosen": 3.0524964332580566,
      "rewards/margins": 6.475236892700195,
      "rewards/rejected": -3.4227406978607178,
      "step": 3315
    },
    {
      "epoch": 0.87,
      "grad_norm": 32.993507385253906,
      "kl": 0.0,
      "learning_rate": 6.608217744046062e-08,
      "logps/chosen": -227.3526611328125,
      "logps/rejected": -240.81625366210938,
      "loss": 0.1422,
      "rewards/chosen": 2.2260305881500244,
      "rewards/margins": 5.8847575187683105,
      "rewards/rejected": -3.658726930618286,
      "step": 3316
    },
    {
      "epoch": 0.87,
      "grad_norm": 49.290889739990234,
      "kl": 0.0,
      "learning_rate": 6.59513216435488e-08,
      "logps/chosen": -175.3079376220703,
      "logps/rejected": -264.86151123046875,
      "loss": 0.2859,
      "rewards/chosen": 0.9824588894844055,
      "rewards/margins": 4.101996898651123,
      "rewards/rejected": -3.1195380687713623,
      "step": 3317
    },
    {
      "epoch": 0.87,
      "grad_norm": 29.216691970825195,
      "kl": 0.0,
      "learning_rate": 6.582046584663701e-08,
      "logps/chosen": -156.804443359375,
      "logps/rejected": -283.55572509765625,
      "loss": 0.3219,
      "rewards/chosen": 1.2649195194244385,
      "rewards/margins": 2.263371467590332,
      "rewards/rejected": -0.9984518885612488,
      "step": 3318
    },
    {
      "epoch": 0.87,
      "grad_norm": 29.11150550842285,
      "kl": 0.0,
      "learning_rate": 6.56896100497252e-08,
      "logps/chosen": -234.84364318847656,
      "logps/rejected": -303.1112060546875,
      "loss": 0.1699,
      "rewards/chosen": 1.790556788444519,
      "rewards/margins": 6.315253257751465,
      "rewards/rejected": -4.524696350097656,
      "step": 3319
    },
    {
      "epoch": 0.87,
      "grad_norm": 33.973350524902344,
      "kl": 0.0,
      "learning_rate": 6.55587542528134e-08,
      "logps/chosen": -187.39263916015625,
      "logps/rejected": -275.7961120605469,
      "loss": 0.3065,
      "rewards/chosen": 1.2405842542648315,
      "rewards/margins": 6.566118240356445,
      "rewards/rejected": -5.325533866882324,
      "step": 3320
    },
    {
      "epoch": 0.87,
      "grad_norm": 42.52359390258789,
      "kl": 0.0,
      "learning_rate": 6.542789845590158e-08,
      "logps/chosen": -132.62879943847656,
      "logps/rejected": -302.57452392578125,
      "loss": 0.2687,
      "rewards/chosen": 0.5341070890426636,
      "rewards/margins": 5.116555690765381,
      "rewards/rejected": -4.582448482513428,
      "step": 3321
    },
    {
      "epoch": 0.87,
      "grad_norm": 30.272220611572266,
      "kl": 0.0,
      "learning_rate": 6.529704265898979e-08,
      "logps/chosen": -213.6365966796875,
      "logps/rejected": -249.19174194335938,
      "loss": 0.2952,
      "rewards/chosen": 0.25924426317214966,
      "rewards/margins": 3.283485174179077,
      "rewards/rejected": -3.0242409706115723,
      "step": 3322
    },
    {
      "epoch": 0.87,
      "grad_norm": 24.300634384155273,
      "kl": 0.0,
      "learning_rate": 6.516618686207799e-08,
      "logps/chosen": -166.66082763671875,
      "logps/rejected": -324.5031433105469,
      "loss": 0.2112,
      "rewards/chosen": 1.5318686962127686,
      "rewards/margins": 5.843268394470215,
      "rewards/rejected": -4.311399936676025,
      "step": 3323
    },
    {
      "epoch": 0.87,
      "grad_norm": 33.69001770019531,
      "kl": 0.0,
      "learning_rate": 6.503533106516618e-08,
      "logps/chosen": -290.857421875,
      "logps/rejected": -209.4654083251953,
      "loss": 0.2298,
      "rewards/chosen": 0.6444933414459229,
      "rewards/margins": 4.194701194763184,
      "rewards/rejected": -3.5502076148986816,
      "step": 3324
    },
    {
      "epoch": 0.87,
      "grad_norm": 39.34164810180664,
      "kl": 0.0,
      "learning_rate": 6.490447526825438e-08,
      "logps/chosen": -179.45948791503906,
      "logps/rejected": -268.68505859375,
      "loss": 0.335,
      "rewards/chosen": -0.48030078411102295,
      "rewards/margins": 1.7185760736465454,
      "rewards/rejected": -2.1988768577575684,
      "step": 3325
    },
    {
      "epoch": 0.87,
      "grad_norm": 37.93251037597656,
      "kl": 0.0,
      "learning_rate": 6.477361947134258e-08,
      "logps/chosen": -226.54959106445312,
      "logps/rejected": -286.8560485839844,
      "loss": 0.2178,
      "rewards/chosen": 1.1966824531555176,
      "rewards/margins": 5.566013813018799,
      "rewards/rejected": -4.369331359863281,
      "step": 3326
    },
    {
      "epoch": 0.87,
      "grad_norm": 31.69426155090332,
      "kl": 0.0,
      "learning_rate": 6.464276367443077e-08,
      "logps/chosen": -254.9532470703125,
      "logps/rejected": -292.90997314453125,
      "loss": 0.3137,
      "rewards/chosen": 0.43578511476516724,
      "rewards/margins": 3.768958330154419,
      "rewards/rejected": -3.3331732749938965,
      "step": 3327
    },
    {
      "epoch": 0.87,
      "grad_norm": 38.56787109375,
      "kl": 0.0,
      "learning_rate": 6.451190787751897e-08,
      "logps/chosen": -229.85284423828125,
      "logps/rejected": -233.19021606445312,
      "loss": 0.2724,
      "rewards/chosen": 0.6514629125595093,
      "rewards/margins": 4.2708420753479,
      "rewards/rejected": -3.6193790435791016,
      "step": 3328
    },
    {
      "epoch": 0.87,
      "grad_norm": 24.048837661743164,
      "kl": 0.0,
      "learning_rate": 6.438105208060717e-08,
      "logps/chosen": -177.91517639160156,
      "logps/rejected": -227.61233520507812,
      "loss": 0.195,
      "rewards/chosen": 1.9322575330734253,
      "rewards/margins": 5.3044915199279785,
      "rewards/rejected": -3.3722338676452637,
      "step": 3329
    },
    {
      "epoch": 0.87,
      "grad_norm": 24.384130477905273,
      "kl": 0.0,
      "learning_rate": 6.425019628369536e-08,
      "logps/chosen": -123.97366333007812,
      "logps/rejected": -229.6993865966797,
      "loss": 0.2558,
      "rewards/chosen": 0.7002733945846558,
      "rewards/margins": 3.6762170791625977,
      "rewards/rejected": -2.9759435653686523,
      "step": 3330
    },
    {
      "epoch": 0.87,
      "grad_norm": 35.15760803222656,
      "kl": 0.0,
      "learning_rate": 6.411934048678357e-08,
      "logps/chosen": -119.36579132080078,
      "logps/rejected": -305.31610107421875,
      "loss": 0.1591,
      "rewards/chosen": 2.24487566947937,
      "rewards/margins": 6.488380432128906,
      "rewards/rejected": -4.243504524230957,
      "step": 3331
    },
    {
      "epoch": 0.87,
      "grad_norm": 31.893068313598633,
      "kl": 0.0,
      "learning_rate": 6.398848468987175e-08,
      "logps/chosen": -143.27931213378906,
      "logps/rejected": -316.27020263671875,
      "loss": 0.1997,
      "rewards/chosen": 0.03290824219584465,
      "rewards/margins": 4.4567461013793945,
      "rewards/rejected": -4.423837661743164,
      "step": 3332
    },
    {
      "epoch": 0.87,
      "grad_norm": 29.552108764648438,
      "kl": 0.0,
      "learning_rate": 6.385762889295995e-08,
      "logps/chosen": -142.15072631835938,
      "logps/rejected": -240.77963256835938,
      "loss": 0.243,
      "rewards/chosen": 1.4211618900299072,
      "rewards/margins": 4.207136154174805,
      "rewards/rejected": -2.7859745025634766,
      "step": 3333
    },
    {
      "epoch": 0.87,
      "grad_norm": 48.456520080566406,
      "kl": 0.0,
      "learning_rate": 6.372677309604816e-08,
      "logps/chosen": -230.8395233154297,
      "logps/rejected": -218.60824584960938,
      "loss": 0.2448,
      "rewards/chosen": 1.877655029296875,
      "rewards/margins": 4.602280139923096,
      "rewards/rejected": -2.7246251106262207,
      "step": 3334
    },
    {
      "epoch": 0.87,
      "grad_norm": 47.03510665893555,
      "kl": 0.0,
      "learning_rate": 6.359591729913635e-08,
      "logps/chosen": -210.4639129638672,
      "logps/rejected": -389.0422668457031,
      "loss": 0.1982,
      "rewards/chosen": 1.8622188568115234,
      "rewards/margins": 19.06804656982422,
      "rewards/rejected": -17.205827713012695,
      "step": 3335
    },
    {
      "epoch": 0.87,
      "grad_norm": 49.48558807373047,
      "kl": 0.0,
      "learning_rate": 6.346506150222454e-08,
      "logps/chosen": -173.88287353515625,
      "logps/rejected": -371.12628173828125,
      "loss": 0.326,
      "rewards/chosen": 0.8577790260314941,
      "rewards/margins": 6.012696743011475,
      "rewards/rejected": -5.1549177169799805,
      "step": 3336
    },
    {
      "epoch": 0.87,
      "grad_norm": 39.3420295715332,
      "kl": 0.0,
      "learning_rate": 6.333420570531273e-08,
      "logps/chosen": -222.27012634277344,
      "logps/rejected": -150.44436645507812,
      "loss": 0.2694,
      "rewards/chosen": 0.5377728939056396,
      "rewards/margins": 3.824592113494873,
      "rewards/rejected": -3.2868192195892334,
      "step": 3337
    },
    {
      "epoch": 0.87,
      "grad_norm": 28.24842643737793,
      "kl": 0.0,
      "learning_rate": 6.320334990840094e-08,
      "logps/chosen": -137.2236785888672,
      "logps/rejected": -277.0232238769531,
      "loss": 0.1481,
      "rewards/chosen": 1.2536040544509888,
      "rewards/margins": 6.276458263397217,
      "rewards/rejected": -5.022854328155518,
      "step": 3338
    },
    {
      "epoch": 0.87,
      "grad_norm": 36.254573822021484,
      "kl": 0.0,
      "learning_rate": 6.307249411148914e-08,
      "logps/chosen": -181.7095489501953,
      "logps/rejected": -241.9065399169922,
      "loss": 0.2699,
      "rewards/chosen": 1.2226327657699585,
      "rewards/margins": 4.4673848152160645,
      "rewards/rejected": -3.2447519302368164,
      "step": 3339
    },
    {
      "epoch": 0.87,
      "grad_norm": 25.44312858581543,
      "kl": 0.0,
      "learning_rate": 6.294163831457732e-08,
      "logps/chosen": -195.16006469726562,
      "logps/rejected": -225.5717315673828,
      "loss": 0.1913,
      "rewards/chosen": 2.599803924560547,
      "rewards/margins": 5.383212089538574,
      "rewards/rejected": -2.7834081649780273,
      "step": 3340
    },
    {
      "epoch": 0.87,
      "grad_norm": 44.52004623413086,
      "kl": 0.0,
      "learning_rate": 6.281078251766553e-08,
      "logps/chosen": -195.54818725585938,
      "logps/rejected": -170.70751953125,
      "loss": 0.2647,
      "rewards/chosen": 1.62178635597229,
      "rewards/margins": 3.412325859069824,
      "rewards/rejected": -1.7905396223068237,
      "step": 3341
    },
    {
      "epoch": 0.87,
      "grad_norm": 42.561214447021484,
      "kl": 0.0,
      "learning_rate": 6.267992672075373e-08,
      "logps/chosen": -242.3065948486328,
      "logps/rejected": -249.35528564453125,
      "loss": 0.2645,
      "rewards/chosen": 1.8589394092559814,
      "rewards/margins": 5.069093704223633,
      "rewards/rejected": -3.2101545333862305,
      "step": 3342
    },
    {
      "epoch": 0.87,
      "grad_norm": 26.688722610473633,
      "kl": 0.0,
      "learning_rate": 6.254907092384192e-08,
      "logps/chosen": -158.76341247558594,
      "logps/rejected": -157.71682739257812,
      "loss": 0.3068,
      "rewards/chosen": 0.2880159914493561,
      "rewards/margins": 3.0845084190368652,
      "rewards/rejected": -2.796492338180542,
      "step": 3343
    },
    {
      "epoch": 0.88,
      "grad_norm": 26.89533805847168,
      "kl": 0.0,
      "learning_rate": 6.241821512693012e-08,
      "logps/chosen": -279.21527099609375,
      "logps/rejected": -219.7892303466797,
      "loss": 0.1806,
      "rewards/chosen": 1.976793885231018,
      "rewards/margins": 6.151236534118652,
      "rewards/rejected": -4.174442768096924,
      "step": 3344
    },
    {
      "epoch": 0.88,
      "grad_norm": 39.72605895996094,
      "kl": 0.0,
      "learning_rate": 6.228735933001832e-08,
      "logps/chosen": -227.8719940185547,
      "logps/rejected": -207.28826904296875,
      "loss": 0.3296,
      "rewards/chosen": -0.17927008867263794,
      "rewards/margins": 3.6305737495422363,
      "rewards/rejected": -3.8098437786102295,
      "step": 3345
    },
    {
      "epoch": 0.88,
      "grad_norm": 37.94425582885742,
      "kl": 0.0,
      "learning_rate": 6.215650353310651e-08,
      "logps/chosen": -232.4866943359375,
      "logps/rejected": -313.95916748046875,
      "loss": 0.2654,
      "rewards/chosen": 0.6113022565841675,
      "rewards/margins": 4.760047912597656,
      "rewards/rejected": -4.148745536804199,
      "step": 3346
    },
    {
      "epoch": 0.88,
      "grad_norm": 29.92729949951172,
      "kl": 0.0,
      "learning_rate": 6.202564773619471e-08,
      "logps/chosen": -227.82040405273438,
      "logps/rejected": -236.7136688232422,
      "loss": 0.2515,
      "rewards/chosen": 2.12203049659729,
      "rewards/margins": 4.986681938171387,
      "rewards/rejected": -2.8646512031555176,
      "step": 3347
    },
    {
      "epoch": 0.88,
      "grad_norm": 34.22357940673828,
      "kl": 0.0,
      "learning_rate": 6.18947919392829e-08,
      "logps/chosen": -212.64195251464844,
      "logps/rejected": -177.64585876464844,
      "loss": 0.2566,
      "rewards/chosen": 0.41153281927108765,
      "rewards/margins": 3.8872270584106445,
      "rewards/rejected": -3.475694179534912,
      "step": 3348
    },
    {
      "epoch": 0.88,
      "grad_norm": 31.709651947021484,
      "kl": 0.0,
      "learning_rate": 6.17639361423711e-08,
      "logps/chosen": -227.3643035888672,
      "logps/rejected": -234.74111938476562,
      "loss": 0.2212,
      "rewards/chosen": 1.6882359981536865,
      "rewards/margins": 5.849997520446777,
      "rewards/rejected": -4.16176176071167,
      "step": 3349
    },
    {
      "epoch": 0.88,
      "grad_norm": 32.12677764892578,
      "kl": 0.0,
      "learning_rate": 6.163308034545931e-08,
      "logps/chosen": -222.95790100097656,
      "logps/rejected": -250.52259826660156,
      "loss": 0.21,
      "rewards/chosen": 0.7367315292358398,
      "rewards/margins": 4.7167205810546875,
      "rewards/rejected": -3.9799892902374268,
      "step": 3350
    },
    {
      "epoch": 0.88,
      "grad_norm": 32.62001419067383,
      "kl": 0.0,
      "learning_rate": 6.150222454854749e-08,
      "logps/chosen": -214.60733032226562,
      "logps/rejected": -329.4471130371094,
      "loss": 0.1048,
      "rewards/chosen": 2.412802219390869,
      "rewards/margins": 10.279458999633789,
      "rewards/rejected": -7.866656303405762,
      "step": 3351
    },
    {
      "epoch": 0.88,
      "grad_norm": 45.16449737548828,
      "kl": 0.0,
      "learning_rate": 6.13713687516357e-08,
      "logps/chosen": -269.85308837890625,
      "logps/rejected": -269.24371337890625,
      "loss": 0.3168,
      "rewards/chosen": 0.4667782783508301,
      "rewards/margins": 4.049206256866455,
      "rewards/rejected": -3.582427978515625,
      "step": 3352
    },
    {
      "epoch": 0.88,
      "grad_norm": 32.718441009521484,
      "kl": 0.0,
      "learning_rate": 6.12405129547239e-08,
      "logps/chosen": -177.99945068359375,
      "logps/rejected": -303.58941650390625,
      "loss": 0.3193,
      "rewards/chosen": 0.3383503258228302,
      "rewards/margins": 5.382714748382568,
      "rewards/rejected": -5.0443644523620605,
      "step": 3353
    },
    {
      "epoch": 0.88,
      "grad_norm": 41.32862854003906,
      "kl": 0.0,
      "learning_rate": 6.11096571578121e-08,
      "logps/chosen": -215.1548309326172,
      "logps/rejected": -329.5027770996094,
      "loss": 0.1712,
      "rewards/chosen": 1.8988428115844727,
      "rewards/margins": 6.0316267013549805,
      "rewards/rejected": -4.132783889770508,
      "step": 3354
    },
    {
      "epoch": 0.88,
      "grad_norm": 52.48097229003906,
      "kl": 0.0,
      "learning_rate": 6.097880136090029e-08,
      "logps/chosen": -199.16217041015625,
      "logps/rejected": -284.30108642578125,
      "loss": 0.2402,
      "rewards/chosen": -0.06404250860214233,
      "rewards/margins": 4.585357189178467,
      "rewards/rejected": -4.649399757385254,
      "step": 3355
    },
    {
      "epoch": 0.88,
      "grad_norm": 28.11062240600586,
      "kl": 0.0,
      "learning_rate": 6.084794556398847e-08,
      "logps/chosen": -156.20716857910156,
      "logps/rejected": -228.56581115722656,
      "loss": 0.2018,
      "rewards/chosen": 1.7657132148742676,
      "rewards/margins": 5.493755340576172,
      "rewards/rejected": -3.728041887283325,
      "step": 3356
    },
    {
      "epoch": 0.88,
      "grad_norm": 36.19900131225586,
      "kl": 0.0,
      "learning_rate": 6.071708976707668e-08,
      "logps/chosen": -171.58216857910156,
      "logps/rejected": -188.8887481689453,
      "loss": 0.2677,
      "rewards/chosen": 1.182680606842041,
      "rewards/margins": 2.961630344390869,
      "rewards/rejected": -1.7789497375488281,
      "step": 3357
    },
    {
      "epoch": 0.88,
      "grad_norm": 32.472721099853516,
      "kl": 0.0,
      "learning_rate": 6.058623397016488e-08,
      "logps/chosen": -341.27197265625,
      "logps/rejected": -182.00791931152344,
      "loss": 0.1525,
      "rewards/chosen": 3.77966046333313,
      "rewards/margins": 6.490664482116699,
      "rewards/rejected": -2.7110040187835693,
      "step": 3358
    },
    {
      "epoch": 0.88,
      "grad_norm": 27.687116622924805,
      "kl": 0.0,
      "learning_rate": 6.045537817325307e-08,
      "logps/chosen": -152.61212158203125,
      "logps/rejected": -318.43719482421875,
      "loss": 0.2795,
      "rewards/chosen": 0.9022289514541626,
      "rewards/margins": 4.633611679077148,
      "rewards/rejected": -3.7313828468322754,
      "step": 3359
    },
    {
      "epoch": 0.88,
      "grad_norm": 29.510347366333008,
      "kl": 0.0,
      "learning_rate": 6.032452237634127e-08,
      "logps/chosen": -184.28567504882812,
      "logps/rejected": -226.7508544921875,
      "loss": 0.2104,
      "rewards/chosen": 1.5323501825332642,
      "rewards/margins": 5.288782119750977,
      "rewards/rejected": -3.756431818008423,
      "step": 3360
    },
    {
      "epoch": 0.88,
      "grad_norm": 33.84703063964844,
      "kl": 0.0,
      "learning_rate": 6.019366657942947e-08,
      "logps/chosen": -202.72412109375,
      "logps/rejected": -185.4829559326172,
      "loss": 0.1574,
      "rewards/chosen": 0.6997545957565308,
      "rewards/margins": 3.3935866355895996,
      "rewards/rejected": -2.6938319206237793,
      "step": 3361
    },
    {
      "epoch": 0.88,
      "grad_norm": 36.224327087402344,
      "kl": 0.0,
      "learning_rate": 6.006281078251766e-08,
      "logps/chosen": -192.3488006591797,
      "logps/rejected": -253.87930297851562,
      "loss": 0.3257,
      "rewards/chosen": 0.5731613636016846,
      "rewards/margins": 2.5195703506469727,
      "rewards/rejected": -1.946408987045288,
      "step": 3362
    },
    {
      "epoch": 0.88,
      "grad_norm": 31.756702423095703,
      "kl": 0.0,
      "learning_rate": 5.993195498560586e-08,
      "logps/chosen": -220.3318634033203,
      "logps/rejected": -325.5659484863281,
      "loss": 0.2633,
      "rewards/chosen": 1.128380537033081,
      "rewards/margins": 5.160576820373535,
      "rewards/rejected": -4.032196044921875,
      "step": 3363
    },
    {
      "epoch": 0.88,
      "grad_norm": 29.23659896850586,
      "kl": 0.0,
      "learning_rate": 5.980109918869405e-08,
      "logps/chosen": -245.1772003173828,
      "logps/rejected": -314.14703369140625,
      "loss": 0.2172,
      "rewards/chosen": 1.6287401914596558,
      "rewards/margins": 5.567266464233398,
      "rewards/rejected": -3.9385263919830322,
      "step": 3364
    },
    {
      "epoch": 0.88,
      "grad_norm": 37.184505462646484,
      "kl": 0.0,
      "learning_rate": 5.967024339178225e-08,
      "logps/chosen": -262.8486022949219,
      "logps/rejected": -166.6172637939453,
      "loss": 0.2953,
      "rewards/chosen": 1.73638916015625,
      "rewards/margins": 3.695864200592041,
      "rewards/rejected": -1.9594751596450806,
      "step": 3365
    },
    {
      "epoch": 0.88,
      "grad_norm": 40.81242370605469,
      "kl": 0.0,
      "learning_rate": 5.953938759487045e-08,
      "logps/chosen": -137.64544677734375,
      "logps/rejected": -172.7340087890625,
      "loss": 0.3206,
      "rewards/chosen": 0.6962337493896484,
      "rewards/margins": 4.178898334503174,
      "rewards/rejected": -3.4826645851135254,
      "step": 3366
    },
    {
      "epoch": 0.88,
      "grad_norm": 47.62089920043945,
      "kl": 0.0,
      "learning_rate": 5.940853179795865e-08,
      "logps/chosen": -161.90464782714844,
      "logps/rejected": -241.15768432617188,
      "loss": 0.1864,
      "rewards/chosen": 1.6212987899780273,
      "rewards/margins": 4.349338054656982,
      "rewards/rejected": -2.728039264678955,
      "step": 3367
    },
    {
      "epoch": 0.88,
      "grad_norm": 28.67351531982422,
      "kl": 0.0,
      "learning_rate": 5.927767600104684e-08,
      "logps/chosen": -229.27951049804688,
      "logps/rejected": -155.9366455078125,
      "loss": 0.2764,
      "rewards/chosen": 1.6246337890625,
      "rewards/margins": 5.1172051429748535,
      "rewards/rejected": -3.4925713539123535,
      "step": 3368
    },
    {
      "epoch": 0.88,
      "grad_norm": 21.98739242553711,
      "kl": 0.0,
      "learning_rate": 5.914682020413504e-08,
      "logps/chosen": -253.05848693847656,
      "logps/rejected": -254.97067260742188,
      "loss": 0.3061,
      "rewards/chosen": 0.8975669145584106,
      "rewards/margins": 3.9961724281311035,
      "rewards/rejected": -3.0986056327819824,
      "step": 3369
    },
    {
      "epoch": 0.88,
      "grad_norm": 33.838050842285156,
      "kl": 0.0,
      "learning_rate": 5.901596440722324e-08,
      "logps/chosen": -179.86862182617188,
      "logps/rejected": -205.198486328125,
      "loss": 0.2797,
      "rewards/chosen": 0.18361665308475494,
      "rewards/margins": 3.2206592559814453,
      "rewards/rejected": -3.0370426177978516,
      "step": 3370
    },
    {
      "epoch": 0.88,
      "grad_norm": 40.067752838134766,
      "kl": 0.0,
      "learning_rate": 5.8885108610311434e-08,
      "logps/chosen": -206.2080841064453,
      "logps/rejected": -243.10455322265625,
      "loss": 0.2762,
      "rewards/chosen": 0.2042618691921234,
      "rewards/margins": 3.7257778644561768,
      "rewards/rejected": -3.5215160846710205,
      "step": 3371
    },
    {
      "epoch": 0.88,
      "grad_norm": 21.66316032409668,
      "kl": 0.0,
      "learning_rate": 5.875425281339963e-08,
      "logps/chosen": -236.3871612548828,
      "logps/rejected": -286.03564453125,
      "loss": 0.3134,
      "rewards/chosen": 1.657511591911316,
      "rewards/margins": 5.718562602996826,
      "rewards/rejected": -4.061050891876221,
      "step": 3372
    },
    {
      "epoch": 0.88,
      "grad_norm": 27.895227432250977,
      "kl": 0.0,
      "learning_rate": 5.862339701648783e-08,
      "logps/chosen": -246.6658477783203,
      "logps/rejected": -306.644775390625,
      "loss": 0.2309,
      "rewards/chosen": 0.20239301025867462,
      "rewards/margins": 6.392871856689453,
      "rewards/rejected": -6.190478801727295,
      "step": 3373
    },
    {
      "epoch": 0.88,
      "grad_norm": 25.335817337036133,
      "kl": 0.0,
      "learning_rate": 5.849254121957602e-08,
      "logps/chosen": -218.54864501953125,
      "logps/rejected": -249.16184997558594,
      "loss": 0.2853,
      "rewards/chosen": 1.310225248336792,
      "rewards/margins": 4.546543121337891,
      "rewards/rejected": -3.2363181114196777,
      "step": 3374
    },
    {
      "epoch": 0.88,
      "grad_norm": 40.887332916259766,
      "kl": 0.0,
      "learning_rate": 5.8361685422664225e-08,
      "logps/chosen": -132.6246795654297,
      "logps/rejected": -322.7361755371094,
      "loss": 0.2353,
      "rewards/chosen": 1.1172531843185425,
      "rewards/margins": 5.526000499725342,
      "rewards/rejected": -4.40874719619751,
      "step": 3375
    },
    {
      "epoch": 0.88,
      "grad_norm": 33.22807312011719,
      "kl": 0.0,
      "learning_rate": 5.8230829625752415e-08,
      "logps/chosen": -134.68377685546875,
      "logps/rejected": -236.12823486328125,
      "loss": 0.2749,
      "rewards/chosen": 0.8383082151412964,
      "rewards/margins": 4.732278823852539,
      "rewards/rejected": -3.8939707279205322,
      "step": 3376
    },
    {
      "epoch": 0.88,
      "grad_norm": 21.896162033081055,
      "kl": 0.0,
      "learning_rate": 5.809997382884062e-08,
      "logps/chosen": -179.1840057373047,
      "logps/rejected": -266.8948974609375,
      "loss": 0.141,
      "rewards/chosen": 1.1683275699615479,
      "rewards/margins": 5.748424530029297,
      "rewards/rejected": -4.580097198486328,
      "step": 3377
    },
    {
      "epoch": 0.88,
      "grad_norm": 46.306766510009766,
      "kl": 0.0,
      "learning_rate": 5.7969118031928813e-08,
      "logps/chosen": -234.8936767578125,
      "logps/rejected": -172.52682495117188,
      "loss": 0.321,
      "rewards/chosen": -0.017574980854988098,
      "rewards/margins": 3.2141342163085938,
      "rewards/rejected": -3.2317092418670654,
      "step": 3378
    },
    {
      "epoch": 0.88,
      "grad_norm": 34.68102264404297,
      "kl": 0.0,
      "learning_rate": 5.783826223501701e-08,
      "logps/chosen": -224.55445861816406,
      "logps/rejected": -277.49237060546875,
      "loss": 0.2509,
      "rewards/chosen": 1.5944043397903442,
      "rewards/margins": 5.50456428527832,
      "rewards/rejected": -3.9101598262786865,
      "step": 3379
    },
    {
      "epoch": 0.88,
      "grad_norm": 20.631187438964844,
      "kl": 0.0,
      "learning_rate": 5.7707406438105206e-08,
      "logps/chosen": -171.95504760742188,
      "logps/rejected": -330.640380859375,
      "loss": 0.091,
      "rewards/chosen": 2.3494856357574463,
      "rewards/margins": 7.262475967407227,
      "rewards/rejected": -4.912990570068359,
      "step": 3380
    },
    {
      "epoch": 0.88,
      "grad_norm": 24.329391479492188,
      "kl": 0.0,
      "learning_rate": 5.757655064119341e-08,
      "logps/chosen": -177.57826232910156,
      "logps/rejected": -240.90550231933594,
      "loss": 0.2097,
      "rewards/chosen": 1.9974151849746704,
      "rewards/margins": 6.527149677276611,
      "rewards/rejected": -4.5297346115112305,
      "step": 3381
    },
    {
      "epoch": 0.89,
      "grad_norm": 24.810420989990234,
      "kl": 0.0,
      "learning_rate": 5.74456948442816e-08,
      "logps/chosen": -177.7817840576172,
      "logps/rejected": -236.9996795654297,
      "loss": 0.2142,
      "rewards/chosen": -0.16691847145557404,
      "rewards/margins": 4.240876197814941,
      "rewards/rejected": -4.40779447555542,
      "step": 3382
    },
    {
      "epoch": 0.89,
      "grad_norm": 34.094512939453125,
      "kl": 0.0,
      "learning_rate": 5.73148390473698e-08,
      "logps/chosen": -189.9762725830078,
      "logps/rejected": -209.1165313720703,
      "loss": 0.2814,
      "rewards/chosen": 1.5954288244247437,
      "rewards/margins": 4.066312789916992,
      "rewards/rejected": -2.470883846282959,
      "step": 3383
    },
    {
      "epoch": 0.89,
      "grad_norm": 27.294633865356445,
      "kl": 0.0,
      "learning_rate": 5.718398325045799e-08,
      "logps/chosen": -159.4602813720703,
      "logps/rejected": -260.0506896972656,
      "loss": 0.2759,
      "rewards/chosen": 1.1249538660049438,
      "rewards/margins": 3.517695903778076,
      "rewards/rejected": -2.392742156982422,
      "step": 3384
    },
    {
      "epoch": 0.89,
      "grad_norm": 27.89652442932129,
      "kl": 0.0,
      "learning_rate": 5.7053127453546186e-08,
      "logps/chosen": -237.68202209472656,
      "logps/rejected": -262.3923645019531,
      "loss": 0.1512,
      "rewards/chosen": 2.2936761379241943,
      "rewards/margins": 6.410256385803223,
      "rewards/rejected": -4.116580009460449,
      "step": 3385
    },
    {
      "epoch": 0.89,
      "grad_norm": 35.205177307128906,
      "kl": 0.0,
      "learning_rate": 5.692227165663439e-08,
      "logps/chosen": -226.78341674804688,
      "logps/rejected": -203.07223510742188,
      "loss": 0.2688,
      "rewards/chosen": 0.8353214859962463,
      "rewards/margins": 5.021864414215088,
      "rewards/rejected": -4.186542987823486,
      "step": 3386
    },
    {
      "epoch": 0.89,
      "grad_norm": 29.306995391845703,
      "kl": 0.0,
      "learning_rate": 5.679141585972258e-08,
      "logps/chosen": -136.1898193359375,
      "logps/rejected": -186.20309448242188,
      "loss": 0.2487,
      "rewards/chosen": 0.21132591366767883,
      "rewards/margins": 4.531725883483887,
      "rewards/rejected": -4.320399761199951,
      "step": 3387
    },
    {
      "epoch": 0.89,
      "grad_norm": 33.45652389526367,
      "kl": 0.0,
      "learning_rate": 5.666056006281078e-08,
      "logps/chosen": -280.9603576660156,
      "logps/rejected": -256.38214111328125,
      "loss": 0.141,
      "rewards/chosen": 2.556107997894287,
      "rewards/margins": 6.635317325592041,
      "rewards/rejected": -4.079209327697754,
      "step": 3388
    },
    {
      "epoch": 0.89,
      "grad_norm": 31.559064865112305,
      "kl": 0.0,
      "learning_rate": 5.652970426589898e-08,
      "logps/chosen": -217.60220336914062,
      "logps/rejected": -288.64434814453125,
      "loss": 0.1909,
      "rewards/chosen": 3.050231456756592,
      "rewards/margins": 6.9026713371276855,
      "rewards/rejected": -3.8524398803710938,
      "step": 3389
    },
    {
      "epoch": 0.89,
      "grad_norm": 35.80206298828125,
      "kl": 0.0,
      "learning_rate": 5.639884846898717e-08,
      "logps/chosen": -221.58529663085938,
      "logps/rejected": -273.1722717285156,
      "loss": 0.2664,
      "rewards/chosen": 0.20219017565250397,
      "rewards/margins": 3.6827523708343506,
      "rewards/rejected": -3.480562210083008,
      "step": 3390
    },
    {
      "epoch": 0.89,
      "grad_norm": 36.897422790527344,
      "kl": 0.0,
      "learning_rate": 5.626799267207537e-08,
      "logps/chosen": -169.41078186035156,
      "logps/rejected": -203.75230407714844,
      "loss": 0.3144,
      "rewards/chosen": 0.9681345224380493,
      "rewards/margins": 2.774806022644043,
      "rewards/rejected": -1.806671380996704,
      "step": 3391
    },
    {
      "epoch": 0.89,
      "grad_norm": 34.7119026184082,
      "kl": 0.0,
      "learning_rate": 5.6137136875163565e-08,
      "logps/chosen": -191.32791137695312,
      "logps/rejected": -206.8913116455078,
      "loss": 0.2233,
      "rewards/chosen": 2.613614797592163,
      "rewards/margins": 6.831250190734863,
      "rewards/rejected": -4.217635154724121,
      "step": 3392
    },
    {
      "epoch": 0.89,
      "grad_norm": 29.810110092163086,
      "kl": 0.0,
      "learning_rate": 5.600628107825176e-08,
      "logps/chosen": -232.5635528564453,
      "logps/rejected": -232.62684631347656,
      "loss": 0.1641,
      "rewards/chosen": 1.6024681329727173,
      "rewards/margins": 5.8191447257995605,
      "rewards/rejected": -4.216676712036133,
      "step": 3393
    },
    {
      "epoch": 0.89,
      "grad_norm": 31.482484817504883,
      "kl": 0.0,
      "learning_rate": 5.5875425281339964e-08,
      "logps/chosen": -226.40219116210938,
      "logps/rejected": -293.3888854980469,
      "loss": 0.3131,
      "rewards/chosen": 0.6189436912536621,
      "rewards/margins": 4.934136867523193,
      "rewards/rejected": -4.315193176269531,
      "step": 3394
    },
    {
      "epoch": 0.89,
      "grad_norm": 40.35493850708008,
      "kl": 0.0,
      "learning_rate": 5.5744569484428154e-08,
      "logps/chosen": -156.18832397460938,
      "logps/rejected": -336.05908203125,
      "loss": 0.2243,
      "rewards/chosen": 1.6128734350204468,
      "rewards/margins": 4.650113105773926,
      "rewards/rejected": -3.0372395515441895,
      "step": 3395
    },
    {
      "epoch": 0.89,
      "grad_norm": 36.46339416503906,
      "kl": 0.0,
      "learning_rate": 5.5613713687516356e-08,
      "logps/chosen": -178.1088104248047,
      "logps/rejected": -283.3271484375,
      "loss": 0.2208,
      "rewards/chosen": 1.1255799531936646,
      "rewards/margins": 5.292938232421875,
      "rewards/rejected": -4.1673583984375,
      "step": 3396
    },
    {
      "epoch": 0.89,
      "grad_norm": 41.65345764160156,
      "kl": 0.0,
      "learning_rate": 5.548285789060455e-08,
      "logps/chosen": -280.9771423339844,
      "logps/rejected": -273.1884460449219,
      "loss": 0.3031,
      "rewards/chosen": 0.27189967036247253,
      "rewards/margins": 5.235050678253174,
      "rewards/rejected": -4.963150978088379,
      "step": 3397
    },
    {
      "epoch": 0.89,
      "grad_norm": 37.78427505493164,
      "kl": 0.0,
      "learning_rate": 5.535200209369275e-08,
      "logps/chosen": -246.3995361328125,
      "logps/rejected": -295.0464172363281,
      "loss": 0.222,
      "rewards/chosen": 1.722731351852417,
      "rewards/margins": 5.892146110534668,
      "rewards/rejected": -4.16941499710083,
      "step": 3398
    },
    {
      "epoch": 0.89,
      "grad_norm": 28.25185775756836,
      "kl": 0.0,
      "learning_rate": 5.5221146296780945e-08,
      "logps/chosen": -170.24465942382812,
      "logps/rejected": -228.2877655029297,
      "loss": 0.1923,
      "rewards/chosen": 1.01484215259552,
      "rewards/margins": 3.821547508239746,
      "rewards/rejected": -2.8067054748535156,
      "step": 3399
    },
    {
      "epoch": 0.89,
      "grad_norm": 33.23460388183594,
      "kl": 0.0,
      "learning_rate": 5.509029049986914e-08,
      "logps/chosen": -149.94390869140625,
      "logps/rejected": -215.99136352539062,
      "loss": 0.3428,
      "rewards/chosen": 1.3301762342453003,
      "rewards/margins": 4.284788131713867,
      "rewards/rejected": -2.9546117782592773,
      "step": 3400
    },
    {
      "epoch": 0.89,
      "grad_norm": 30.18558692932129,
      "kl": 0.0,
      "learning_rate": 5.495943470295734e-08,
      "logps/chosen": -178.04258728027344,
      "logps/rejected": -250.28799438476562,
      "loss": 0.1482,
      "rewards/chosen": 1.288857340812683,
      "rewards/margins": 5.212002754211426,
      "rewards/rejected": -3.9231455326080322,
      "step": 3401
    },
    {
      "epoch": 0.89,
      "grad_norm": 32.04692840576172,
      "kl": 0.0,
      "learning_rate": 5.482857890604554e-08,
      "logps/chosen": -170.0665740966797,
      "logps/rejected": -245.35592651367188,
      "loss": 0.2146,
      "rewards/chosen": 1.415221929550171,
      "rewards/margins": 7.141057968139648,
      "rewards/rejected": -5.725835800170898,
      "step": 3402
    },
    {
      "epoch": 0.89,
      "grad_norm": 34.108558654785156,
      "kl": 0.0,
      "learning_rate": 5.469772310913373e-08,
      "logps/chosen": -185.75926208496094,
      "logps/rejected": -248.67950439453125,
      "loss": 0.2838,
      "rewards/chosen": 0.09984666109085083,
      "rewards/margins": 3.3367090225219727,
      "rewards/rejected": -3.2368624210357666,
      "step": 3403
    },
    {
      "epoch": 0.89,
      "grad_norm": 33.87335968017578,
      "kl": 0.0,
      "learning_rate": 5.456686731222193e-08,
      "logps/chosen": -267.390625,
      "logps/rejected": -268.80877685546875,
      "loss": 0.2325,
      "rewards/chosen": 2.265165328979492,
      "rewards/margins": 5.600419044494629,
      "rewards/rejected": -3.3352534770965576,
      "step": 3404
    },
    {
      "epoch": 0.89,
      "grad_norm": 24.61138916015625,
      "kl": 0.0,
      "learning_rate": 5.443601151531013e-08,
      "logps/chosen": -142.63829040527344,
      "logps/rejected": -187.2084197998047,
      "loss": 0.1972,
      "rewards/chosen": 2.221682071685791,
      "rewards/margins": 6.340028762817383,
      "rewards/rejected": -4.118346691131592,
      "step": 3405
    },
    {
      "epoch": 0.89,
      "grad_norm": 32.826377868652344,
      "kl": 0.0,
      "learning_rate": 5.4305155718398324e-08,
      "logps/chosen": -232.07369995117188,
      "logps/rejected": -256.4215087890625,
      "loss": 0.2689,
      "rewards/chosen": 0.41094404458999634,
      "rewards/margins": 4.343574047088623,
      "rewards/rejected": -3.9326298236846924,
      "step": 3406
    },
    {
      "epoch": 0.89,
      "grad_norm": 43.49300765991211,
      "kl": 0.0,
      "learning_rate": 5.417429992148652e-08,
      "logps/chosen": -308.1954040527344,
      "logps/rejected": -250.01596069335938,
      "loss": 0.336,
      "rewards/chosen": 0.7856874465942383,
      "rewards/margins": 3.084690570831299,
      "rewards/rejected": -2.2990031242370605,
      "step": 3407
    },
    {
      "epoch": 0.89,
      "grad_norm": 31.34882354736328,
      "kl": 0.0,
      "learning_rate": 5.4043444124574716e-08,
      "logps/chosen": -118.74658203125,
      "logps/rejected": -236.03623962402344,
      "loss": 0.1779,
      "rewards/chosen": 2.0077106952667236,
      "rewards/margins": 5.9031219482421875,
      "rewards/rejected": -3.895411491394043,
      "step": 3408
    },
    {
      "epoch": 0.89,
      "grad_norm": 36.540462493896484,
      "kl": 0.0,
      "learning_rate": 5.391258832766291e-08,
      "logps/chosen": -235.9467010498047,
      "logps/rejected": -239.76263427734375,
      "loss": 0.3456,
      "rewards/chosen": 0.06817644834518433,
      "rewards/margins": 3.937302350997925,
      "rewards/rejected": -3.8691258430480957,
      "step": 3409
    },
    {
      "epoch": 0.89,
      "grad_norm": 29.51076889038086,
      "kl": 0.0,
      "learning_rate": 5.3781732530751115e-08,
      "logps/chosen": -258.8054504394531,
      "logps/rejected": -237.52342224121094,
      "loss": 0.2351,
      "rewards/chosen": 1.873181700706482,
      "rewards/margins": 6.267869472503662,
      "rewards/rejected": -4.394687652587891,
      "step": 3410
    },
    {
      "epoch": 0.89,
      "grad_norm": 36.962745666503906,
      "kl": 0.0,
      "learning_rate": 5.3650876733839304e-08,
      "logps/chosen": -270.0163879394531,
      "logps/rejected": -256.7657470703125,
      "loss": 0.2353,
      "rewards/chosen": 1.1174057722091675,
      "rewards/margins": 3.3676934242248535,
      "rewards/rejected": -2.2502877712249756,
      "step": 3411
    },
    {
      "epoch": 0.89,
      "grad_norm": 31.154197692871094,
      "kl": 0.0,
      "learning_rate": 5.352002093692751e-08,
      "logps/chosen": -211.06796264648438,
      "logps/rejected": -339.4208984375,
      "loss": 0.2567,
      "rewards/chosen": 1.7467280626296997,
      "rewards/margins": 4.826638698577881,
      "rewards/rejected": -3.0799107551574707,
      "step": 3412
    },
    {
      "epoch": 0.89,
      "grad_norm": 41.743247985839844,
      "kl": 0.0,
      "learning_rate": 5.33891651400157e-08,
      "logps/chosen": -251.5352020263672,
      "logps/rejected": -174.1204376220703,
      "loss": 0.2898,
      "rewards/chosen": 0.8819773197174072,
      "rewards/margins": 5.040376663208008,
      "rewards/rejected": -4.1583991050720215,
      "step": 3413
    },
    {
      "epoch": 0.89,
      "grad_norm": 20.825239181518555,
      "kl": 0.0,
      "learning_rate": 5.325830934310389e-08,
      "logps/chosen": -258.7743225097656,
      "logps/rejected": -294.87408447265625,
      "loss": 0.1343,
      "rewards/chosen": 2.4211437702178955,
      "rewards/margins": 6.7697038650512695,
      "rewards/rejected": -4.348559856414795,
      "step": 3414
    },
    {
      "epoch": 0.89,
      "grad_norm": 38.2694091796875,
      "kl": 0.0,
      "learning_rate": 5.3127453546192095e-08,
      "logps/chosen": -180.42234802246094,
      "logps/rejected": -224.37460327148438,
      "loss": 0.2061,
      "rewards/chosen": 2.412104606628418,
      "rewards/margins": 5.695418357849121,
      "rewards/rejected": -3.283313751220703,
      "step": 3415
    },
    {
      "epoch": 0.89,
      "grad_norm": 41.9177131652832,
      "kl": 0.0,
      "learning_rate": 5.2996597749280285e-08,
      "logps/chosen": -199.71421813964844,
      "logps/rejected": -248.69764709472656,
      "loss": 0.2886,
      "rewards/chosen": 0.07714885473251343,
      "rewards/margins": 3.2543482780456543,
      "rewards/rejected": -3.177199363708496,
      "step": 3416
    },
    {
      "epoch": 0.89,
      "grad_norm": 48.770668029785156,
      "kl": 0.0,
      "learning_rate": 5.286574195236849e-08,
      "logps/chosen": -191.14852905273438,
      "logps/rejected": -266.9022216796875,
      "loss": 0.2607,
      "rewards/chosen": 0.6749199628829956,
      "rewards/margins": 3.958329677581787,
      "rewards/rejected": -3.283409595489502,
      "step": 3417
    },
    {
      "epoch": 0.89,
      "grad_norm": 27.754985809326172,
      "kl": 0.0,
      "learning_rate": 5.2734886155456684e-08,
      "logps/chosen": -191.7516326904297,
      "logps/rejected": -189.91012573242188,
      "loss": 0.1772,
      "rewards/chosen": 0.6567466259002686,
      "rewards/margins": 3.608036756515503,
      "rewards/rejected": -2.9512901306152344,
      "step": 3418
    },
    {
      "epoch": 0.89,
      "grad_norm": 36.7862548828125,
      "kl": 0.0,
      "learning_rate": 5.260403035854488e-08,
      "logps/chosen": -152.87648010253906,
      "logps/rejected": -252.29257202148438,
      "loss": 0.2769,
      "rewards/chosen": 1.3332586288452148,
      "rewards/margins": 3.6783080101013184,
      "rewards/rejected": -2.3450493812561035,
      "step": 3419
    },
    {
      "epoch": 0.9,
      "grad_norm": 36.703426361083984,
      "kl": 0.0,
      "learning_rate": 5.2473174561633076e-08,
      "logps/chosen": -257.3609619140625,
      "logps/rejected": -244.8799591064453,
      "loss": 0.2853,
      "rewards/chosen": 0.4611107110977173,
      "rewards/margins": 5.041869640350342,
      "rewards/rejected": -4.580759048461914,
      "step": 3420
    },
    {
      "epoch": 0.9,
      "grad_norm": 34.300514221191406,
      "kl": 0.0,
      "learning_rate": 5.234231876472128e-08,
      "logps/chosen": -157.11154174804688,
      "logps/rejected": -214.3528289794922,
      "loss": 0.2159,
      "rewards/chosen": 1.7635271549224854,
      "rewards/margins": 5.321822166442871,
      "rewards/rejected": -3.558295249938965,
      "step": 3421
    },
    {
      "epoch": 0.9,
      "grad_norm": 37.2406120300293,
      "kl": 0.0,
      "learning_rate": 5.221146296780947e-08,
      "logps/chosen": -217.28884887695312,
      "logps/rejected": -221.66026306152344,
      "loss": 0.1318,
      "rewards/chosen": 3.0308799743652344,
      "rewards/margins": 7.286211967468262,
      "rewards/rejected": -4.255331993103027,
      "step": 3422
    },
    {
      "epoch": 0.9,
      "grad_norm": 30.882293701171875,
      "kl": 0.0,
      "learning_rate": 5.208060717089767e-08,
      "logps/chosen": -182.9688262939453,
      "logps/rejected": -254.4478302001953,
      "loss": 0.1496,
      "rewards/chosen": 1.0490161180496216,
      "rewards/margins": 4.701062202453613,
      "rewards/rejected": -3.6520462036132812,
      "step": 3423
    },
    {
      "epoch": 0.9,
      "grad_norm": 36.64192581176758,
      "kl": 0.0,
      "learning_rate": 5.194975137398586e-08,
      "logps/chosen": -188.86862182617188,
      "logps/rejected": -242.87950134277344,
      "loss": 0.2503,
      "rewards/chosen": -1.1615270376205444,
      "rewards/margins": 2.616291046142578,
      "rewards/rejected": -3.777817964553833,
      "step": 3424
    },
    {
      "epoch": 0.9,
      "grad_norm": 32.814735412597656,
      "kl": 0.0,
      "learning_rate": 5.181889557707406e-08,
      "logps/chosen": -206.17967224121094,
      "logps/rejected": -267.380615234375,
      "loss": 0.1992,
      "rewards/chosen": 3.037926197052002,
      "rewards/margins": 5.702183246612549,
      "rewards/rejected": -2.664257049560547,
      "step": 3425
    },
    {
      "epoch": 0.9,
      "grad_norm": 37.48391342163086,
      "kl": 0.0,
      "learning_rate": 5.168803978016226e-08,
      "logps/chosen": -156.80770874023438,
      "logps/rejected": -240.1800079345703,
      "loss": 0.2122,
      "rewards/chosen": 1.6631457805633545,
      "rewards/margins": 3.9029107093811035,
      "rewards/rejected": -2.239764928817749,
      "step": 3426
    },
    {
      "epoch": 0.9,
      "grad_norm": 28.892189025878906,
      "kl": 0.0,
      "learning_rate": 5.1557183983250455e-08,
      "logps/chosen": -166.68853759765625,
      "logps/rejected": -338.1997375488281,
      "loss": 0.3027,
      "rewards/chosen": 0.10017256438732147,
      "rewards/margins": 2.704484701156616,
      "rewards/rejected": -2.6043121814727783,
      "step": 3427
    },
    {
      "epoch": 0.9,
      "grad_norm": 25.72585105895996,
      "kl": 0.0,
      "learning_rate": 5.142632818633865e-08,
      "logps/chosen": -125.46946716308594,
      "logps/rejected": -273.8974609375,
      "loss": 0.1832,
      "rewards/chosen": 1.6431037187576294,
      "rewards/margins": 4.477502822875977,
      "rewards/rejected": -2.8343989849090576,
      "step": 3428
    },
    {
      "epoch": 0.9,
      "grad_norm": 32.71539306640625,
      "kl": 0.0,
      "learning_rate": 5.1295472389426854e-08,
      "logps/chosen": -202.12356567382812,
      "logps/rejected": -202.73655700683594,
      "loss": 0.2059,
      "rewards/chosen": -0.09025682508945465,
      "rewards/margins": 3.4574134349823,
      "rewards/rejected": -3.547670364379883,
      "step": 3429
    },
    {
      "epoch": 0.9,
      "grad_norm": 33.782875061035156,
      "kl": 0.0,
      "learning_rate": 5.116461659251504e-08,
      "logps/chosen": -207.37619018554688,
      "logps/rejected": -195.9562225341797,
      "loss": 0.2405,
      "rewards/chosen": 1.3065872192382812,
      "rewards/margins": 3.7297723293304443,
      "rewards/rejected": -2.423185110092163,
      "step": 3430
    },
    {
      "epoch": 0.9,
      "grad_norm": 41.365657806396484,
      "kl": 0.0,
      "learning_rate": 5.1033760795603246e-08,
      "logps/chosen": -129.8498077392578,
      "logps/rejected": -163.44862365722656,
      "loss": 0.2667,
      "rewards/chosen": 0.3854326903820038,
      "rewards/margins": 2.549304962158203,
      "rewards/rejected": -2.163872241973877,
      "step": 3431
    },
    {
      "epoch": 0.9,
      "grad_norm": 44.39329147338867,
      "kl": 0.0,
      "learning_rate": 5.0902904998691435e-08,
      "logps/chosen": -182.01693725585938,
      "logps/rejected": -238.8572998046875,
      "loss": 0.1059,
      "rewards/chosen": 0.8487826585769653,
      "rewards/margins": 4.960758686065674,
      "rewards/rejected": -4.111976146697998,
      "step": 3432
    },
    {
      "epoch": 0.9,
      "grad_norm": 25.658466339111328,
      "kl": 0.0,
      "learning_rate": 5.077204920177964e-08,
      "logps/chosen": -158.74652099609375,
      "logps/rejected": -301.00299072265625,
      "loss": 0.1193,
      "rewards/chosen": 0.1914927214384079,
      "rewards/margins": 5.276811599731445,
      "rewards/rejected": -5.0853190422058105,
      "step": 3433
    },
    {
      "epoch": 0.9,
      "grad_norm": 28.961124420166016,
      "kl": 0.0,
      "learning_rate": 5.0641193404867834e-08,
      "logps/chosen": -187.64295959472656,
      "logps/rejected": -201.07196044921875,
      "loss": 0.1929,
      "rewards/chosen": 1.7851966619491577,
      "rewards/margins": 5.109597682952881,
      "rewards/rejected": -3.3244009017944336,
      "step": 3434
    },
    {
      "epoch": 0.9,
      "grad_norm": 34.09691619873047,
      "kl": 0.0,
      "learning_rate": 5.051033760795603e-08,
      "logps/chosen": -243.12417602539062,
      "logps/rejected": -236.99737548828125,
      "loss": 0.2327,
      "rewards/chosen": 1.3128129243850708,
      "rewards/margins": 5.337040424346924,
      "rewards/rejected": -4.024227619171143,
      "step": 3435
    },
    {
      "epoch": 0.9,
      "grad_norm": 30.38089942932129,
      "kl": 0.0,
      "learning_rate": 5.0379481811044226e-08,
      "logps/chosen": -234.9790802001953,
      "logps/rejected": -238.9598388671875,
      "loss": 0.2465,
      "rewards/chosen": 1.983193039894104,
      "rewards/margins": 7.679162502288818,
      "rewards/rejected": -5.695969581604004,
      "step": 3436
    },
    {
      "epoch": 0.9,
      "grad_norm": 36.361236572265625,
      "kl": 0.0,
      "learning_rate": 5.024862601413243e-08,
      "logps/chosen": -166.4700469970703,
      "logps/rejected": -247.67010498046875,
      "loss": 0.2001,
      "rewards/chosen": 1.0058622360229492,
      "rewards/margins": 3.8643991947174072,
      "rewards/rejected": -2.858536958694458,
      "step": 3437
    },
    {
      "epoch": 0.9,
      "grad_norm": 27.40168571472168,
      "kl": 0.0,
      "learning_rate": 5.011777021722062e-08,
      "logps/chosen": -183.8133087158203,
      "logps/rejected": -230.89952087402344,
      "loss": 0.2022,
      "rewards/chosen": 1.5351980924606323,
      "rewards/margins": 4.581424236297607,
      "rewards/rejected": -3.0462260246276855,
      "step": 3438
    },
    {
      "epoch": 0.9,
      "grad_norm": 37.18417739868164,
      "kl": 0.0,
      "learning_rate": 4.998691442030882e-08,
      "logps/chosen": -236.43589782714844,
      "logps/rejected": -251.51925659179688,
      "loss": 0.2451,
      "rewards/chosen": 1.116813063621521,
      "rewards/margins": 4.605891704559326,
      "rewards/rejected": -3.4890785217285156,
      "step": 3439
    },
    {
      "epoch": 0.9,
      "grad_norm": 37.91105270385742,
      "kl": 0.0,
      "learning_rate": 4.985605862339701e-08,
      "logps/chosen": -191.96932983398438,
      "logps/rejected": -254.31973266601562,
      "loss": 0.2672,
      "rewards/chosen": 1.3671517372131348,
      "rewards/margins": 3.7806217670440674,
      "rewards/rejected": -2.4134700298309326,
      "step": 3440
    },
    {
      "epoch": 0.9,
      "grad_norm": 24.377700805664062,
      "kl": 0.0,
      "learning_rate": 4.9725202826485214e-08,
      "logps/chosen": -262.665283203125,
      "logps/rejected": -226.87684631347656,
      "loss": 0.2126,
      "rewards/chosen": 2.6146717071533203,
      "rewards/margins": 5.3994903564453125,
      "rewards/rejected": -2.784818410873413,
      "step": 3441
    },
    {
      "epoch": 0.9,
      "grad_norm": 33.40414810180664,
      "kl": 0.0,
      "learning_rate": 4.959434702957341e-08,
      "logps/chosen": -248.6764678955078,
      "logps/rejected": -247.5901641845703,
      "loss": 0.2668,
      "rewards/chosen": 1.8479644060134888,
      "rewards/margins": 5.598476886749268,
      "rewards/rejected": -3.7505125999450684,
      "step": 3442
    },
    {
      "epoch": 0.9,
      "grad_norm": 28.74833869934082,
      "kl": 0.0,
      "learning_rate": 4.94634912326616e-08,
      "logps/chosen": -209.05368041992188,
      "logps/rejected": -178.4486083984375,
      "loss": 0.287,
      "rewards/chosen": 0.11143312603235245,
      "rewards/margins": 3.713197708129883,
      "rewards/rejected": -3.601764678955078,
      "step": 3443
    },
    {
      "epoch": 0.9,
      "grad_norm": 34.230064392089844,
      "kl": 0.0,
      "learning_rate": 4.93326354357498e-08,
      "logps/chosen": -129.30055236816406,
      "logps/rejected": -276.4996643066406,
      "loss": 0.2192,
      "rewards/chosen": 2.003117561340332,
      "rewards/margins": 5.9718122482299805,
      "rewards/rejected": -3.9686944484710693,
      "step": 3444
    },
    {
      "epoch": 0.9,
      "grad_norm": 27.552433013916016,
      "kl": 0.0,
      "learning_rate": 4.9201779638838e-08,
      "logps/chosen": -198.81936645507812,
      "logps/rejected": -295.6492004394531,
      "loss": 0.2223,
      "rewards/chosen": 0.5861003994941711,
      "rewards/margins": 5.239863395690918,
      "rewards/rejected": -4.6537628173828125,
      "step": 3445
    },
    {
      "epoch": 0.9,
      "grad_norm": 22.962255477905273,
      "kl": 0.0,
      "learning_rate": 4.9070923841926194e-08,
      "logps/chosen": -241.2476348876953,
      "logps/rejected": -184.06736755371094,
      "loss": 0.1843,
      "rewards/chosen": 0.3511374294757843,
      "rewards/margins": 3.9717328548431396,
      "rewards/rejected": -3.6205954551696777,
      "step": 3446
    },
    {
      "epoch": 0.9,
      "grad_norm": 35.427024841308594,
      "kl": 0.0,
      "learning_rate": 4.894006804501439e-08,
      "logps/chosen": -175.5431671142578,
      "logps/rejected": -299.06829833984375,
      "loss": 0.1853,
      "rewards/chosen": 1.4518764019012451,
      "rewards/margins": 4.794173240661621,
      "rewards/rejected": -3.342296838760376,
      "step": 3447
    },
    {
      "epoch": 0.9,
      "grad_norm": 32.377662658691406,
      "kl": 0.0,
      "learning_rate": 4.880921224810259e-08,
      "logps/chosen": -258.3872985839844,
      "logps/rejected": -223.69058227539062,
      "loss": 0.2336,
      "rewards/chosen": 1.3112188577651978,
      "rewards/margins": 5.9051337242126465,
      "rewards/rejected": -4.593914985656738,
      "step": 3448
    },
    {
      "epoch": 0.9,
      "grad_norm": 31.69407844543457,
      "kl": 0.0,
      "learning_rate": 4.867835645119078e-08,
      "logps/chosen": -195.5200958251953,
      "logps/rejected": -281.95965576171875,
      "loss": 0.2164,
      "rewards/chosen": -0.44065719842910767,
      "rewards/margins": 2.2437925338745117,
      "rewards/rejected": -2.6844496726989746,
      "step": 3449
    },
    {
      "epoch": 0.9,
      "grad_norm": 36.76367950439453,
      "kl": 0.0,
      "learning_rate": 4.8547500654278985e-08,
      "logps/chosen": -146.5417022705078,
      "logps/rejected": -271.20166015625,
      "loss": 0.2169,
      "rewards/chosen": 1.526592493057251,
      "rewards/margins": 4.448322296142578,
      "rewards/rejected": -2.9217300415039062,
      "step": 3450
    },
    {
      "epoch": 0.9,
      "grad_norm": 67.0025863647461,
      "kl": 0.0,
      "learning_rate": 4.8416644857367174e-08,
      "logps/chosen": -212.27645874023438,
      "logps/rejected": -219.5655059814453,
      "loss": 0.2246,
      "rewards/chosen": 0.9446682333946228,
      "rewards/margins": 4.536525726318359,
      "rewards/rejected": -3.591857433319092,
      "step": 3451
    },
    {
      "epoch": 0.9,
      "grad_norm": 37.800968170166016,
      "kl": 0.0,
      "learning_rate": 4.828578906045538e-08,
      "logps/chosen": -203.21331787109375,
      "logps/rejected": -186.6897735595703,
      "loss": 0.2268,
      "rewards/chosen": 2.3236589431762695,
      "rewards/margins": 4.436846733093262,
      "rewards/rejected": -2.113187551498413,
      "step": 3452
    },
    {
      "epoch": 0.9,
      "grad_norm": 29.42912483215332,
      "kl": 0.0,
      "learning_rate": 4.815493326354357e-08,
      "logps/chosen": -216.91082763671875,
      "logps/rejected": -302.6053161621094,
      "loss": 0.1218,
      "rewards/chosen": 3.6041765213012695,
      "rewards/margins": 7.994859218597412,
      "rewards/rejected": -4.390682697296143,
      "step": 3453
    },
    {
      "epoch": 0.9,
      "grad_norm": 43.55042266845703,
      "kl": 0.0,
      "learning_rate": 4.802407746663177e-08,
      "logps/chosen": -172.40695190429688,
      "logps/rejected": -174.08834838867188,
      "loss": 0.3223,
      "rewards/chosen": 0.5470663905143738,
      "rewards/margins": 2.5851056575775146,
      "rewards/rejected": -2.038039207458496,
      "step": 3454
    },
    {
      "epoch": 0.9,
      "grad_norm": 25.590280532836914,
      "kl": 0.0,
      "learning_rate": 4.7893221669719965e-08,
      "logps/chosen": -184.07814025878906,
      "logps/rejected": -215.82809448242188,
      "loss": 0.1457,
      "rewards/chosen": 2.0870168209075928,
      "rewards/margins": 8.137537956237793,
      "rewards/rejected": -6.050520896911621,
      "step": 3455
    },
    {
      "epoch": 0.9,
      "grad_norm": 30.191423416137695,
      "kl": 0.0,
      "learning_rate": 4.776236587280817e-08,
      "logps/chosen": -205.03875732421875,
      "logps/rejected": -246.31654357910156,
      "loss": 0.2024,
      "rewards/chosen": 1.6703300476074219,
      "rewards/margins": 4.7895050048828125,
      "rewards/rejected": -3.1191751956939697,
      "step": 3456
    },
    {
      "epoch": 0.9,
      "grad_norm": 35.2052001953125,
      "kl": 0.0,
      "learning_rate": 4.763151007589636e-08,
      "logps/chosen": -239.1469268798828,
      "logps/rejected": -204.1580047607422,
      "loss": 0.2853,
      "rewards/chosen": 1.1633193492889404,
      "rewards/margins": 3.4346001148223877,
      "rewards/rejected": -2.2712807655334473,
      "step": 3457
    },
    {
      "epoch": 0.9,
      "grad_norm": 40.309967041015625,
      "kl": 0.0,
      "learning_rate": 4.750065427898456e-08,
      "logps/chosen": -141.7136688232422,
      "logps/rejected": -249.14077758789062,
      "loss": 0.2115,
      "rewards/chosen": 1.0800200700759888,
      "rewards/margins": 4.454376697540283,
      "rewards/rejected": -3.374356746673584,
      "step": 3458
    },
    {
      "epoch": 0.91,
      "grad_norm": 25.846149444580078,
      "kl": 0.0,
      "learning_rate": 4.736979848207275e-08,
      "logps/chosen": -199.12954711914062,
      "logps/rejected": -300.17291259765625,
      "loss": 0.1687,
      "rewards/chosen": 0.7269772887229919,
      "rewards/margins": 5.699728012084961,
      "rewards/rejected": -4.972750663757324,
      "step": 3459
    },
    {
      "epoch": 0.91,
      "grad_norm": 30.018512725830078,
      "kl": 0.0,
      "learning_rate": 4.723894268516095e-08,
      "logps/chosen": -225.64785766601562,
      "logps/rejected": -230.19851684570312,
      "loss": 0.2012,
      "rewards/chosen": 2.279531478881836,
      "rewards/margins": 6.719575881958008,
      "rewards/rejected": -4.440044403076172,
      "step": 3460
    },
    {
      "epoch": 0.91,
      "grad_norm": 33.65125274658203,
      "kl": 0.0,
      "learning_rate": 4.710808688824915e-08,
      "logps/chosen": -246.58177185058594,
      "logps/rejected": -316.8072509765625,
      "loss": 0.2009,
      "rewards/chosen": 2.155870199203491,
      "rewards/margins": 6.938811302185059,
      "rewards/rejected": -4.782940864562988,
      "step": 3461
    },
    {
      "epoch": 0.91,
      "grad_norm": 31.399215698242188,
      "kl": 0.0,
      "learning_rate": 4.6977231091337345e-08,
      "logps/chosen": -198.928955078125,
      "logps/rejected": -194.60580444335938,
      "loss": 0.2615,
      "rewards/chosen": 0.9209333658218384,
      "rewards/margins": 4.618176460266113,
      "rewards/rejected": -3.6972432136535645,
      "step": 3462
    },
    {
      "epoch": 0.91,
      "grad_norm": 27.609634399414062,
      "kl": 0.0,
      "learning_rate": 4.684637529442554e-08,
      "logps/chosen": -269.3661193847656,
      "logps/rejected": -219.11439514160156,
      "loss": 0.2958,
      "rewards/chosen": -0.05110529065132141,
      "rewards/margins": 3.0176358222961426,
      "rewards/rejected": -3.0687410831451416,
      "step": 3463
    },
    {
      "epoch": 0.91,
      "grad_norm": 33.409671783447266,
      "kl": 0.0,
      "learning_rate": 4.6715519497513743e-08,
      "logps/chosen": -162.5331268310547,
      "logps/rejected": -324.6145324707031,
      "loss": 0.2649,
      "rewards/chosen": 0.1856052577495575,
      "rewards/margins": 4.66796875,
      "rewards/rejected": -4.482363700866699,
      "step": 3464
    },
    {
      "epoch": 0.91,
      "grad_norm": 30.648984909057617,
      "kl": 0.0,
      "learning_rate": 4.658466370060193e-08,
      "logps/chosen": -200.22683715820312,
      "logps/rejected": -353.6416931152344,
      "loss": 0.2502,
      "rewards/chosen": 2.1871938705444336,
      "rewards/margins": 7.84035587310791,
      "rewards/rejected": -5.653162002563477,
      "step": 3465
    },
    {
      "epoch": 0.91,
      "grad_norm": 37.56059265136719,
      "kl": 0.0,
      "learning_rate": 4.6453807903690136e-08,
      "logps/chosen": -225.29098510742188,
      "logps/rejected": -352.63604736328125,
      "loss": 0.1575,
      "rewards/chosen": 0.413031667470932,
      "rewards/margins": 7.242162227630615,
      "rewards/rejected": -6.82913064956665,
      "step": 3466
    },
    {
      "epoch": 0.91,
      "grad_norm": 30.30290412902832,
      "kl": 0.0,
      "learning_rate": 4.6322952106778325e-08,
      "logps/chosen": -267.28814697265625,
      "logps/rejected": -205.54666137695312,
      "loss": 0.1733,
      "rewards/chosen": 1.5149755477905273,
      "rewards/margins": 3.9995875358581543,
      "rewards/rejected": -2.484611988067627,
      "step": 3467
    },
    {
      "epoch": 0.91,
      "grad_norm": 51.21653747558594,
      "kl": 0.0,
      "learning_rate": 4.619209630986653e-08,
      "logps/chosen": -284.357177734375,
      "logps/rejected": -295.5521240234375,
      "loss": 0.3575,
      "rewards/chosen": 1.1439176797866821,
      "rewards/margins": 3.0836286544799805,
      "rewards/rejected": -1.9397108554840088,
      "step": 3468
    },
    {
      "epoch": 0.91,
      "grad_norm": 40.627769470214844,
      "kl": 0.0,
      "learning_rate": 4.6061240512954724e-08,
      "logps/chosen": -257.2642822265625,
      "logps/rejected": -317.79730224609375,
      "loss": 0.2883,
      "rewards/chosen": 0.018114745616912842,
      "rewards/margins": 4.1845903396606445,
      "rewards/rejected": -4.166475772857666,
      "step": 3469
    },
    {
      "epoch": 0.91,
      "grad_norm": 32.87179183959961,
      "kl": 0.0,
      "learning_rate": 4.593038471604292e-08,
      "logps/chosen": -190.40333557128906,
      "logps/rejected": -224.46026611328125,
      "loss": 0.355,
      "rewards/chosen": 0.24565494060516357,
      "rewards/margins": 4.230736255645752,
      "rewards/rejected": -3.985081434249878,
      "step": 3470
    },
    {
      "epoch": 0.91,
      "grad_norm": 35.715606689453125,
      "kl": 0.0,
      "learning_rate": 4.5799528919131116e-08,
      "logps/chosen": -275.18035888671875,
      "logps/rejected": -206.77891540527344,
      "loss": 0.208,
      "rewards/chosen": 3.2653937339782715,
      "rewards/margins": 5.532741546630859,
      "rewards/rejected": -2.267347812652588,
      "step": 3471
    },
    {
      "epoch": 0.91,
      "grad_norm": 41.31447219848633,
      "kl": 0.0,
      "learning_rate": 4.5668673122219306e-08,
      "logps/chosen": -237.96063232421875,
      "logps/rejected": -322.6827392578125,
      "loss": 0.2782,
      "rewards/chosen": 1.0975438356399536,
      "rewards/margins": 4.81644868850708,
      "rewards/rejected": -3.718904972076416,
      "step": 3472
    },
    {
      "epoch": 0.91,
      "grad_norm": 39.48509979248047,
      "kl": 0.0,
      "learning_rate": 4.553781732530751e-08,
      "logps/chosen": -222.59228515625,
      "logps/rejected": -259.65008544921875,
      "loss": 0.342,
      "rewards/chosen": 1.1126902103424072,
      "rewards/margins": 5.725687026977539,
      "rewards/rejected": -4.612997055053711,
      "step": 3473
    },
    {
      "epoch": 0.91,
      "grad_norm": 28.43281364440918,
      "kl": 0.0,
      "learning_rate": 4.5406961528395704e-08,
      "logps/chosen": -238.9607391357422,
      "logps/rejected": -218.1140594482422,
      "loss": 0.1194,
      "rewards/chosen": 3.4507126808166504,
      "rewards/margins": 6.578547477722168,
      "rewards/rejected": -3.1278350353240967,
      "step": 3474
    },
    {
      "epoch": 0.91,
      "grad_norm": 27.3100528717041,
      "kl": 0.0,
      "learning_rate": 4.52761057314839e-08,
      "logps/chosen": -157.7848358154297,
      "logps/rejected": -222.90573120117188,
      "loss": 0.2354,
      "rewards/chosen": 0.6552536487579346,
      "rewards/margins": 3.9927327632904053,
      "rewards/rejected": -3.3374791145324707,
      "step": 3475
    },
    {
      "epoch": 0.91,
      "grad_norm": 35.11143112182617,
      "kl": 0.0,
      "learning_rate": 4.5145249934572097e-08,
      "logps/chosen": -230.7719268798828,
      "logps/rejected": -308.68560791015625,
      "loss": 0.2438,
      "rewards/chosen": 0.8573986291885376,
      "rewards/margins": 4.772648334503174,
      "rewards/rejected": -3.915249824523926,
      "step": 3476
    },
    {
      "epoch": 0.91,
      "grad_norm": 28.025171279907227,
      "kl": 0.0,
      "learning_rate": 4.50143941376603e-08,
      "logps/chosen": -155.75784301757812,
      "logps/rejected": -286.5352478027344,
      "loss": 0.1556,
      "rewards/chosen": 2.2150192260742188,
      "rewards/margins": 6.869057655334473,
      "rewards/rejected": -4.654038429260254,
      "step": 3477
    },
    {
      "epoch": 0.91,
      "grad_norm": 36.9254264831543,
      "kl": 0.0,
      "learning_rate": 4.488353834074849e-08,
      "logps/chosen": -175.12673950195312,
      "logps/rejected": -235.85813903808594,
      "loss": 0.2853,
      "rewards/chosen": -0.033377885818481445,
      "rewards/margins": 3.532153606414795,
      "rewards/rejected": -3.5655314922332764,
      "step": 3478
    },
    {
      "epoch": 0.91,
      "grad_norm": 34.92323684692383,
      "kl": 0.0,
      "learning_rate": 4.475268254383669e-08,
      "logps/chosen": -174.413818359375,
      "logps/rejected": -258.3853454589844,
      "loss": 0.2737,
      "rewards/chosen": 0.14707216620445251,
      "rewards/margins": 4.230539321899414,
      "rewards/rejected": -4.08346700668335,
      "step": 3479
    },
    {
      "epoch": 0.91,
      "grad_norm": 31.240678787231445,
      "kl": 0.0,
      "learning_rate": 4.462182674692489e-08,
      "logps/chosen": -156.48321533203125,
      "logps/rejected": -216.8871612548828,
      "loss": 0.2726,
      "rewards/chosen": 0.8429794311523438,
      "rewards/margins": 4.134203910827637,
      "rewards/rejected": -3.291224479675293,
      "step": 3480
    },
    {
      "epoch": 0.91,
      "grad_norm": 35.4827880859375,
      "kl": 0.0,
      "learning_rate": 4.4490970950013084e-08,
      "logps/chosen": -247.5883026123047,
      "logps/rejected": -203.5853271484375,
      "loss": 0.209,
      "rewards/chosen": 1.6337577104568481,
      "rewards/margins": 4.678426742553711,
      "rewards/rejected": -3.0446689128875732,
      "step": 3481
    },
    {
      "epoch": 0.91,
      "grad_norm": 30.35540199279785,
      "kl": 0.0,
      "learning_rate": 4.436011515310128e-08,
      "logps/chosen": -260.1644592285156,
      "logps/rejected": -248.724609375,
      "loss": 0.1345,
      "rewards/chosen": 3.037034273147583,
      "rewards/margins": 6.655192852020264,
      "rewards/rejected": -3.6181585788726807,
      "step": 3482
    },
    {
      "epoch": 0.91,
      "grad_norm": 42.778465270996094,
      "kl": 0.0,
      "learning_rate": 4.4229259356189476e-08,
      "logps/chosen": -260.2196044921875,
      "logps/rejected": -259.03521728515625,
      "loss": 0.2166,
      "rewards/chosen": 3.436540126800537,
      "rewards/margins": 4.734576225280762,
      "rewards/rejected": -1.2980358600616455,
      "step": 3483
    },
    {
      "epoch": 0.91,
      "grad_norm": 34.71338653564453,
      "kl": 0.0,
      "learning_rate": 4.409840355927767e-08,
      "logps/chosen": -247.28744506835938,
      "logps/rejected": -287.5233459472656,
      "loss": 0.2157,
      "rewards/chosen": 0.4272666573524475,
      "rewards/margins": 5.451603412628174,
      "rewards/rejected": -5.024336814880371,
      "step": 3484
    },
    {
      "epoch": 0.91,
      "grad_norm": 37.13381576538086,
      "kl": 0.0,
      "learning_rate": 4.3967547762365875e-08,
      "logps/chosen": -227.41146850585938,
      "logps/rejected": -302.9892578125,
      "loss": 0.2826,
      "rewards/chosen": 1.3567723035812378,
      "rewards/margins": 6.252913951873779,
      "rewards/rejected": -4.896141529083252,
      "step": 3485
    },
    {
      "epoch": 0.91,
      "grad_norm": 35.91691589355469,
      "kl": 0.0,
      "learning_rate": 4.3836691965454064e-08,
      "logps/chosen": -266.44287109375,
      "logps/rejected": -257.7109069824219,
      "loss": 0.2031,
      "rewards/chosen": 0.6967478394508362,
      "rewards/margins": 5.014280319213867,
      "rewards/rejected": -4.317532539367676,
      "step": 3486
    },
    {
      "epoch": 0.91,
      "grad_norm": 39.01267623901367,
      "kl": 0.0,
      "learning_rate": 4.370583616854227e-08,
      "logps/chosen": -219.94180297851562,
      "logps/rejected": -304.16461181640625,
      "loss": 0.2245,
      "rewards/chosen": 1.0676984786987305,
      "rewards/margins": 4.392019271850586,
      "rewards/rejected": -3.3243210315704346,
      "step": 3487
    },
    {
      "epoch": 0.91,
      "grad_norm": 36.86568832397461,
      "kl": 0.0,
      "learning_rate": 4.357498037163046e-08,
      "logps/chosen": -193.48902893066406,
      "logps/rejected": -194.73497009277344,
      "loss": 0.1368,
      "rewards/chosen": 2.4176418781280518,
      "rewards/margins": 6.041609287261963,
      "rewards/rejected": -3.623967409133911,
      "step": 3488
    },
    {
      "epoch": 0.91,
      "grad_norm": 32.746612548828125,
      "kl": 0.0,
      "learning_rate": 4.344412457471866e-08,
      "logps/chosen": -237.88595581054688,
      "logps/rejected": -221.19143676757812,
      "loss": 0.2488,
      "rewards/chosen": 2.1624321937561035,
      "rewards/margins": 6.177945137023926,
      "rewards/rejected": -4.015512943267822,
      "step": 3489
    },
    {
      "epoch": 0.91,
      "grad_norm": 33.666107177734375,
      "kl": 0.0,
      "learning_rate": 4.3313268777806855e-08,
      "logps/chosen": -215.32931518554688,
      "logps/rejected": -319.3660888671875,
      "loss": 0.2697,
      "rewards/chosen": 0.6944324970245361,
      "rewards/margins": 6.149470329284668,
      "rewards/rejected": -5.455037593841553,
      "step": 3490
    },
    {
      "epoch": 0.91,
      "grad_norm": 32.387325286865234,
      "kl": 0.0,
      "learning_rate": 4.318241298089505e-08,
      "logps/chosen": -168.0982666015625,
      "logps/rejected": -288.7040100097656,
      "loss": 0.2772,
      "rewards/chosen": 0.8535113334655762,
      "rewards/margins": 4.308161735534668,
      "rewards/rejected": -3.4546501636505127,
      "step": 3491
    },
    {
      "epoch": 0.91,
      "grad_norm": 37.205135345458984,
      "kl": 0.0,
      "learning_rate": 4.305155718398325e-08,
      "logps/chosen": -199.73460388183594,
      "logps/rejected": -291.91644287109375,
      "loss": 0.2238,
      "rewards/chosen": 1.210793137550354,
      "rewards/margins": 3.8120923042297363,
      "rewards/rejected": -2.6012990474700928,
      "step": 3492
    },
    {
      "epoch": 0.91,
      "grad_norm": 27.158855438232422,
      "kl": 0.0,
      "learning_rate": 4.292070138707145e-08,
      "logps/chosen": -92.06619262695312,
      "logps/rejected": -278.1312561035156,
      "loss": 0.2458,
      "rewards/chosen": 0.738717794418335,
      "rewards/margins": 4.438655853271484,
      "rewards/rejected": -3.6999380588531494,
      "step": 3493
    },
    {
      "epoch": 0.91,
      "grad_norm": 37.6988410949707,
      "kl": 0.0,
      "learning_rate": 4.278984559015964e-08,
      "logps/chosen": -102.7883529663086,
      "logps/rejected": -254.30300903320312,
      "loss": 0.2389,
      "rewards/chosen": -0.21390125155448914,
      "rewards/margins": 3.267529249191284,
      "rewards/rejected": -3.4814305305480957,
      "step": 3494
    },
    {
      "epoch": 0.91,
      "grad_norm": 44.969459533691406,
      "kl": 0.0,
      "learning_rate": 4.265898979324784e-08,
      "logps/chosen": -165.9117431640625,
      "logps/rejected": -234.60121154785156,
      "loss": 0.2802,
      "rewards/chosen": 0.9268661141395569,
      "rewards/margins": 3.578329086303711,
      "rewards/rejected": -2.651463031768799,
      "step": 3495
    },
    {
      "epoch": 0.91,
      "grad_norm": 29.521686553955078,
      "kl": 0.0,
      "learning_rate": 4.252813399633604e-08,
      "logps/chosen": -256.52166748046875,
      "logps/rejected": -304.9439697265625,
      "loss": 0.176,
      "rewards/chosen": 2.3784642219543457,
      "rewards/margins": 7.135860443115234,
      "rewards/rejected": -4.757396221160889,
      "step": 3496
    },
    {
      "epoch": 0.92,
      "grad_norm": 36.39180374145508,
      "kl": 0.0,
      "learning_rate": 4.2397278199424234e-08,
      "logps/chosen": -263.2144775390625,
      "logps/rejected": -237.9430389404297,
      "loss": 0.1611,
      "rewards/chosen": 1.8299251794815063,
      "rewards/margins": 4.7226996421813965,
      "rewards/rejected": -2.8927743434906006,
      "step": 3497
    },
    {
      "epoch": 0.92,
      "grad_norm": 28.184282302856445,
      "kl": 0.0,
      "learning_rate": 4.226642240251243e-08,
      "logps/chosen": -142.09413146972656,
      "logps/rejected": -358.89691162109375,
      "loss": 0.2323,
      "rewards/chosen": 0.06838560104370117,
      "rewards/margins": 6.167913913726807,
      "rewards/rejected": -6.0995283126831055,
      "step": 3498
    },
    {
      "epoch": 0.92,
      "grad_norm": 32.088775634765625,
      "kl": 0.0,
      "learning_rate": 4.2135566605600627e-08,
      "logps/chosen": -238.5006561279297,
      "logps/rejected": -242.0651397705078,
      "loss": 0.2115,
      "rewards/chosen": 1.0346657037734985,
      "rewards/margins": 6.6157379150390625,
      "rewards/rejected": -5.5810723304748535,
      "step": 3499
    },
    {
      "epoch": 0.92,
      "grad_norm": 33.76140594482422,
      "kl": 0.0,
      "learning_rate": 4.200471080868882e-08,
      "logps/chosen": -218.50735473632812,
      "logps/rejected": -223.3050994873047,
      "loss": 0.2968,
      "rewards/chosen": 0.24274462461471558,
      "rewards/margins": 2.7823526859283447,
      "rewards/rejected": -2.5396080017089844,
      "step": 3500
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 3821,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}