grpo-llama-3-1-8b-math-ep3-2102 / trainer_state.json
nghind's picture
Model save
6dcc196 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2490,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 163.590625,
"epoch": 0.020080321285140562,
"grad_norm": 0.252015620470047,
"kl": 0.0002622205880470574,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.46484375,
"reward_std": 0.18930117189884185,
"rewards/acc_reward_func": 0.46484375,
"step": 5
},
{
"completion_length": 179.62578125,
"epoch": 0.040160642570281124,
"grad_norm": 0.16817402839660645,
"kl": 0.00034236229257658126,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.41875,
"reward_std": 0.1731734722852707,
"rewards/acc_reward_func": 0.41875,
"step": 10
},
{
"completion_length": 167.68515625,
"epoch": 0.060240963855421686,
"grad_norm": 0.2476990669965744,
"kl": 0.0003678632027003914,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4109375,
"reward_std": 0.17959889471530915,
"rewards/acc_reward_func": 0.4109375,
"step": 15
},
{
"completion_length": 178.2328125,
"epoch": 0.08032128514056225,
"grad_norm": 0.2003251314163208,
"kl": 0.0003451821394264698,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.44296875,
"reward_std": 0.21875501573085784,
"rewards/acc_reward_func": 0.44296875,
"step": 20
},
{
"completion_length": 172.4640625,
"epoch": 0.10040160642570281,
"grad_norm": 0.18285615742206573,
"kl": 0.0003507056972011924,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.471875,
"reward_std": 0.19192979335784913,
"rewards/acc_reward_func": 0.471875,
"step": 25
},
{
"completion_length": 175.68515625,
"epoch": 0.12048192771084337,
"grad_norm": 0.22499169409275055,
"kl": 0.0003521858772728592,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4078125,
"reward_std": 0.1874557167291641,
"rewards/acc_reward_func": 0.4078125,
"step": 30
},
{
"completion_length": 171.62109375,
"epoch": 0.14056224899598393,
"grad_norm": 0.2037433385848999,
"kl": 0.00033322854433208703,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4203125,
"reward_std": 0.17733501195907592,
"rewards/acc_reward_func": 0.4203125,
"step": 35
},
{
"completion_length": 166.29296875,
"epoch": 0.1606425702811245,
"grad_norm": 0.27188873291015625,
"kl": 0.0003602906537707895,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4328125,
"reward_std": 0.18154324293136598,
"rewards/acc_reward_func": 0.4328125,
"step": 40
},
{
"completion_length": 171.10703125,
"epoch": 0.18072289156626506,
"grad_norm": 0.13274171948432922,
"kl": 0.00031624052789993586,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4453125,
"reward_std": 0.16933046877384186,
"rewards/acc_reward_func": 0.4453125,
"step": 45
},
{
"completion_length": 178.0140625,
"epoch": 0.20080321285140562,
"grad_norm": 0.23663479089736938,
"kl": 0.0003486273228190839,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4046875,
"reward_std": 0.1791737824678421,
"rewards/acc_reward_func": 0.4046875,
"step": 50
},
{
"completion_length": 171.86953125,
"epoch": 0.22088353413654618,
"grad_norm": 0.27000316977500916,
"kl": 0.0003532590402755886,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.45703125,
"reward_std": 0.20447877645492554,
"rewards/acc_reward_func": 0.45703125,
"step": 55
},
{
"completion_length": 170.43046875,
"epoch": 0.24096385542168675,
"grad_norm": 0.19454790651798248,
"kl": 0.0003477427875623107,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4359375,
"reward_std": 0.20944839119911193,
"rewards/acc_reward_func": 0.4359375,
"step": 60
},
{
"completion_length": 178.028125,
"epoch": 0.26104417670682734,
"grad_norm": 0.163658007979393,
"kl": 0.00040721939294599,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4609375,
"reward_std": 0.19395649433135986,
"rewards/acc_reward_func": 0.4609375,
"step": 65
},
{
"completion_length": 176.78203125,
"epoch": 0.28112449799196787,
"grad_norm": 0.1931592971086502,
"kl": 0.0003734047233592719,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.378125,
"reward_std": 0.20313633978366852,
"rewards/acc_reward_func": 0.378125,
"step": 70
},
{
"completion_length": 170.04140625,
"epoch": 0.30120481927710846,
"grad_norm": 0.3031991422176361,
"kl": 0.0003914170432835817,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.459375,
"reward_std": 0.18856578767299653,
"rewards/acc_reward_func": 0.459375,
"step": 75
},
{
"completion_length": 171.06953125,
"epoch": 0.321285140562249,
"grad_norm": 0.1705673485994339,
"kl": 0.000373684469377622,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4125,
"reward_std": 0.15384772717952727,
"rewards/acc_reward_func": 0.4125,
"step": 80
},
{
"completion_length": 171.71484375,
"epoch": 0.3413654618473896,
"grad_norm": 0.22563545405864716,
"kl": 0.00038995217182673513,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.48125,
"reward_std": 0.19845107197761536,
"rewards/acc_reward_func": 0.48125,
"step": 85
},
{
"completion_length": 168.41953125,
"epoch": 0.3614457831325301,
"grad_norm": 0.21966855227947235,
"kl": 0.00036684817168861625,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.44453125,
"reward_std": 0.1892715275287628,
"rewards/acc_reward_func": 0.44453125,
"step": 90
},
{
"completion_length": 178.41875,
"epoch": 0.3815261044176707,
"grad_norm": 0.17476530373096466,
"kl": 0.00037805224419571457,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.41484375,
"reward_std": 0.18746339976787568,
"rewards/acc_reward_func": 0.41484375,
"step": 95
},
{
"completion_length": 165.01640625,
"epoch": 0.40160642570281124,
"grad_norm": 0.198855459690094,
"kl": 0.0004076789598912001,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.42734375,
"reward_std": 0.15926295667886733,
"rewards/acc_reward_func": 0.42734375,
"step": 100
},
{
"completion_length": 178.39453125,
"epoch": 0.42168674698795183,
"grad_norm": 0.2242916375398636,
"kl": 0.00040195270557887854,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.41796875,
"reward_std": 0.20668453574180604,
"rewards/acc_reward_func": 0.41796875,
"step": 105
},
{
"completion_length": 167.00078125,
"epoch": 0.44176706827309237,
"grad_norm": 0.19378553330898285,
"kl": 0.0004166673868894577,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.42109375,
"reward_std": 0.1755434274673462,
"rewards/acc_reward_func": 0.42109375,
"step": 110
},
{
"completion_length": 172.29609375,
"epoch": 0.46184738955823296,
"grad_norm": 0.17894329130649567,
"kl": 0.0004249607736710459,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.48828125,
"reward_std": 0.17969622015953063,
"rewards/acc_reward_func": 0.48828125,
"step": 115
},
{
"completion_length": 169.82421875,
"epoch": 0.4819277108433735,
"grad_norm": 0.15234076976776123,
"kl": 0.000443508883472532,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.41640625,
"reward_std": 0.1677071064710617,
"rewards/acc_reward_func": 0.41640625,
"step": 120
},
{
"completion_length": 174.04765625,
"epoch": 0.5020080321285141,
"grad_norm": 0.28823086619377136,
"kl": 0.0004540980560705066,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.43828125,
"reward_std": 0.2034762591123581,
"rewards/acc_reward_func": 0.43828125,
"step": 125
},
{
"completion_length": 167.84609375,
"epoch": 0.5220883534136547,
"grad_norm": 0.21105672419071198,
"kl": 0.00046077867737039926,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.45703125,
"reward_std": 0.19590035378932952,
"rewards/acc_reward_func": 0.45703125,
"step": 130
},
{
"completion_length": 170.73828125,
"epoch": 0.5421686746987951,
"grad_norm": 0.23586468398571014,
"kl": 0.000467709539225325,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.44609375,
"reward_std": 0.1708876222372055,
"rewards/acc_reward_func": 0.44609375,
"step": 135
},
{
"completion_length": 167.87734375,
"epoch": 0.5622489959839357,
"grad_norm": 0.15362346172332764,
"kl": 0.000496411306085065,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.4328125,
"reward_std": 0.15960543006658554,
"rewards/acc_reward_func": 0.4328125,
"step": 140
},
{
"completion_length": 170.35234375,
"epoch": 0.5823293172690763,
"grad_norm": 0.18066054582595825,
"kl": 0.0004964877618476749,
"learning_rate": 5e-06,
"loss": 0.0,
"reward": 0.45546875,
"reward_std": 0.16113038659095763,
"rewards/acc_reward_func": 0.45546875,
"step": 145
},
{
"completion_length": 164.85625,
"epoch": 0.6024096385542169,
"grad_norm": 0.47294002771377563,
"kl": 0.000543906888924539,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.43671875,
"reward_std": 0.1639706775546074,
"rewards/acc_reward_func": 0.43671875,
"step": 150
},
{
"completion_length": 162.93046875,
"epoch": 0.6224899598393574,
"grad_norm": 0.2425825446844101,
"kl": 0.0005227615125477314,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4171875,
"reward_std": 0.1566603273153305,
"rewards/acc_reward_func": 0.4171875,
"step": 155
},
{
"completion_length": 172.47109375,
"epoch": 0.642570281124498,
"grad_norm": 0.2564389407634735,
"kl": 0.0005757474922575056,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.46171875,
"reward_std": 0.1736527532339096,
"rewards/acc_reward_func": 0.46171875,
"step": 160
},
{
"completion_length": 165.5296875,
"epoch": 0.6626506024096386,
"grad_norm": 0.195328027009964,
"kl": 0.0005334455403499305,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.5109375,
"reward_std": 0.16494325399398804,
"rewards/acc_reward_func": 0.5109375,
"step": 165
},
{
"completion_length": 176.8578125,
"epoch": 0.6827309236947792,
"grad_norm": 0.20268158614635468,
"kl": 0.0005439485888928175,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.39609375,
"reward_std": 0.20100373923778533,
"rewards/acc_reward_func": 0.39609375,
"step": 170
},
{
"completion_length": 169.26640625,
"epoch": 0.7028112449799196,
"grad_norm": 0.29971057176589966,
"kl": 0.0005218727746978402,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4265625,
"reward_std": 0.16231610029935836,
"rewards/acc_reward_func": 0.4265625,
"step": 175
},
{
"completion_length": 173.6984375,
"epoch": 0.7228915662650602,
"grad_norm": 0.224137544631958,
"kl": 0.0005710305646061897,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.42890625,
"reward_std": 0.18908920288085937,
"rewards/acc_reward_func": 0.42890625,
"step": 180
},
{
"completion_length": 170.2984375,
"epoch": 0.7429718875502008,
"grad_norm": 0.20224140584468842,
"kl": 0.0005392800841946155,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4265625,
"reward_std": 0.17404890954494476,
"rewards/acc_reward_func": 0.4265625,
"step": 185
},
{
"completion_length": 164.5890625,
"epoch": 0.7630522088353414,
"grad_norm": 0.15808193385601044,
"kl": 0.000536753749474883,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.478125,
"reward_std": 0.17583222985267638,
"rewards/acc_reward_func": 0.478125,
"step": 190
},
{
"completion_length": 167.2265625,
"epoch": 0.7831325301204819,
"grad_norm": 0.15209996700286865,
"kl": 0.0005800858489237726,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4234375,
"reward_std": 0.17092030942440034,
"rewards/acc_reward_func": 0.4234375,
"step": 195
},
{
"completion_length": 167.82265625,
"epoch": 0.8032128514056225,
"grad_norm": 0.1717645525932312,
"kl": 0.0005622614640742541,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.428125,
"reward_std": 0.18004470467567443,
"rewards/acc_reward_func": 0.428125,
"step": 200
},
{
"completion_length": 162.8765625,
"epoch": 0.8232931726907631,
"grad_norm": 0.23908711969852448,
"kl": 0.0006255221436731517,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.446875,
"reward_std": 0.16946747601032258,
"rewards/acc_reward_func": 0.446875,
"step": 205
},
{
"completion_length": 176.16484375,
"epoch": 0.8433734939759037,
"grad_norm": 0.20071397721767426,
"kl": 0.0006495082518085838,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4671875,
"reward_std": 0.18251356184482576,
"rewards/acc_reward_func": 0.4671875,
"step": 210
},
{
"completion_length": 166.8421875,
"epoch": 0.8634538152610441,
"grad_norm": 0.3243388235569,
"kl": 0.0007322286954149603,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4625,
"reward_std": 0.20768478214740754,
"rewards/acc_reward_func": 0.4625,
"step": 215
},
{
"completion_length": 163.52578125,
"epoch": 0.8835341365461847,
"grad_norm": 0.29947733879089355,
"kl": 0.0006525587290525436,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41640625,
"reward_std": 0.19122307002544403,
"rewards/acc_reward_func": 0.41640625,
"step": 220
},
{
"completion_length": 170.865625,
"epoch": 0.9036144578313253,
"grad_norm": 0.1929980367422104,
"kl": 0.0007917622802779079,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.3875,
"reward_std": 0.18488225042819978,
"rewards/acc_reward_func": 0.3875,
"step": 225
},
{
"completion_length": 175.0296875,
"epoch": 0.9236947791164659,
"grad_norm": 0.19861873984336853,
"kl": 0.0008189699263311922,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.425,
"reward_std": 0.21300033628940582,
"rewards/acc_reward_func": 0.425,
"step": 230
},
{
"completion_length": 170.084375,
"epoch": 0.9437751004016064,
"grad_norm": 0.17323565483093262,
"kl": 0.0009274777257815003,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.459375,
"reward_std": 0.17804046273231505,
"rewards/acc_reward_func": 0.459375,
"step": 235
},
{
"completion_length": 175.76015625,
"epoch": 0.963855421686747,
"grad_norm": 0.1776685118675232,
"kl": 0.0008491224725730717,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.446875,
"reward_std": 0.1788846880197525,
"rewards/acc_reward_func": 0.446875,
"step": 240
},
{
"completion_length": 162.76171875,
"epoch": 0.9839357429718876,
"grad_norm": 0.22359345853328705,
"kl": 0.0009510789182968438,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.46015625,
"reward_std": 0.181617134809494,
"rewards/acc_reward_func": 0.46015625,
"step": 245
},
{
"completion_length": 161.99687652587892,
"epoch": 1.0040160642570282,
"grad_norm": 0.5221239328384399,
"kl": 0.0009677842142991721,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.446875,
"reward_std": 0.20202724933624266,
"rewards/acc_reward_func": 0.446875,
"step": 250
},
{
"completion_length": 173.12890625,
"epoch": 1.0240963855421688,
"grad_norm": 0.19141757488250732,
"kl": 0.0008808981510810554,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.47421875,
"reward_std": 0.17435919046401976,
"rewards/acc_reward_func": 0.47421875,
"step": 255
},
{
"completion_length": 157.93828125,
"epoch": 1.0441767068273093,
"grad_norm": 0.2180185467004776,
"kl": 0.0008752723690122366,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4671875,
"reward_std": 0.1644939050078392,
"rewards/acc_reward_func": 0.4671875,
"step": 260
},
{
"completion_length": 164.88984375,
"epoch": 1.0642570281124497,
"grad_norm": 0.18071024119853973,
"kl": 0.0008972461801022291,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.52421875,
"reward_std": 0.15566177368164064,
"rewards/acc_reward_func": 0.52421875,
"step": 265
},
{
"completion_length": 175.95546875,
"epoch": 1.0843373493975903,
"grad_norm": 0.19162052869796753,
"kl": 0.0008950538700446487,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.43828125,
"reward_std": 0.2212550789117813,
"rewards/acc_reward_func": 0.43828125,
"step": 270
},
{
"completion_length": 173.99375,
"epoch": 1.104417670682731,
"grad_norm": 0.16138312220573425,
"kl": 0.0008833881816826761,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.3609375,
"reward_std": 0.19666717052459717,
"rewards/acc_reward_func": 0.3609375,
"step": 275
},
{
"completion_length": 169.71171875,
"epoch": 1.1244979919678715,
"grad_norm": 0.27626168727874756,
"kl": 0.000851949246134609,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.5078125,
"reward_std": 0.19584795236587524,
"rewards/acc_reward_func": 0.5078125,
"step": 280
},
{
"completion_length": 174.3875,
"epoch": 1.144578313253012,
"grad_norm": 0.20062246918678284,
"kl": 0.0007981388131156564,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4890625,
"reward_std": 0.20531708300113677,
"rewards/acc_reward_func": 0.4890625,
"step": 285
},
{
"completion_length": 175.4296875,
"epoch": 1.1646586345381527,
"grad_norm": 0.18304277956485748,
"kl": 0.0009087029262445867,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.45859375,
"reward_std": 0.18924926221370697,
"rewards/acc_reward_func": 0.45859375,
"step": 290
},
{
"completion_length": 175.73046875,
"epoch": 1.1847389558232932,
"grad_norm": 0.18497192859649658,
"kl": 0.000977440387941897,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.44140625,
"reward_std": 0.19798250496387482,
"rewards/acc_reward_func": 0.44140625,
"step": 295
},
{
"completion_length": 165.253125,
"epoch": 1.2048192771084336,
"grad_norm": 0.27729108929634094,
"kl": 0.0012293277774006127,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.46015625,
"reward_std": 0.19403489232063292,
"rewards/acc_reward_func": 0.46015625,
"step": 300
},
{
"completion_length": 157.8125,
"epoch": 1.2248995983935742,
"grad_norm": 0.22676385939121246,
"kl": 0.0011794663034379483,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.48203125,
"reward_std": 0.1866166889667511,
"rewards/acc_reward_func": 0.48203125,
"step": 305
},
{
"completion_length": 177.00390625,
"epoch": 1.2449799196787148,
"grad_norm": 0.21284343302249908,
"kl": 0.001277761277742684,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.48203125,
"reward_std": 0.18564265072345734,
"rewards/acc_reward_func": 0.48203125,
"step": 310
},
{
"completion_length": 169.81875,
"epoch": 1.2650602409638554,
"grad_norm": 0.232464998960495,
"kl": 0.0017772512743249535,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.425,
"reward_std": 0.19517117738723755,
"rewards/acc_reward_func": 0.425,
"step": 315
},
{
"completion_length": 177.75625,
"epoch": 1.285140562248996,
"grad_norm": 0.18870379030704498,
"kl": 0.0013113886117935181,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41640625,
"reward_std": 0.21717941164970397,
"rewards/acc_reward_func": 0.41640625,
"step": 320
},
{
"completion_length": 172.2703125,
"epoch": 1.3052208835341366,
"grad_norm": 0.24346290528774261,
"kl": 0.001179230585694313,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41328125,
"reward_std": 0.17480578124523163,
"rewards/acc_reward_func": 0.41328125,
"step": 325
},
{
"completion_length": 166.24921875,
"epoch": 1.3253012048192772,
"grad_norm": 0.19009321928024292,
"kl": 0.0012709878385066987,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41484375,
"reward_std": 0.20418426394462585,
"rewards/acc_reward_func": 0.41484375,
"step": 330
},
{
"completion_length": 164.46484375,
"epoch": 1.3453815261044177,
"grad_norm": 0.15110790729522705,
"kl": 0.0012169820489361881,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4484375,
"reward_std": 0.16283962428569793,
"rewards/acc_reward_func": 0.4484375,
"step": 335
},
{
"completion_length": 174.02890625,
"epoch": 1.3654618473895583,
"grad_norm": 0.2161194533109665,
"kl": 0.0013411057880148292,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.46796875,
"reward_std": 0.22593857645988463,
"rewards/acc_reward_func": 0.46796875,
"step": 340
},
{
"completion_length": 173.4453125,
"epoch": 1.3855421686746987,
"grad_norm": 0.2230212688446045,
"kl": 0.0016622768715023994,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.484375,
"reward_std": 0.20973570942878722,
"rewards/acc_reward_func": 0.484375,
"step": 345
},
{
"completion_length": 163.484375,
"epoch": 1.4056224899598393,
"grad_norm": 0.20852205157279968,
"kl": 0.0017341260565444828,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.50625,
"reward_std": 0.1828001022338867,
"rewards/acc_reward_func": 0.50625,
"step": 350
},
{
"completion_length": 181.48515625,
"epoch": 1.4257028112449799,
"grad_norm": 0.4246974289417267,
"kl": 0.0015736527508124708,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45390625,
"reward_std": 0.2022945612668991,
"rewards/acc_reward_func": 0.45390625,
"step": 355
},
{
"completion_length": 175.24609375,
"epoch": 1.4457831325301205,
"grad_norm": 0.17853409051895142,
"kl": 0.0011188496835529804,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.47421875,
"reward_std": 0.1851181536912918,
"rewards/acc_reward_func": 0.47421875,
"step": 360
},
{
"completion_length": 164.4390625,
"epoch": 1.465863453815261,
"grad_norm": 0.16713328659534454,
"kl": 0.0012727443594485522,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4796875,
"reward_std": 0.13790302872657775,
"rewards/acc_reward_func": 0.4796875,
"step": 365
},
{
"completion_length": 173.72109375,
"epoch": 1.4859437751004017,
"grad_norm": 0.253159761428833,
"kl": 0.0016238141106441617,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4328125,
"reward_std": 0.19916468858718872,
"rewards/acc_reward_func": 0.4328125,
"step": 370
},
{
"completion_length": 170.5984375,
"epoch": 1.5060240963855422,
"grad_norm": 0.1901482343673706,
"kl": 0.0015483764465898276,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4328125,
"reward_std": 0.18379817008972169,
"rewards/acc_reward_func": 0.4328125,
"step": 375
},
{
"completion_length": 172.55546875,
"epoch": 1.5261044176706826,
"grad_norm": 0.20508316159248352,
"kl": 0.0012389007257297634,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4265625,
"reward_std": 0.19647727012634278,
"rewards/acc_reward_func": 0.4265625,
"step": 380
},
{
"completion_length": 167.434375,
"epoch": 1.5461847389558234,
"grad_norm": 0.2074270248413086,
"kl": 0.0013553853146731853,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.496875,
"reward_std": 0.18230061531066893,
"rewards/acc_reward_func": 0.496875,
"step": 385
},
{
"completion_length": 151.4921875,
"epoch": 1.5662650602409638,
"grad_norm": 0.21685202419757843,
"kl": 0.001522923377342522,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.52421875,
"reward_std": 0.15255896151065826,
"rewards/acc_reward_func": 0.52421875,
"step": 390
},
{
"completion_length": 159.71015625,
"epoch": 1.5863453815261044,
"grad_norm": 0.1718018501996994,
"kl": 0.001694214204326272,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4828125,
"reward_std": 0.1434539884328842,
"rewards/acc_reward_func": 0.4828125,
"step": 395
},
{
"completion_length": 160.0125,
"epoch": 1.606425702811245,
"grad_norm": 0.26318079233169556,
"kl": 0.0012937180465087295,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4578125,
"reward_std": 0.1806471049785614,
"rewards/acc_reward_func": 0.4578125,
"step": 400
},
{
"completion_length": 165.5828125,
"epoch": 1.6265060240963856,
"grad_norm": 0.19343388080596924,
"kl": 0.0014851080253720284,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4515625,
"reward_std": 0.21382013857364654,
"rewards/acc_reward_func": 0.4515625,
"step": 405
},
{
"completion_length": 168.0203125,
"epoch": 1.6465863453815262,
"grad_norm": 0.1713176816701889,
"kl": 0.001350321597419679,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.47109375,
"reward_std": 0.16802487075328826,
"rewards/acc_reward_func": 0.47109375,
"step": 410
},
{
"completion_length": 168.28515625,
"epoch": 1.6666666666666665,
"grad_norm": 0.13149915635585785,
"kl": 0.0013275448000058532,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41171875,
"reward_std": 0.1467423528432846,
"rewards/acc_reward_func": 0.41171875,
"step": 415
},
{
"completion_length": 171.8828125,
"epoch": 1.6867469879518073,
"grad_norm": 0.1553875058889389,
"kl": 0.0013671110384166241,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4296875,
"reward_std": 0.15315754264593123,
"rewards/acc_reward_func": 0.4296875,
"step": 420
},
{
"completion_length": 165.1984375,
"epoch": 1.7068273092369477,
"grad_norm": 0.19266025722026825,
"kl": 0.0015494710067287087,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45,
"reward_std": 0.18632612824440004,
"rewards/acc_reward_func": 0.45,
"step": 425
},
{
"completion_length": 169.65234375,
"epoch": 1.7269076305220885,
"grad_norm": 0.18278367817401886,
"kl": 0.0015408705454319715,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.43671875,
"reward_std": 0.16562672853469848,
"rewards/acc_reward_func": 0.43671875,
"step": 430
},
{
"completion_length": 157.1546875,
"epoch": 1.7469879518072289,
"grad_norm": 0.23319286108016968,
"kl": 0.0016808727523311973,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4796875,
"reward_std": 0.17185940742492675,
"rewards/acc_reward_func": 0.4796875,
"step": 435
},
{
"completion_length": 165.42734375,
"epoch": 1.7670682730923695,
"grad_norm": 0.1231621578335762,
"kl": 0.0015965948114171624,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.44765625,
"reward_std": 0.17927809059619904,
"rewards/acc_reward_func": 0.44765625,
"step": 440
},
{
"completion_length": 180.221875,
"epoch": 1.78714859437751,
"grad_norm": 0.16520430147647858,
"kl": 0.0014637083746492862,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.3703125,
"reward_std": 0.20197454690933228,
"rewards/acc_reward_func": 0.3703125,
"step": 445
},
{
"completion_length": 166.18203125,
"epoch": 1.8072289156626506,
"grad_norm": 0.17448249459266663,
"kl": 0.0019314930541440845,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45859375,
"reward_std": 0.18432761132717132,
"rewards/acc_reward_func": 0.45859375,
"step": 450
},
{
"completion_length": 167.51953125,
"epoch": 1.8273092369477912,
"grad_norm": 0.2385585755109787,
"kl": 0.0016987314447760582,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.46875,
"reward_std": 0.21612717509269713,
"rewards/acc_reward_func": 0.46875,
"step": 455
},
{
"completion_length": 162.62578125,
"epoch": 1.8473895582329316,
"grad_norm": 0.17252178490161896,
"kl": 0.0015415515284985305,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48515625,
"reward_std": 0.18296339362859726,
"rewards/acc_reward_func": 0.48515625,
"step": 460
},
{
"completion_length": 180.59453125,
"epoch": 1.8674698795180724,
"grad_norm": 0.25823068618774414,
"kl": 0.0015800336841493845,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4015625,
"reward_std": 0.1927712768316269,
"rewards/acc_reward_func": 0.4015625,
"step": 465
},
{
"completion_length": 166.84140625,
"epoch": 1.8875502008032128,
"grad_norm": 0.18400608003139496,
"kl": 0.001694285310804844,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.40546875,
"reward_std": 0.17548877298831939,
"rewards/acc_reward_func": 0.40546875,
"step": 470
},
{
"completion_length": 180.5515625,
"epoch": 1.9076305220883534,
"grad_norm": 0.1988365650177002,
"kl": 0.0013879930600523948,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.44375,
"reward_std": 0.2029540091753006,
"rewards/acc_reward_func": 0.44375,
"step": 475
},
{
"completion_length": 165.86484375,
"epoch": 1.927710843373494,
"grad_norm": 0.17131586372852325,
"kl": 0.0012664912967011333,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4671875,
"reward_std": 0.18038126528263093,
"rewards/acc_reward_func": 0.4671875,
"step": 480
},
{
"completion_length": 162.2859375,
"epoch": 1.9477911646586346,
"grad_norm": 0.14193740487098694,
"kl": 0.0013188483193516732,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.44453125,
"reward_std": 0.16213289499282837,
"rewards/acc_reward_func": 0.44453125,
"step": 485
},
{
"completion_length": 162.2859375,
"epoch": 1.9678714859437751,
"grad_norm": 0.20864109694957733,
"kl": 0.0018756768200546502,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45703125,
"reward_std": 0.17869449257850648,
"rewards/acc_reward_func": 0.45703125,
"step": 490
},
{
"completion_length": 164.62421875,
"epoch": 1.9879518072289155,
"grad_norm": 0.18869014084339142,
"kl": 0.0014750010799616576,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.4375,
"reward_std": 0.1776250869035721,
"rewards/acc_reward_func": 0.4375,
"step": 495
},
{
"completion_length": 182.69375,
"epoch": 2.0080321285140563,
"grad_norm": 0.22825610637664795,
"kl": 0.0017788540106266737,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48046875,
"reward_std": 0.19371981024742127,
"rewards/acc_reward_func": 0.48046875,
"step": 500
},
{
"completion_length": 171.9796875,
"epoch": 2.0281124497991967,
"grad_norm": 0.1859317570924759,
"kl": 0.0014007980469614267,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.41875,
"reward_std": 0.18771535754203797,
"rewards/acc_reward_func": 0.41875,
"step": 505
},
{
"completion_length": 167.28984375,
"epoch": 2.0481927710843375,
"grad_norm": 0.13138189911842346,
"kl": 0.001390733919106424,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.3984375,
"reward_std": 0.1619436800479889,
"rewards/acc_reward_func": 0.3984375,
"step": 510
},
{
"completion_length": 170.3484375,
"epoch": 2.068273092369478,
"grad_norm": 0.20640629529953003,
"kl": 0.0016555654583498836,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48125,
"reward_std": 0.2025841474533081,
"rewards/acc_reward_func": 0.48125,
"step": 515
},
{
"completion_length": 171.1609375,
"epoch": 2.0883534136546187,
"grad_norm": 0.1779826134443283,
"kl": 0.0016433863900601863,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.41796875,
"reward_std": 0.18314336836338044,
"rewards/acc_reward_func": 0.41796875,
"step": 520
},
{
"completion_length": 165.709375,
"epoch": 2.108433734939759,
"grad_norm": 0.1792406439781189,
"kl": 0.0015564454719424249,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.43125,
"reward_std": 0.19179919064044954,
"rewards/acc_reward_func": 0.43125,
"step": 525
},
{
"completion_length": 170.3234375,
"epoch": 2.1285140562248994,
"grad_norm": 0.1456846445798874,
"kl": 0.0014390965923666954,
"learning_rate": 5e-06,
"loss": 0.0001,
"reward": 0.459375,
"reward_std": 0.16652237474918366,
"rewards/acc_reward_func": 0.459375,
"step": 530
},
{
"completion_length": 170.99765625,
"epoch": 2.1485943775100402,
"grad_norm": 0.1734231561422348,
"kl": 0.0015704976627603172,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.49765625,
"reward_std": 0.16996922194957734,
"rewards/acc_reward_func": 0.49765625,
"step": 535
},
{
"completion_length": 176.13984375,
"epoch": 2.1686746987951806,
"grad_norm": 0.1697244644165039,
"kl": 0.0017200220609083772,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.378125,
"reward_std": 0.1871478885412216,
"rewards/acc_reward_func": 0.378125,
"step": 540
},
{
"completion_length": 171.1515625,
"epoch": 2.1887550200803214,
"grad_norm": 0.2017068862915039,
"kl": 0.0016784318257123231,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.41875,
"reward_std": 0.14308876693248748,
"rewards/acc_reward_func": 0.41875,
"step": 545
},
{
"completion_length": 166.8390625,
"epoch": 2.208835341365462,
"grad_norm": 0.2384696751832962,
"kl": 0.001702140923589468,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.51015625,
"reward_std": 0.1810736984014511,
"rewards/acc_reward_func": 0.51015625,
"step": 550
},
{
"completion_length": 159.53125,
"epoch": 2.2289156626506026,
"grad_norm": 0.1919238269329071,
"kl": 0.00175718292593956,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4859375,
"reward_std": 0.17135893404483796,
"rewards/acc_reward_func": 0.4859375,
"step": 555
},
{
"completion_length": 170.75,
"epoch": 2.248995983935743,
"grad_norm": 0.1601853221654892,
"kl": 0.002081968728452921,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.43046875,
"reward_std": 0.18811692893505097,
"rewards/acc_reward_func": 0.43046875,
"step": 560
},
{
"completion_length": 167.421875,
"epoch": 2.2690763052208833,
"grad_norm": 0.17960651218891144,
"kl": 0.0016798235708847642,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.44375,
"reward_std": 0.1492922842502594,
"rewards/acc_reward_func": 0.44375,
"step": 565
},
{
"completion_length": 157.996875,
"epoch": 2.289156626506024,
"grad_norm": 0.1791730374097824,
"kl": 0.0017936693038791418,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.503125,
"reward_std": 0.1692562907934189,
"rewards/acc_reward_func": 0.503125,
"step": 570
},
{
"completion_length": 173.921875,
"epoch": 2.3092369477911645,
"grad_norm": 0.23927773535251617,
"kl": 0.0018104223068803548,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.42265625,
"reward_std": 0.2313847303390503,
"rewards/acc_reward_func": 0.42265625,
"step": 575
},
{
"completion_length": 163.70078125,
"epoch": 2.3293172690763053,
"grad_norm": 0.2303539514541626,
"kl": 0.0020187195390462876,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.50625,
"reward_std": 0.16630844473838807,
"rewards/acc_reward_func": 0.50625,
"step": 580
},
{
"completion_length": 170.11171875,
"epoch": 2.3493975903614457,
"grad_norm": 0.18714947998523712,
"kl": 0.001954457885585725,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.43828125,
"reward_std": 0.177463561296463,
"rewards/acc_reward_func": 0.43828125,
"step": 585
},
{
"completion_length": 168.0265625,
"epoch": 2.3694779116465865,
"grad_norm": 0.1414794921875,
"kl": 0.0022247758926823734,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45546875,
"reward_std": 0.17782701998949052,
"rewards/acc_reward_func": 0.45546875,
"step": 590
},
{
"completion_length": 164.63046875,
"epoch": 2.389558232931727,
"grad_norm": 0.19845053553581238,
"kl": 0.0019970756489783525,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.47109375,
"reward_std": 0.1869324892759323,
"rewards/acc_reward_func": 0.47109375,
"step": 595
},
{
"completion_length": 179.9046875,
"epoch": 2.4096385542168672,
"grad_norm": 0.22539827227592468,
"kl": 0.001920244237408042,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4328125,
"reward_std": 0.17870275378227235,
"rewards/acc_reward_func": 0.4328125,
"step": 600
},
{
"completion_length": 167.17421875,
"epoch": 2.429718875502008,
"grad_norm": 0.15344974398612976,
"kl": 0.003571906848810613,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.44765625,
"reward_std": 0.19482170641422272,
"rewards/acc_reward_func": 0.44765625,
"step": 605
},
{
"completion_length": 161.63125,
"epoch": 2.4497991967871484,
"grad_norm": 0.19146452844142914,
"kl": 0.0020797441247850657,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.49921875,
"reward_std": 0.19066989421844482,
"rewards/acc_reward_func": 0.49921875,
"step": 610
},
{
"completion_length": 167.70703125,
"epoch": 2.4698795180722892,
"grad_norm": 0.16380134224891663,
"kl": 0.0024963648989796638,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4375,
"reward_std": 0.1623079299926758,
"rewards/acc_reward_func": 0.4375,
"step": 615
},
{
"completion_length": 173.51015625,
"epoch": 2.4899598393574296,
"grad_norm": 0.28427842259407043,
"kl": 0.002350706118158996,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.484375,
"reward_std": 0.21160372495651245,
"rewards/acc_reward_func": 0.484375,
"step": 620
},
{
"completion_length": 169.8390625,
"epoch": 2.5100401606425704,
"grad_norm": 0.21858711540699005,
"kl": 0.0022199705708771944,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.51640625,
"reward_std": 0.18480659574270247,
"rewards/acc_reward_func": 0.51640625,
"step": 625
},
{
"completion_length": 162.2171875,
"epoch": 2.5301204819277108,
"grad_norm": 0.2074146568775177,
"kl": 0.0027587429154664277,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4734375,
"reward_std": 0.16720536351203918,
"rewards/acc_reward_func": 0.4734375,
"step": 630
},
{
"completion_length": 169.5125,
"epoch": 2.550200803212851,
"grad_norm": 0.3031947910785675,
"kl": 0.0023493935121223332,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.44765625,
"reward_std": 0.18801212310791016,
"rewards/acc_reward_func": 0.44765625,
"step": 635
},
{
"completion_length": 168.32578125,
"epoch": 2.570281124497992,
"grad_norm": 0.22741979360580444,
"kl": 0.002392452908679843,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4578125,
"reward_std": 0.1990044355392456,
"rewards/acc_reward_func": 0.4578125,
"step": 640
},
{
"completion_length": 156.84765625,
"epoch": 2.5903614457831328,
"grad_norm": 0.31040796637535095,
"kl": 0.0024277767166495322,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.553125,
"reward_std": 0.18792948126792908,
"rewards/acc_reward_func": 0.553125,
"step": 645
},
{
"completion_length": 169.5828125,
"epoch": 2.610441767068273,
"grad_norm": 0.18631672859191895,
"kl": 0.002129961014725268,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48203125,
"reward_std": 0.20476263463497163,
"rewards/acc_reward_func": 0.48203125,
"step": 650
},
{
"completion_length": 167.3921875,
"epoch": 2.6305220883534135,
"grad_norm": 0.3941134810447693,
"kl": 0.0025281702168285848,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4875,
"reward_std": 0.19434418976306916,
"rewards/acc_reward_func": 0.4875,
"step": 655
},
{
"completion_length": 161.134375,
"epoch": 2.6506024096385543,
"grad_norm": 0.16547559201717377,
"kl": 0.002066282252781093,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45078125,
"reward_std": 0.1536063954234123,
"rewards/acc_reward_func": 0.45078125,
"step": 660
},
{
"completion_length": 166.02421875,
"epoch": 2.6706827309236947,
"grad_norm": 0.2315889447927475,
"kl": 0.002196951396763325,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4796875,
"reward_std": 0.2142605274915695,
"rewards/acc_reward_func": 0.4796875,
"step": 665
},
{
"completion_length": 170.13046875,
"epoch": 2.6907630522088355,
"grad_norm": 0.18366378545761108,
"kl": 0.0031739554600790144,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4546875,
"reward_std": 0.18340969681739808,
"rewards/acc_reward_func": 0.4546875,
"step": 670
},
{
"completion_length": 167.7234375,
"epoch": 2.710843373493976,
"grad_norm": 0.1795644611120224,
"kl": 0.002132023056037724,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5421875,
"reward_std": 0.19661171734333038,
"rewards/acc_reward_func": 0.5421875,
"step": 675
},
{
"completion_length": 172.259375,
"epoch": 2.7309236947791167,
"grad_norm": 0.197841078042984,
"kl": 0.002035749773494899,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.465625,
"reward_std": 0.18075270354747772,
"rewards/acc_reward_func": 0.465625,
"step": 680
},
{
"completion_length": 168.378125,
"epoch": 2.751004016064257,
"grad_norm": 0.19361449778079987,
"kl": 0.0020883690798655153,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.51484375,
"reward_std": 0.16988765746355056,
"rewards/acc_reward_func": 0.51484375,
"step": 685
},
{
"completion_length": 169.26796875,
"epoch": 2.7710843373493974,
"grad_norm": 0.1799454241991043,
"kl": 0.0019145054975524545,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.47578125,
"reward_std": 0.14681751281023026,
"rewards/acc_reward_func": 0.47578125,
"step": 690
},
{
"completion_length": 163.45078125,
"epoch": 2.791164658634538,
"grad_norm": 0.24009369313716888,
"kl": 0.00207897019572556,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45859375,
"reward_std": 0.1931656539440155,
"rewards/acc_reward_func": 0.45859375,
"step": 695
},
{
"completion_length": 155.53984375,
"epoch": 2.8112449799196786,
"grad_norm": 0.21647228300571442,
"kl": 0.0022690463811159134,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.49453125,
"reward_std": 0.15963537693023683,
"rewards/acc_reward_func": 0.49453125,
"step": 700
},
{
"completion_length": 167.93125,
"epoch": 2.8313253012048194,
"grad_norm": 0.18224656581878662,
"kl": 0.0021705702878534795,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.50078125,
"reward_std": 0.21607731580734252,
"rewards/acc_reward_func": 0.50078125,
"step": 705
},
{
"completion_length": 163.28203125,
"epoch": 2.8514056224899598,
"grad_norm": 0.20629364252090454,
"kl": 0.0021417615003883838,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.446875,
"reward_std": 0.17288437485694885,
"rewards/acc_reward_func": 0.446875,
"step": 710
},
{
"completion_length": 165.82734375,
"epoch": 2.8714859437751006,
"grad_norm": 0.19730441272258759,
"kl": 0.0025335745420306923,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.5109375,
"reward_std": 0.2060305058956146,
"rewards/acc_reward_func": 0.5109375,
"step": 715
},
{
"completion_length": 157.9640625,
"epoch": 2.891566265060241,
"grad_norm": 0.33498820662498474,
"kl": 0.002661615936085582,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.1668643593788147,
"rewards/acc_reward_func": 0.5,
"step": 720
},
{
"completion_length": 167.50234375,
"epoch": 2.9116465863453813,
"grad_norm": 0.26220834255218506,
"kl": 0.004001938318833709,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47265625,
"reward_std": 0.1912984162569046,
"rewards/acc_reward_func": 0.47265625,
"step": 725
},
{
"completion_length": 173.77265625,
"epoch": 2.931726907630522,
"grad_norm": 0.15571069717407227,
"kl": 0.0024122723145410417,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.459375,
"reward_std": 0.18712190091609954,
"rewards/acc_reward_func": 0.459375,
"step": 730
},
{
"completion_length": 166.53203125,
"epoch": 2.9518072289156625,
"grad_norm": 0.23753374814987183,
"kl": 0.002874248195439577,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.48125,
"reward_std": 0.1870627999305725,
"rewards/acc_reward_func": 0.48125,
"step": 735
},
{
"completion_length": 174.43671875,
"epoch": 2.9718875502008033,
"grad_norm": 0.17206275463104248,
"kl": 0.0023007401498034595,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.459375,
"reward_std": 0.1821707934141159,
"rewards/acc_reward_func": 0.459375,
"step": 740
},
{
"completion_length": 168.40078125,
"epoch": 2.9919678714859437,
"grad_norm": 0.25362470746040344,
"kl": 0.002372942678630352,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.21486756503582,
"rewards/acc_reward_func": 0.5,
"step": 745
},
{
"completion_length": 180.49129638671874,
"epoch": 3.0120481927710845,
"grad_norm": 0.13761889934539795,
"kl": 0.0028317445889115334,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4109375,
"reward_std": 0.16525401473045348,
"rewards/acc_reward_func": 0.4109375,
"step": 750
},
{
"completion_length": 158.54921875,
"epoch": 3.032128514056225,
"grad_norm": 0.1767706722021103,
"kl": 0.0022356531117111444,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4671875,
"reward_std": 0.17598875164985656,
"rewards/acc_reward_func": 0.4671875,
"step": 755
},
{
"completion_length": 165.08828125,
"epoch": 3.0522088353413657,
"grad_norm": 0.24206030368804932,
"kl": 0.0026053044479340316,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4359375,
"reward_std": 0.166653174161911,
"rewards/acc_reward_func": 0.4359375,
"step": 760
},
{
"completion_length": 163.43359375,
"epoch": 3.072289156626506,
"grad_norm": 0.13091525435447693,
"kl": 0.0029567593010142446,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.50546875,
"reward_std": 0.16952263116836547,
"rewards/acc_reward_func": 0.50546875,
"step": 765
},
{
"completion_length": 167.9515625,
"epoch": 3.0923694779116464,
"grad_norm": 0.22246809303760529,
"kl": 0.0028403045376762747,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.48984375,
"reward_std": 0.19681974053382872,
"rewards/acc_reward_func": 0.48984375,
"step": 770
},
{
"completion_length": 170.97734375,
"epoch": 3.112449799196787,
"grad_norm": 0.18141813576221466,
"kl": 0.0024082385236397384,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.4375,
"reward_std": 0.17291657924652098,
"rewards/acc_reward_func": 0.4375,
"step": 775
},
{
"completion_length": 164.44765625,
"epoch": 3.1325301204819276,
"grad_norm": 0.1898430585861206,
"kl": 0.0024608696810901167,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5390625,
"reward_std": 0.21618261635303498,
"rewards/acc_reward_func": 0.5390625,
"step": 780
},
{
"completion_length": 171.309375,
"epoch": 3.1526104417670684,
"grad_norm": 0.137849822640419,
"kl": 0.002426739735528827,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.44921875,
"reward_std": 0.1522618979215622,
"rewards/acc_reward_func": 0.44921875,
"step": 785
},
{
"completion_length": 157.8609375,
"epoch": 3.1726907630522088,
"grad_norm": 0.214852437376976,
"kl": 0.002971158013679087,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.47578125,
"reward_std": 0.14729551821947098,
"rewards/acc_reward_func": 0.47578125,
"step": 790
},
{
"completion_length": 163.13671875,
"epoch": 3.1927710843373496,
"grad_norm": 0.1850077360868454,
"kl": 0.002568071405403316,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.52421875,
"reward_std": 0.18070049583911896,
"rewards/acc_reward_func": 0.52421875,
"step": 795
},
{
"completion_length": 173.4234375,
"epoch": 3.21285140562249,
"grad_norm": 0.22789514064788818,
"kl": 0.003475360944867134,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4703125,
"reward_std": 0.1655502900481224,
"rewards/acc_reward_func": 0.4703125,
"step": 800
},
{
"completion_length": 160.71875,
"epoch": 3.2329317269076308,
"grad_norm": 0.20145151019096375,
"kl": 0.0020800810772925614,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.521875,
"reward_std": 0.1710708260536194,
"rewards/acc_reward_func": 0.521875,
"step": 805
},
{
"completion_length": 166.04140625,
"epoch": 3.253012048192771,
"grad_norm": 0.17967192828655243,
"kl": 0.0029742006212472917,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4953125,
"reward_std": 0.14658259004354476,
"rewards/acc_reward_func": 0.4953125,
"step": 810
},
{
"completion_length": 165.04375,
"epoch": 3.2730923694779115,
"grad_norm": 0.20132969319820404,
"kl": 0.00216698651202023,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.46171875,
"reward_std": 0.19243075847625732,
"rewards/acc_reward_func": 0.46171875,
"step": 815
},
{
"completion_length": 170.6875,
"epoch": 3.2931726907630523,
"grad_norm": 0.17751039564609528,
"kl": 0.002652911003679037,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.496875,
"reward_std": 0.1787781149148941,
"rewards/acc_reward_func": 0.496875,
"step": 820
},
{
"completion_length": 170.58828125,
"epoch": 3.3132530120481927,
"grad_norm": 0.18270958960056305,
"kl": 0.002537048631347716,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4890625,
"reward_std": 0.20342562198638917,
"rewards/acc_reward_func": 0.4890625,
"step": 825
},
{
"completion_length": 160.2875,
"epoch": 3.3333333333333335,
"grad_norm": 0.2583668529987335,
"kl": 0.0028565811458975076,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.54453125,
"reward_std": 0.18148908019065857,
"rewards/acc_reward_func": 0.54453125,
"step": 830
},
{
"completion_length": 170.0390625,
"epoch": 3.353413654618474,
"grad_norm": 0.21700581908226013,
"kl": 0.004266613628715277,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.428125,
"reward_std": 0.20687197744846345,
"rewards/acc_reward_func": 0.428125,
"step": 835
},
{
"completion_length": 169.84765625,
"epoch": 3.3734939759036147,
"grad_norm": 0.21277758479118347,
"kl": 0.0031619609566405416,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.475,
"reward_std": 0.17110105752944946,
"rewards/acc_reward_func": 0.475,
"step": 840
},
{
"completion_length": 165.22890625,
"epoch": 3.393574297188755,
"grad_norm": 0.23947608470916748,
"kl": 0.002295933500863612,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.47578125,
"reward_std": 0.1876683712005615,
"rewards/acc_reward_func": 0.47578125,
"step": 845
},
{
"completion_length": 168.33828125,
"epoch": 3.4136546184738954,
"grad_norm": 0.16878502070903778,
"kl": 0.0027886088471859695,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4296875,
"reward_std": 0.18006815016269684,
"rewards/acc_reward_func": 0.4296875,
"step": 850
},
{
"completion_length": 179.0234375,
"epoch": 3.433734939759036,
"grad_norm": 0.1827416718006134,
"kl": 0.0030534268589690328,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.39296875,
"reward_std": 0.20950157642364503,
"rewards/acc_reward_func": 0.39296875,
"step": 855
},
{
"completion_length": 164.63359375,
"epoch": 3.4538152610441766,
"grad_norm": 0.23324623703956604,
"kl": 0.002470552735030651,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5171875,
"reward_std": 0.19022006690502166,
"rewards/acc_reward_func": 0.5171875,
"step": 860
},
{
"completion_length": 167.69140625,
"epoch": 3.4738955823293174,
"grad_norm": 0.1824842095375061,
"kl": 0.002165103727020323,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48828125,
"reward_std": 0.1799081891775131,
"rewards/acc_reward_func": 0.48828125,
"step": 865
},
{
"completion_length": 165.90703125,
"epoch": 3.4939759036144578,
"grad_norm": 0.20221981406211853,
"kl": 0.002306993515230715,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.45390625,
"reward_std": 0.17067545652389526,
"rewards/acc_reward_func": 0.45390625,
"step": 870
},
{
"completion_length": 169.14765625,
"epoch": 3.5140562248995986,
"grad_norm": 0.2500782907009125,
"kl": 0.0031795531278476117,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.45625,
"reward_std": 0.19382765293121337,
"rewards/acc_reward_func": 0.45625,
"step": 875
},
{
"completion_length": 173.65390625,
"epoch": 3.534136546184739,
"grad_norm": 0.15908785164356232,
"kl": 0.002252256707288325,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.48984375,
"reward_std": 0.18456400334835052,
"rewards/acc_reward_func": 0.48984375,
"step": 880
},
{
"completion_length": 169.99296875,
"epoch": 3.5542168674698793,
"grad_norm": 0.22498035430908203,
"kl": 0.0022546121617779136,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5015625,
"reward_std": 0.20679060816764833,
"rewards/acc_reward_func": 0.5015625,
"step": 885
},
{
"completion_length": 170.00859375,
"epoch": 3.57429718875502,
"grad_norm": 0.19120348989963531,
"kl": 0.0029653264209628105,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.52421875,
"reward_std": 0.19229375422000886,
"rewards/acc_reward_func": 0.52421875,
"step": 890
},
{
"completion_length": 172.1578125,
"epoch": 3.5943775100401605,
"grad_norm": 0.1741994470357895,
"kl": 0.002913491940125823,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.471875,
"reward_std": 0.19708430767059326,
"rewards/acc_reward_func": 0.471875,
"step": 895
},
{
"completion_length": 161.38515625,
"epoch": 3.6144578313253013,
"grad_norm": 0.1781347095966339,
"kl": 0.0023952496238052847,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5,
"reward_std": 0.18798760771751405,
"rewards/acc_reward_func": 0.5,
"step": 900
},
{
"completion_length": 159.2109375,
"epoch": 3.6345381526104417,
"grad_norm": 0.1718325912952423,
"kl": 0.002631871239282191,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.50859375,
"reward_std": 0.16639206409454346,
"rewards/acc_reward_func": 0.50859375,
"step": 905
},
{
"completion_length": 165.05546875,
"epoch": 3.6546184738955825,
"grad_norm": 0.3423060178756714,
"kl": 0.0028334404807537792,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.5515625,
"reward_std": 0.1995299220085144,
"rewards/acc_reward_func": 0.5515625,
"step": 910
},
{
"completion_length": 180.7109375,
"epoch": 3.674698795180723,
"grad_norm": 0.19262397289276123,
"kl": 0.0026390203274786472,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4359375,
"reward_std": 0.18462437391281128,
"rewards/acc_reward_func": 0.4359375,
"step": 915
},
{
"completion_length": 158.94609375,
"epoch": 3.694779116465863,
"grad_norm": 0.1653033345937729,
"kl": 0.0026809167582541706,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.44375,
"reward_std": 0.1499770313501358,
"rewards/acc_reward_func": 0.44375,
"step": 920
},
{
"completion_length": 175.71015625,
"epoch": 3.714859437751004,
"grad_norm": 0.2063070833683014,
"kl": 0.002891782345250249,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.5125,
"reward_std": 0.2247842788696289,
"rewards/acc_reward_func": 0.5125,
"step": 925
},
{
"completion_length": 167.24765625,
"epoch": 3.734939759036145,
"grad_norm": 0.23962463438510895,
"kl": 0.003104905132204294,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4703125,
"reward_std": 0.19368986487388612,
"rewards/acc_reward_func": 0.4703125,
"step": 930
},
{
"completion_length": 171.17265625,
"epoch": 3.755020080321285,
"grad_norm": 0.19606095552444458,
"kl": 0.002944292780011892,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.44609375,
"reward_std": 0.18348534703254699,
"rewards/acc_reward_func": 0.44609375,
"step": 935
},
{
"completion_length": 176.634375,
"epoch": 3.7751004016064256,
"grad_norm": 0.16267438232898712,
"kl": 0.0025878275278955697,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.396875,
"reward_std": 0.16052481383085251,
"rewards/acc_reward_func": 0.396875,
"step": 940
},
{
"completion_length": 172.28515625,
"epoch": 3.7951807228915664,
"grad_norm": 0.1909836232662201,
"kl": 0.0033456831239163875,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.478125,
"reward_std": 0.1799086809158325,
"rewards/acc_reward_func": 0.478125,
"step": 945
},
{
"completion_length": 166.090625,
"epoch": 3.8152610441767068,
"grad_norm": 0.20936939120292664,
"kl": 0.0032263599801808595,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.43671875,
"reward_std": 0.19198239743709564,
"rewards/acc_reward_func": 0.43671875,
"step": 950
},
{
"completion_length": 162.3078125,
"epoch": 3.835341365461847,
"grad_norm": 0.19414915144443512,
"kl": 0.0029077294282615187,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.5,
"reward_std": 0.19624283909797668,
"rewards/acc_reward_func": 0.5,
"step": 955
},
{
"completion_length": 163.85703125,
"epoch": 3.855421686746988,
"grad_norm": 0.1911579817533493,
"kl": 0.0024573323782533405,
"learning_rate": 5e-06,
"loss": 0.0002,
"reward": 0.5421875,
"reward_std": 0.18132755011320115,
"rewards/acc_reward_func": 0.5421875,
"step": 960
},
{
"completion_length": 173.8125,
"epoch": 3.8755020080321287,
"grad_norm": 0.23467978835105896,
"kl": 0.003194801090285182,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.43203125,
"reward_std": 0.2057114690542221,
"rewards/acc_reward_func": 0.43203125,
"step": 965
},
{
"completion_length": 166.9375,
"epoch": 3.895582329317269,
"grad_norm": 0.2439257949590683,
"kl": 0.0032875371631234883,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.403125,
"reward_std": 0.15169296264648438,
"rewards/acc_reward_func": 0.403125,
"step": 970
},
{
"completion_length": 167.90234375,
"epoch": 3.9156626506024095,
"grad_norm": 0.24670979380607605,
"kl": 0.0032419377472251653,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.51484375,
"reward_std": 0.2215191602706909,
"rewards/acc_reward_func": 0.51484375,
"step": 975
},
{
"completion_length": 168.6703125,
"epoch": 3.9357429718875503,
"grad_norm": 0.20177994668483734,
"kl": 0.002821409748867154,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.47109375,
"reward_std": 0.17378178834915162,
"rewards/acc_reward_func": 0.47109375,
"step": 980
},
{
"completion_length": 169.90390625,
"epoch": 3.9558232931726907,
"grad_norm": 0.18187826871871948,
"kl": 0.002861806657165289,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.490625,
"reward_std": 0.20763208270072936,
"rewards/acc_reward_func": 0.490625,
"step": 985
},
{
"completion_length": 173.34453125,
"epoch": 3.9759036144578315,
"grad_norm": 0.15702760219573975,
"kl": 0.002640718384645879,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.421875,
"reward_std": 0.19335063099861144,
"rewards/acc_reward_func": 0.421875,
"step": 990
},
{
"completion_length": 175.04921875,
"epoch": 3.995983935742972,
"grad_norm": 0.22665689885616302,
"kl": 0.0025947901885956526,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.45078125,
"reward_std": 0.20008436739444732,
"rewards/acc_reward_func": 0.45078125,
"step": 995
},
{
"completion_length": 156.0404022216797,
"epoch": 4.016064257028113,
"grad_norm": 0.22517254948616028,
"kl": 0.003840234503149986,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48125,
"reward_std": 0.1741759806871414,
"rewards/acc_reward_func": 0.48125,
"step": 1000
},
{
"completion_length": 165.73515625,
"epoch": 4.036144578313253,
"grad_norm": 0.16833551228046417,
"kl": 0.0030605928506702183,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.56328125,
"reward_std": 0.17191357612609864,
"rewards/acc_reward_func": 0.56328125,
"step": 1005
},
{
"completion_length": 162.45390625,
"epoch": 4.056224899598393,
"grad_norm": 0.20470766723155975,
"kl": 0.004656547494232654,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51015625,
"reward_std": 0.18146504759788512,
"rewards/acc_reward_func": 0.51015625,
"step": 1010
},
{
"completion_length": 161.85625,
"epoch": 4.076305220883534,
"grad_norm": 0.23017007112503052,
"kl": 0.0031197062227874993,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.56171875,
"reward_std": 0.19219357669353485,
"rewards/acc_reward_func": 0.56171875,
"step": 1015
},
{
"completion_length": 166.6265625,
"epoch": 4.096385542168675,
"grad_norm": 0.16413679718971252,
"kl": 0.003686010604724288,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4625,
"reward_std": 0.1656496822834015,
"rewards/acc_reward_func": 0.4625,
"step": 1020
},
{
"completion_length": 169.8046875,
"epoch": 4.116465863453815,
"grad_norm": 0.24671293795108795,
"kl": 0.003416293207556009,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.50078125,
"reward_std": 0.20381903201341628,
"rewards/acc_reward_func": 0.50078125,
"step": 1025
},
{
"completion_length": 168.8796875,
"epoch": 4.136546184738956,
"grad_norm": 0.17657935619354248,
"kl": 0.0025244275806471706,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.46328125,
"reward_std": 0.14277227520942687,
"rewards/acc_reward_func": 0.46328125,
"step": 1030
},
{
"completion_length": 168.8421875,
"epoch": 4.156626506024097,
"grad_norm": 0.2604842782020569,
"kl": 0.0035679984372109174,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49296875,
"reward_std": 0.1542965739965439,
"rewards/acc_reward_func": 0.49296875,
"step": 1035
},
{
"completion_length": 169.9796875,
"epoch": 4.176706827309237,
"grad_norm": 0.2881716191768646,
"kl": 0.003248523501679301,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.49453125,
"reward_std": 0.18656824231147767,
"rewards/acc_reward_func": 0.49453125,
"step": 1040
},
{
"completion_length": 170.515625,
"epoch": 4.196787148594377,
"grad_norm": 0.23274263739585876,
"kl": 0.00285443589091301,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.45625,
"reward_std": 0.17857038974761963,
"rewards/acc_reward_func": 0.45625,
"step": 1045
},
{
"completion_length": 168.0421875,
"epoch": 4.216867469879518,
"grad_norm": 0.20154252648353577,
"kl": 0.004401167575269938,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49375,
"reward_std": 0.20558213889598848,
"rewards/acc_reward_func": 0.49375,
"step": 1050
},
{
"completion_length": 167.103125,
"epoch": 4.236947791164659,
"grad_norm": 0.207560732960701,
"kl": 0.006237465981394052,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.49765625,
"reward_std": 0.20846686661243438,
"rewards/acc_reward_func": 0.49765625,
"step": 1055
},
{
"completion_length": 168.00625,
"epoch": 4.257028112449799,
"grad_norm": 0.21645885705947876,
"kl": 0.0038530716672539713,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48984375,
"reward_std": 0.1788547456264496,
"rewards/acc_reward_func": 0.48984375,
"step": 1060
},
{
"completion_length": 171.36328125,
"epoch": 4.27710843373494,
"grad_norm": 0.21674005687236786,
"kl": 0.003597916383296251,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46015625,
"reward_std": 0.16602160632610322,
"rewards/acc_reward_func": 0.46015625,
"step": 1065
},
{
"completion_length": 168.38125,
"epoch": 4.2971887550200805,
"grad_norm": 0.21569159626960754,
"kl": 0.002890155231580138,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.509375,
"reward_std": 0.2124694287776947,
"rewards/acc_reward_func": 0.509375,
"step": 1070
},
{
"completion_length": 164.46484375,
"epoch": 4.317269076305221,
"grad_norm": 0.19328252971172333,
"kl": 0.0034100897144526245,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.45625,
"reward_std": 0.17899106144905091,
"rewards/acc_reward_func": 0.45625,
"step": 1075
},
{
"completion_length": 168.56328125,
"epoch": 4.337349397590361,
"grad_norm": 0.23035928606987,
"kl": 0.003162852395325899,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.465625,
"reward_std": 0.19337733685970307,
"rewards/acc_reward_func": 0.465625,
"step": 1080
},
{
"completion_length": 159.8671875,
"epoch": 4.357429718875502,
"grad_norm": 0.184407040476799,
"kl": 0.002822027588263154,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.53203125,
"reward_std": 0.17101742923259736,
"rewards/acc_reward_func": 0.53203125,
"step": 1085
},
{
"completion_length": 169.44375,
"epoch": 4.377510040160643,
"grad_norm": 0.17877520620822906,
"kl": 0.0036907714325934648,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48828125,
"reward_std": 0.20197680592536926,
"rewards/acc_reward_func": 0.48828125,
"step": 1090
},
{
"completion_length": 166.77734375,
"epoch": 4.397590361445783,
"grad_norm": 0.22261664271354675,
"kl": 0.0035729116294533013,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4484375,
"reward_std": 0.18630289137363434,
"rewards/acc_reward_func": 0.4484375,
"step": 1095
},
{
"completion_length": 167.671875,
"epoch": 4.417670682730924,
"grad_norm": 0.2182173877954483,
"kl": 0.004001274891197682,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47109375,
"reward_std": 0.16967293620109558,
"rewards/acc_reward_func": 0.47109375,
"step": 1100
},
{
"completion_length": 172.18984375,
"epoch": 4.437751004016064,
"grad_norm": 0.16743730008602142,
"kl": 0.0034225759096443652,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.478125,
"reward_std": 0.15839696526527405,
"rewards/acc_reward_func": 0.478125,
"step": 1105
},
{
"completion_length": 172.1484375,
"epoch": 4.457831325301205,
"grad_norm": 0.21942225098609924,
"kl": 0.003629435086622834,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45234375,
"reward_std": 0.1835702419281006,
"rewards/acc_reward_func": 0.45234375,
"step": 1110
},
{
"completion_length": 167.49296875,
"epoch": 4.477911646586345,
"grad_norm": 0.14904429018497467,
"kl": 0.0035309677477926016,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50390625,
"reward_std": 0.16912500262260438,
"rewards/acc_reward_func": 0.50390625,
"step": 1115
},
{
"completion_length": 171.1328125,
"epoch": 4.497991967871486,
"grad_norm": 0.1797754466533661,
"kl": 0.0025119307450950147,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.46015625,
"reward_std": 0.18558897078037262,
"rewards/acc_reward_func": 0.46015625,
"step": 1120
},
{
"completion_length": 169.5453125,
"epoch": 4.518072289156627,
"grad_norm": 0.23662979900836945,
"kl": 0.0032538773957639933,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4734375,
"reward_std": 0.21544496715068817,
"rewards/acc_reward_func": 0.4734375,
"step": 1125
},
{
"completion_length": 162.3859375,
"epoch": 4.538152610441767,
"grad_norm": 0.19919449090957642,
"kl": 0.004429271025583148,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.478125,
"reward_std": 0.17004487216472625,
"rewards/acc_reward_func": 0.478125,
"step": 1130
},
{
"completion_length": 166.5375,
"epoch": 4.5582329317269075,
"grad_norm": 0.22688740491867065,
"kl": 0.0032728100661188362,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.49765625,
"reward_std": 0.17078202664852143,
"rewards/acc_reward_func": 0.49765625,
"step": 1135
},
{
"completion_length": 169.23359375,
"epoch": 4.578313253012048,
"grad_norm": 0.20016172528266907,
"kl": 0.003256208822131157,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.51875,
"reward_std": 0.18927299678325654,
"rewards/acc_reward_func": 0.51875,
"step": 1140
},
{
"completion_length": 164.34453125,
"epoch": 4.598393574297189,
"grad_norm": 0.20429684221744537,
"kl": 0.0033339104149490593,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.465625,
"reward_std": 0.1623902827501297,
"rewards/acc_reward_func": 0.465625,
"step": 1145
},
{
"completion_length": 174.5734375,
"epoch": 4.618473895582329,
"grad_norm": 0.1921919882297516,
"kl": 0.0034361934289336205,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.478125,
"reward_std": 0.19792578220367432,
"rewards/acc_reward_func": 0.478125,
"step": 1150
},
{
"completion_length": 168.88203125,
"epoch": 4.63855421686747,
"grad_norm": 0.2705855667591095,
"kl": 0.0037747529800981282,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5,
"reward_std": 0.19397895336151122,
"rewards/acc_reward_func": 0.5,
"step": 1155
},
{
"completion_length": 156.29296875,
"epoch": 4.658634538152611,
"grad_norm": 0.19299572706222534,
"kl": 0.0029947038274258375,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.56875,
"reward_std": 0.19387336373329161,
"rewards/acc_reward_func": 0.56875,
"step": 1160
},
{
"completion_length": 173.68671875,
"epoch": 4.678714859437751,
"grad_norm": 0.20209172368049622,
"kl": 0.0030925452709198,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.42734375,
"reward_std": 0.18419256210327148,
"rewards/acc_reward_func": 0.42734375,
"step": 1165
},
{
"completion_length": 167.8265625,
"epoch": 4.698795180722891,
"grad_norm": 0.1785442978143692,
"kl": 0.0030869544018059967,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.55390625,
"reward_std": 0.20076735317707062,
"rewards/acc_reward_func": 0.55390625,
"step": 1170
},
{
"completion_length": 167.26875,
"epoch": 4.718875502008032,
"grad_norm": 0.24141569435596466,
"kl": 0.003842420503497124,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47890625,
"reward_std": 0.19013918936252594,
"rewards/acc_reward_func": 0.47890625,
"step": 1175
},
{
"completion_length": 159.959375,
"epoch": 4.738955823293173,
"grad_norm": 0.20012110471725464,
"kl": 0.003939095744863152,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49765625,
"reward_std": 0.1821962922811508,
"rewards/acc_reward_func": 0.49765625,
"step": 1180
},
{
"completion_length": 173.41484375,
"epoch": 4.759036144578313,
"grad_norm": 0.31007662415504456,
"kl": 0.004036217415705323,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48828125,
"reward_std": 0.1988249570131302,
"rewards/acc_reward_func": 0.48828125,
"step": 1185
},
{
"completion_length": 164.4515625,
"epoch": 4.779116465863454,
"grad_norm": 0.16889087855815887,
"kl": 0.003308457275852561,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4921875,
"reward_std": 0.16141898632049562,
"rewards/acc_reward_func": 0.4921875,
"step": 1190
},
{
"completion_length": 169.34140625,
"epoch": 4.7991967871485945,
"grad_norm": 0.20528994500637054,
"kl": 0.0033339539542794226,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4328125,
"reward_std": 0.1679941400885582,
"rewards/acc_reward_func": 0.4328125,
"step": 1195
},
{
"completion_length": 174.31953125,
"epoch": 4.8192771084337345,
"grad_norm": 0.2340361475944519,
"kl": 0.004177400190383196,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46171875,
"reward_std": 0.168808214366436,
"rewards/acc_reward_func": 0.46171875,
"step": 1200
},
{
"completion_length": 160.046875,
"epoch": 4.839357429718875,
"grad_norm": 0.38156622648239136,
"kl": 0.004627138469368219,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5328125,
"reward_std": 0.17246899604797364,
"rewards/acc_reward_func": 0.5328125,
"step": 1205
},
{
"completion_length": 166.8265625,
"epoch": 4.859437751004016,
"grad_norm": 0.22112254798412323,
"kl": 0.005549876671284437,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.47578125,
"reward_std": 0.2075572282075882,
"rewards/acc_reward_func": 0.47578125,
"step": 1210
},
{
"completion_length": 167.3859375,
"epoch": 4.879518072289157,
"grad_norm": 0.20393472909927368,
"kl": 0.004529682593420148,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.47109375,
"reward_std": 0.18220171332359314,
"rewards/acc_reward_func": 0.47109375,
"step": 1215
},
{
"completion_length": 168.44921875,
"epoch": 4.899598393574297,
"grad_norm": 0.15564318001270294,
"kl": 0.004334859363734722,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51875,
"reward_std": 0.1683833956718445,
"rewards/acc_reward_func": 0.51875,
"step": 1220
},
{
"completion_length": 166.925,
"epoch": 4.919678714859438,
"grad_norm": 0.43570226430892944,
"kl": 0.004211871605366469,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46484375,
"reward_std": 0.183743217587471,
"rewards/acc_reward_func": 0.46484375,
"step": 1225
},
{
"completion_length": 171.3453125,
"epoch": 4.9397590361445785,
"grad_norm": 0.1980995386838913,
"kl": 0.006298268353566528,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.45390625,
"reward_std": 0.16925580203533172,
"rewards/acc_reward_func": 0.45390625,
"step": 1230
},
{
"completion_length": 170.70703125,
"epoch": 4.959839357429718,
"grad_norm": 0.20515531301498413,
"kl": 0.004817906394600868,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.446875,
"reward_std": 0.17620269060134888,
"rewards/acc_reward_func": 0.446875,
"step": 1235
},
{
"completion_length": 172.5453125,
"epoch": 4.979919678714859,
"grad_norm": 0.24766899645328522,
"kl": 0.004823639849200844,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4671875,
"reward_std": 0.19640210568904876,
"rewards/acc_reward_func": 0.4671875,
"step": 1240
},
{
"completion_length": 161.7947570800781,
"epoch": 5.0,
"grad_norm": 0.21875418722629547,
"kl": 0.0050005201715976,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4359375,
"reward_std": 0.20489816665649413,
"rewards/acc_reward_func": 0.4359375,
"step": 1245
},
{
"completion_length": 158.62734375,
"epoch": 5.020080321285141,
"grad_norm": 0.21704816818237305,
"kl": 0.004759628046303988,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5328125,
"reward_std": 0.18903582096099852,
"rewards/acc_reward_func": 0.5328125,
"step": 1250
},
{
"completion_length": 171.97890625,
"epoch": 5.040160642570281,
"grad_norm": 0.17518068850040436,
"kl": 0.004319222178310156,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46171875,
"reward_std": 0.16357678174972534,
"rewards/acc_reward_func": 0.46171875,
"step": 1255
},
{
"completion_length": 163.09765625,
"epoch": 5.0602409638554215,
"grad_norm": 0.2485804557800293,
"kl": 0.004300285456702113,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.53515625,
"reward_std": 0.18808826208114623,
"rewards/acc_reward_func": 0.53515625,
"step": 1260
},
{
"completion_length": 169.6328125,
"epoch": 5.080321285140562,
"grad_norm": 0.26576781272888184,
"kl": 0.005453150719404221,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.46328125,
"reward_std": 0.20532279312610627,
"rewards/acc_reward_func": 0.46328125,
"step": 1265
},
{
"completion_length": 171.50546875,
"epoch": 5.100401606425703,
"grad_norm": 0.2594987154006958,
"kl": 0.00517116067931056,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51015625,
"reward_std": 0.20666127800941467,
"rewards/acc_reward_func": 0.51015625,
"step": 1270
},
{
"completion_length": 169.28046875,
"epoch": 5.120481927710843,
"grad_norm": 0.17170865833759308,
"kl": 0.004057973297312856,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5046875,
"reward_std": 0.1725970596075058,
"rewards/acc_reward_func": 0.5046875,
"step": 1275
},
{
"completion_length": 161.22578125,
"epoch": 5.140562248995984,
"grad_norm": 0.18916991353034973,
"kl": 0.0042429367080330845,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51015625,
"reward_std": 0.18553529381752015,
"rewards/acc_reward_func": 0.51015625,
"step": 1280
},
{
"completion_length": 166.45,
"epoch": 5.160642570281125,
"grad_norm": 0.17687956988811493,
"kl": 0.004810953792184591,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.52890625,
"reward_std": 0.20294986963272094,
"rewards/acc_reward_func": 0.52890625,
"step": 1285
},
{
"completion_length": 160.91796875,
"epoch": 5.180722891566265,
"grad_norm": 0.1697753518819809,
"kl": 0.009254092490300537,
"learning_rate": 5e-06,
"loss": 0.0009,
"reward": 0.4828125,
"reward_std": 0.1662046104669571,
"rewards/acc_reward_func": 0.4828125,
"step": 1290
},
{
"completion_length": 168.68359375,
"epoch": 5.2008032128514055,
"grad_norm": 0.31127893924713135,
"kl": 0.003546125767752528,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4984375,
"reward_std": 0.1875927209854126,
"rewards/acc_reward_func": 0.4984375,
"step": 1295
},
{
"completion_length": 162.34765625,
"epoch": 5.220883534136546,
"grad_norm": 0.18628616631031036,
"kl": 0.005193229066208005,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5390625,
"reward_std": 0.17191484570503235,
"rewards/acc_reward_func": 0.5390625,
"step": 1300
},
{
"completion_length": 176.30703125,
"epoch": 5.240963855421687,
"grad_norm": 0.18088261783123016,
"kl": 0.0038496053777635096,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48671875,
"reward_std": 0.18312011659145355,
"rewards/acc_reward_func": 0.48671875,
"step": 1305
},
{
"completion_length": 165.45,
"epoch": 5.261044176706827,
"grad_norm": 0.16925431787967682,
"kl": 0.003396956715732813,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.528125,
"reward_std": 0.160917729139328,
"rewards/acc_reward_func": 0.528125,
"step": 1310
},
{
"completion_length": 157.62734375,
"epoch": 5.281124497991968,
"grad_norm": 0.20400631427764893,
"kl": 0.004160564253106713,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.52421875,
"reward_std": 0.17697027921676636,
"rewards/acc_reward_func": 0.52421875,
"step": 1315
},
{
"completion_length": 171.98046875,
"epoch": 5.301204819277109,
"grad_norm": 0.16840705275535583,
"kl": 0.0036880777683109046,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.459375,
"reward_std": 0.19944757223129272,
"rewards/acc_reward_func": 0.459375,
"step": 1320
},
{
"completion_length": 168.76328125,
"epoch": 5.321285140562249,
"grad_norm": 0.2576988935470581,
"kl": 0.0043326653074473144,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50078125,
"reward_std": 0.17838993072509765,
"rewards/acc_reward_func": 0.50078125,
"step": 1325
},
{
"completion_length": 170.3703125,
"epoch": 5.341365461847389,
"grad_norm": 0.16828767955303192,
"kl": 0.0041460281703621146,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49296875,
"reward_std": 0.18524796962738038,
"rewards/acc_reward_func": 0.49296875,
"step": 1330
},
{
"completion_length": 166.046875,
"epoch": 5.36144578313253,
"grad_norm": 0.21222200989723206,
"kl": 0.005526072159409523,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5,
"reward_std": 0.18835185170173646,
"rewards/acc_reward_func": 0.5,
"step": 1335
},
{
"completion_length": 167.784375,
"epoch": 5.381526104417671,
"grad_norm": 0.19030138850212097,
"kl": 0.004389704763889312,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48671875,
"reward_std": 0.166469968855381,
"rewards/acc_reward_func": 0.48671875,
"step": 1340
},
{
"completion_length": 164.5453125,
"epoch": 5.401606425702811,
"grad_norm": 0.1627490520477295,
"kl": 0.004341602185741067,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49609375,
"reward_std": 0.1558452695608139,
"rewards/acc_reward_func": 0.49609375,
"step": 1345
},
{
"completion_length": 166.6265625,
"epoch": 5.421686746987952,
"grad_norm": 0.22193607687950134,
"kl": 0.00564349377527833,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5,
"reward_std": 0.18469333052635192,
"rewards/acc_reward_func": 0.5,
"step": 1350
},
{
"completion_length": 172.0625,
"epoch": 5.4417670682730925,
"grad_norm": 0.19178318977355957,
"kl": 0.005153108527883888,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.53515625,
"reward_std": 0.17167717814445496,
"rewards/acc_reward_func": 0.53515625,
"step": 1355
},
{
"completion_length": 180.1453125,
"epoch": 5.461847389558233,
"grad_norm": 0.1635100245475769,
"kl": 0.004484001686796546,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46796875,
"reward_std": 0.1897729754447937,
"rewards/acc_reward_func": 0.46796875,
"step": 1360
},
{
"completion_length": 154.84921875,
"epoch": 5.481927710843373,
"grad_norm": 0.19068863987922668,
"kl": 0.007403366360813379,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.5421875,
"reward_std": 0.19395375102758408,
"rewards/acc_reward_func": 0.5421875,
"step": 1365
},
{
"completion_length": 164.79453125,
"epoch": 5.502008032128514,
"grad_norm": 0.21637621521949768,
"kl": 0.0035220107529312372,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5609375,
"reward_std": 0.16878624856472016,
"rewards/acc_reward_func": 0.5609375,
"step": 1370
},
{
"completion_length": 168.9765625,
"epoch": 5.522088353413655,
"grad_norm": 0.16138571500778198,
"kl": 0.003919241763651371,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48671875,
"reward_std": 0.17470018863677977,
"rewards/acc_reward_func": 0.48671875,
"step": 1375
},
{
"completion_length": 173.23984375,
"epoch": 5.542168674698795,
"grad_norm": 0.18280155956745148,
"kl": 0.004307154426351189,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45703125,
"reward_std": 0.20597809553146362,
"rewards/acc_reward_func": 0.45703125,
"step": 1380
},
{
"completion_length": 158.925,
"epoch": 5.562248995983936,
"grad_norm": 0.17805874347686768,
"kl": 0.004826354794204235,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.496875,
"reward_std": 0.14961180835962296,
"rewards/acc_reward_func": 0.496875,
"step": 1385
},
{
"completion_length": 169.16015625,
"epoch": 5.582329317269076,
"grad_norm": 0.1610419750213623,
"kl": 0.003801784198731184,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51484375,
"reward_std": 0.1774403154850006,
"rewards/acc_reward_func": 0.51484375,
"step": 1390
},
{
"completion_length": 157.95625,
"epoch": 5.602409638554217,
"grad_norm": 0.19692274928092957,
"kl": 0.004147279774770141,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46640625,
"reward_std": 0.17506463825702667,
"rewards/acc_reward_func": 0.46640625,
"step": 1395
},
{
"completion_length": 155.771875,
"epoch": 5.622489959839357,
"grad_norm": 0.22810834646224976,
"kl": 0.0047633805312216285,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.446875,
"reward_std": 0.17824740409851075,
"rewards/acc_reward_func": 0.446875,
"step": 1400
},
{
"completion_length": 166.79453125,
"epoch": 5.642570281124498,
"grad_norm": 0.19700032472610474,
"kl": 0.003686194634065032,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.53359375,
"reward_std": 0.19514044523239135,
"rewards/acc_reward_func": 0.53359375,
"step": 1405
},
{
"completion_length": 174.20625,
"epoch": 5.662650602409639,
"grad_norm": 0.230524942278862,
"kl": 0.003768759872764349,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5,
"reward_std": 0.20705547034740449,
"rewards/acc_reward_func": 0.5,
"step": 1410
},
{
"completion_length": 166.265625,
"epoch": 5.682730923694779,
"grad_norm": 0.1851169466972351,
"kl": 0.004947473015636206,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.47421875,
"reward_std": 0.1891168922185898,
"rewards/acc_reward_func": 0.47421875,
"step": 1415
},
{
"completion_length": 165.9390625,
"epoch": 5.7028112449799195,
"grad_norm": 0.19335930049419403,
"kl": 0.005260448809713126,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.459375,
"reward_std": 0.17525561451911925,
"rewards/acc_reward_func": 0.459375,
"step": 1420
},
{
"completion_length": 171.14921875,
"epoch": 5.72289156626506,
"grad_norm": 0.19283075630664825,
"kl": 0.004185305628925562,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.44609375,
"reward_std": 0.18574745804071427,
"rewards/acc_reward_func": 0.44609375,
"step": 1425
},
{
"completion_length": 170.128125,
"epoch": 5.742971887550201,
"grad_norm": 0.2090463787317276,
"kl": 0.005675878562033177,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.478125,
"reward_std": 0.16433463394641876,
"rewards/acc_reward_func": 0.478125,
"step": 1430
},
{
"completion_length": 170.93125,
"epoch": 5.763052208835341,
"grad_norm": 0.21423965692520142,
"kl": 0.004435191815719009,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51328125,
"reward_std": 0.19879531264305114,
"rewards/acc_reward_func": 0.51328125,
"step": 1435
},
{
"completion_length": 168.43203125,
"epoch": 5.783132530120482,
"grad_norm": 0.26377153396606445,
"kl": 0.004027861636132002,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4953125,
"reward_std": 0.18201151937246324,
"rewards/acc_reward_func": 0.4953125,
"step": 1440
},
{
"completion_length": 176.6515625,
"epoch": 5.803212851405623,
"grad_norm": 0.24323724210262299,
"kl": 0.0033185009844601153,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.48984375,
"reward_std": 0.21152729988098146,
"rewards/acc_reward_func": 0.48984375,
"step": 1445
},
{
"completion_length": 170.44921875,
"epoch": 5.823293172690763,
"grad_norm": 0.176719531416893,
"kl": 0.0033552560023963453,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.471875,
"reward_std": 0.1784900039434433,
"rewards/acc_reward_func": 0.471875,
"step": 1450
},
{
"completion_length": 163.53359375,
"epoch": 5.843373493975903,
"grad_norm": 0.18552526831626892,
"kl": 0.005202607065439224,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.53515625,
"reward_std": 0.18981966376304626,
"rewards/acc_reward_func": 0.53515625,
"step": 1455
},
{
"completion_length": 170.81640625,
"epoch": 5.863453815261044,
"grad_norm": 0.23733043670654297,
"kl": 0.003750589909031987,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48203125,
"reward_std": 0.16400111615657806,
"rewards/acc_reward_func": 0.48203125,
"step": 1460
},
{
"completion_length": 174.26953125,
"epoch": 5.883534136546185,
"grad_norm": 0.18482081592082977,
"kl": 0.0028898491524159907,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.43046875,
"reward_std": 0.15752456188201905,
"rewards/acc_reward_func": 0.43046875,
"step": 1465
},
{
"completion_length": 161.2484375,
"epoch": 5.903614457831325,
"grad_norm": 0.25243157148361206,
"kl": 0.003909509163349867,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47734375,
"reward_std": 0.1728631943464279,
"rewards/acc_reward_func": 0.47734375,
"step": 1470
},
{
"completion_length": 170.67734375,
"epoch": 5.923694779116466,
"grad_norm": 0.1560831069946289,
"kl": 0.004822219582274556,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4578125,
"reward_std": 0.16488956809043884,
"rewards/acc_reward_func": 0.4578125,
"step": 1475
},
{
"completion_length": 163.73984375,
"epoch": 5.943775100401607,
"grad_norm": 0.3025646507740021,
"kl": 0.0037991451565176247,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5015625,
"reward_std": 0.1829603523015976,
"rewards/acc_reward_func": 0.5015625,
"step": 1480
},
{
"completion_length": 164.8859375,
"epoch": 5.9638554216867465,
"grad_norm": 0.17325057089328766,
"kl": 0.0032608849927783013,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.50078125,
"reward_std": 0.1647592604160309,
"rewards/acc_reward_func": 0.50078125,
"step": 1485
},
{
"completion_length": 175.43125,
"epoch": 5.983935742971887,
"grad_norm": 0.23498950898647308,
"kl": 0.004199746390804648,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.42890625,
"reward_std": 0.21073694825172423,
"rewards/acc_reward_func": 0.42890625,
"step": 1490
},
{
"completion_length": 184.42433166503906,
"epoch": 6.004016064257028,
"grad_norm": 0.2495023012161255,
"kl": 0.0061729966662824156,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.515625,
"reward_std": 0.20198428332805635,
"rewards/acc_reward_func": 0.515625,
"step": 1495
},
{
"completion_length": 161.859375,
"epoch": 6.024096385542169,
"grad_norm": 0.18060770630836487,
"kl": 0.003893446130678058,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.54296875,
"reward_std": 0.17851275503635405,
"rewards/acc_reward_func": 0.54296875,
"step": 1500
},
{
"completion_length": 166.240625,
"epoch": 6.044176706827309,
"grad_norm": 0.16868437826633453,
"kl": 0.003805461712181568,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4859375,
"reward_std": 0.1837507039308548,
"rewards/acc_reward_func": 0.4859375,
"step": 1505
},
{
"completion_length": 160.85625,
"epoch": 6.06425702811245,
"grad_norm": 0.22041331231594086,
"kl": 0.005358227575197816,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50390625,
"reward_std": 0.17320341467857361,
"rewards/acc_reward_func": 0.50390625,
"step": 1510
},
{
"completion_length": 165.78828125,
"epoch": 6.0843373493975905,
"grad_norm": 0.23881591856479645,
"kl": 0.003745431452989578,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5328125,
"reward_std": 0.1678643196821213,
"rewards/acc_reward_func": 0.5328125,
"step": 1515
},
{
"completion_length": 163.56875,
"epoch": 6.104417670682731,
"grad_norm": 0.23352575302124023,
"kl": 0.004220539703965187,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46171875,
"reward_std": 0.17301645278930664,
"rewards/acc_reward_func": 0.46171875,
"step": 1520
},
{
"completion_length": 169.3390625,
"epoch": 6.124497991967871,
"grad_norm": 0.21609720587730408,
"kl": 0.0036188287660479544,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45390625,
"reward_std": 0.17002290189266206,
"rewards/acc_reward_func": 0.45390625,
"step": 1525
},
{
"completion_length": 162.3828125,
"epoch": 6.144578313253012,
"grad_norm": 0.186594158411026,
"kl": 0.0032249293755739926,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.471875,
"reward_std": 0.17215646654367447,
"rewards/acc_reward_func": 0.471875,
"step": 1530
},
{
"completion_length": 160.55859375,
"epoch": 6.164658634538153,
"grad_norm": 0.23915688693523407,
"kl": 0.006140461005270481,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5375,
"reward_std": 0.18240620493888854,
"rewards/acc_reward_func": 0.5375,
"step": 1535
},
{
"completion_length": 171.22890625,
"epoch": 6.184738955823293,
"grad_norm": 0.2295251190662384,
"kl": 0.0047486312687397,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5234375,
"reward_std": 0.17341783046722412,
"rewards/acc_reward_func": 0.5234375,
"step": 1540
},
{
"completion_length": 164.5578125,
"epoch": 6.204819277108434,
"grad_norm": 0.1796785295009613,
"kl": 0.0037390706595033405,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5046875,
"reward_std": 0.19187611043453218,
"rewards/acc_reward_func": 0.5046875,
"step": 1545
},
{
"completion_length": 171.79921875,
"epoch": 6.224899598393574,
"grad_norm": 0.1903897225856781,
"kl": 0.004389600781723857,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4921875,
"reward_std": 0.1906694084405899,
"rewards/acc_reward_func": 0.4921875,
"step": 1550
},
{
"completion_length": 159.57578125,
"epoch": 6.244979919678715,
"grad_norm": 0.26276904344558716,
"kl": 0.004401320079341531,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.521875,
"reward_std": 0.21102378368377686,
"rewards/acc_reward_func": 0.521875,
"step": 1555
},
{
"completion_length": 160.49140625,
"epoch": 6.265060240963855,
"grad_norm": 0.18606114387512207,
"kl": 0.006198170594871044,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.52578125,
"reward_std": 0.18974254429340362,
"rewards/acc_reward_func": 0.52578125,
"step": 1560
},
{
"completion_length": 170.8125,
"epoch": 6.285140562248996,
"grad_norm": 0.21555183827877045,
"kl": 0.003926319163292646,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47734375,
"reward_std": 0.1782539129257202,
"rewards/acc_reward_func": 0.47734375,
"step": 1565
},
{
"completion_length": 157.86484375,
"epoch": 6.305220883534137,
"grad_norm": 0.22281333804130554,
"kl": 0.0035013348795473577,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51796875,
"reward_std": 0.1774645447731018,
"rewards/acc_reward_func": 0.51796875,
"step": 1570
},
{
"completion_length": 179.4171875,
"epoch": 6.325301204819277,
"grad_norm": 0.1492380052804947,
"kl": 0.004015867225825786,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4703125,
"reward_std": 0.14800146371126174,
"rewards/acc_reward_func": 0.4703125,
"step": 1575
},
{
"completion_length": 160.57734375,
"epoch": 6.3453815261044175,
"grad_norm": 0.2002713531255722,
"kl": 0.004907358484342694,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.575,
"reward_std": 0.17554293870925902,
"rewards/acc_reward_func": 0.575,
"step": 1580
},
{
"completion_length": 175.9375,
"epoch": 6.365461847389558,
"grad_norm": 0.4199647903442383,
"kl": 0.002621839474886656,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.509375,
"reward_std": 0.1709937036037445,
"rewards/acc_reward_func": 0.509375,
"step": 1585
},
{
"completion_length": 161.84296875,
"epoch": 6.385542168674699,
"grad_norm": 0.22395344078540802,
"kl": 0.004057837929576635,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5328125,
"reward_std": 0.16370453983545302,
"rewards/acc_reward_func": 0.5328125,
"step": 1590
},
{
"completion_length": 162.79609375,
"epoch": 6.405622489959839,
"grad_norm": 0.17236217856407166,
"kl": 0.003938271198421717,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.52109375,
"reward_std": 0.15996938645839692,
"rewards/acc_reward_func": 0.52109375,
"step": 1595
},
{
"completion_length": 167.084375,
"epoch": 6.42570281124498,
"grad_norm": 0.22631123661994934,
"kl": 0.003588200220838189,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4390625,
"reward_std": 0.1787314236164093,
"rewards/acc_reward_func": 0.4390625,
"step": 1600
},
{
"completion_length": 158.30078125,
"epoch": 6.445783132530121,
"grad_norm": 0.2189439982175827,
"kl": 0.003932695230469107,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5109375,
"reward_std": 0.1482663258910179,
"rewards/acc_reward_func": 0.5109375,
"step": 1605
},
{
"completion_length": 173.984375,
"epoch": 6.4658634538152615,
"grad_norm": 0.17536965012550354,
"kl": 0.004332389356568456,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4671875,
"reward_std": 0.16784343421459197,
"rewards/acc_reward_func": 0.4671875,
"step": 1610
},
{
"completion_length": 159.8375,
"epoch": 6.485943775100401,
"grad_norm": 0.20090238749980927,
"kl": 0.005236976826563477,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5375,
"reward_std": 0.17512403428554535,
"rewards/acc_reward_func": 0.5375,
"step": 1615
},
{
"completion_length": 161.2859375,
"epoch": 6.506024096385542,
"grad_norm": 0.16911007463932037,
"kl": 0.0032496250700205565,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.50390625,
"reward_std": 0.15321269631385803,
"rewards/acc_reward_func": 0.50390625,
"step": 1620
},
{
"completion_length": 164.26953125,
"epoch": 6.526104417670683,
"grad_norm": 0.20112669467926025,
"kl": 0.005721660749986768,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4765625,
"reward_std": 0.16933144629001617,
"rewards/acc_reward_func": 0.4765625,
"step": 1625
},
{
"completion_length": 172.20703125,
"epoch": 6.546184738955823,
"grad_norm": 0.23955944180488586,
"kl": 0.004955686116591096,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.45390625,
"reward_std": 0.18633106350898743,
"rewards/acc_reward_func": 0.45390625,
"step": 1630
},
{
"completion_length": 174.4578125,
"epoch": 6.566265060240964,
"grad_norm": 0.2082107961177826,
"kl": 0.0036007991526275872,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45390625,
"reward_std": 0.193873855471611,
"rewards/acc_reward_func": 0.45390625,
"step": 1635
},
{
"completion_length": 174.42109375,
"epoch": 6.586345381526105,
"grad_norm": 0.23655228316783905,
"kl": 0.00456015644595027,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.46171875,
"reward_std": 0.1951412320137024,
"rewards/acc_reward_func": 0.46171875,
"step": 1640
},
{
"completion_length": 166.25,
"epoch": 6.606425702811245,
"grad_norm": 0.2838016748428345,
"kl": 0.0037197262048721314,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.471875,
"reward_std": 0.18558769524097443,
"rewards/acc_reward_func": 0.471875,
"step": 1645
},
{
"completion_length": 169.52734375,
"epoch": 6.626506024096385,
"grad_norm": 0.17167700827121735,
"kl": 0.004374908190220595,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45234375,
"reward_std": 0.15586949288845062,
"rewards/acc_reward_func": 0.45234375,
"step": 1650
},
{
"completion_length": 164.6828125,
"epoch": 6.646586345381526,
"grad_norm": 0.23104216158390045,
"kl": 0.003933112556114793,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5125,
"reward_std": 0.19369065165519714,
"rewards/acc_reward_func": 0.5125,
"step": 1655
},
{
"completion_length": 172.5265625,
"epoch": 6.666666666666667,
"grad_norm": 0.19686660170555115,
"kl": 0.004738407302647829,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48125,
"reward_std": 0.19287706017494202,
"rewards/acc_reward_func": 0.48125,
"step": 1660
},
{
"completion_length": 163.5984375,
"epoch": 6.686746987951807,
"grad_norm": 0.1952996701002121,
"kl": 0.003322056075558066,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.55,
"reward_std": 0.1736514836549759,
"rewards/acc_reward_func": 0.55,
"step": 1665
},
{
"completion_length": 164.3140625,
"epoch": 6.706827309236948,
"grad_norm": 0.18798507750034332,
"kl": 0.0037219693418592214,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.54453125,
"reward_std": 0.17801770865917205,
"rewards/acc_reward_func": 0.54453125,
"step": 1670
},
{
"completion_length": 169.51875,
"epoch": 6.7269076305220885,
"grad_norm": 0.2563120424747467,
"kl": 0.005326081439852715,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.453125,
"reward_std": 0.16696995198726655,
"rewards/acc_reward_func": 0.453125,
"step": 1675
},
{
"completion_length": 165.33671875,
"epoch": 6.746987951807229,
"grad_norm": 0.178642138838768,
"kl": 0.004675744194537401,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.45625,
"reward_std": 0.16041647493839264,
"rewards/acc_reward_func": 0.45625,
"step": 1680
},
{
"completion_length": 171.18359375,
"epoch": 6.767068273092369,
"grad_norm": 0.16683745384216309,
"kl": 0.004911075672134757,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48203125,
"reward_std": 0.1782548874616623,
"rewards/acc_reward_func": 0.48203125,
"step": 1685
},
{
"completion_length": 165.3453125,
"epoch": 6.78714859437751,
"grad_norm": 0.23056790232658386,
"kl": 0.003962885634973645,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51875,
"reward_std": 0.18369878828525543,
"rewards/acc_reward_func": 0.51875,
"step": 1690
},
{
"completion_length": 169.69921875,
"epoch": 6.807228915662651,
"grad_norm": 0.2673325538635254,
"kl": 0.0048594952560961245,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5046875,
"reward_std": 0.20014474391937256,
"rewards/acc_reward_func": 0.5046875,
"step": 1695
},
{
"completion_length": 175.39140625,
"epoch": 6.827309236947791,
"grad_norm": 0.12390197068452835,
"kl": 0.003452077321708202,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.4875,
"reward_std": 0.17162477672100068,
"rewards/acc_reward_func": 0.4875,
"step": 1700
},
{
"completion_length": 177.07734375,
"epoch": 6.847389558232932,
"grad_norm": 0.19321836531162262,
"kl": 0.00381118506193161,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5109375,
"reward_std": 0.21291799545288087,
"rewards/acc_reward_func": 0.5109375,
"step": 1705
},
{
"completion_length": 167.85859375,
"epoch": 6.867469879518072,
"grad_norm": 0.17356781661510468,
"kl": 0.003902095882222056,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49296875,
"reward_std": 0.1720191702246666,
"rewards/acc_reward_func": 0.49296875,
"step": 1710
},
{
"completion_length": 166.8859375,
"epoch": 6.887550200803213,
"grad_norm": 0.17060615122318268,
"kl": 0.0050950954668223854,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.45390625,
"reward_std": 0.16486406922340394,
"rewards/acc_reward_func": 0.45390625,
"step": 1715
},
{
"completion_length": 172.9375,
"epoch": 6.907630522088353,
"grad_norm": 0.2380588948726654,
"kl": 0.004966308176517486,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.47265625,
"reward_std": 0.2119983196258545,
"rewards/acc_reward_func": 0.47265625,
"step": 1720
},
{
"completion_length": 172.8171875,
"epoch": 6.927710843373494,
"grad_norm": 0.19691957533359528,
"kl": 0.004990886058658361,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5078125,
"reward_std": 0.17996236085891723,
"rewards/acc_reward_func": 0.5078125,
"step": 1725
},
{
"completion_length": 173.16875,
"epoch": 6.947791164658635,
"grad_norm": 0.17230121791362762,
"kl": 0.005301001155748964,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.53359375,
"reward_std": 0.18569555282592773,
"rewards/acc_reward_func": 0.53359375,
"step": 1730
},
{
"completion_length": 170.0234375,
"epoch": 6.967871485943775,
"grad_norm": 0.19846974313259125,
"kl": 0.004578445944935084,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5140625,
"reward_std": 0.19245370030403136,
"rewards/acc_reward_func": 0.5140625,
"step": 1735
},
{
"completion_length": 165.16015625,
"epoch": 6.9879518072289155,
"grad_norm": 0.16047954559326172,
"kl": 0.004474028572440147,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4859375,
"reward_std": 0.16750085949897767,
"rewards/acc_reward_func": 0.4859375,
"step": 1740
},
{
"completion_length": 136.57723236083984,
"epoch": 7.008032128514056,
"grad_norm": 0.17738589644432068,
"kl": 0.006900531239807606,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.4859375,
"reward_std": 0.18829993903636932,
"rewards/acc_reward_func": 0.4859375,
"step": 1745
},
{
"completion_length": 163.39609375,
"epoch": 7.028112449799197,
"grad_norm": 0.17355577647686005,
"kl": 0.004702023277059197,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5015625,
"reward_std": 0.18041169941425322,
"rewards/acc_reward_func": 0.5015625,
"step": 1750
},
{
"completion_length": 177.6109375,
"epoch": 7.048192771084337,
"grad_norm": 0.209559366106987,
"kl": 0.0037886591628193854,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.46875,
"reward_std": 0.1576360672712326,
"rewards/acc_reward_func": 0.46875,
"step": 1755
},
{
"completion_length": 187.11796875,
"epoch": 7.068273092369478,
"grad_norm": 0.17179812490940094,
"kl": 0.0032479729037731887,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.45546875,
"reward_std": 0.21962848901748658,
"rewards/acc_reward_func": 0.45546875,
"step": 1760
},
{
"completion_length": 169.584375,
"epoch": 7.088353413654619,
"grad_norm": 0.20904237031936646,
"kl": 0.005238852137699724,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.528125,
"reward_std": 0.19539960026741027,
"rewards/acc_reward_func": 0.528125,
"step": 1765
},
{
"completion_length": 167.30390625,
"epoch": 7.108433734939759,
"grad_norm": 0.304017037153244,
"kl": 0.006272567342966795,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.54921875,
"reward_std": 0.19177623689174653,
"rewards/acc_reward_func": 0.54921875,
"step": 1770
},
{
"completion_length": 156.928125,
"epoch": 7.128514056224899,
"grad_norm": 0.1864553540945053,
"kl": 0.003993002325296402,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5296875,
"reward_std": 0.19440407156944275,
"rewards/acc_reward_func": 0.5296875,
"step": 1775
},
{
"completion_length": 172.28515625,
"epoch": 7.14859437751004,
"grad_norm": 0.25102752447128296,
"kl": 0.006025291467085481,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.45625,
"reward_std": 0.19485165178775787,
"rewards/acc_reward_func": 0.45625,
"step": 1780
},
{
"completion_length": 164.67421875,
"epoch": 7.168674698795181,
"grad_norm": 0.17249836027622223,
"kl": 0.003704800782725215,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.52421875,
"reward_std": 0.17112577855587005,
"rewards/acc_reward_func": 0.52421875,
"step": 1785
},
{
"completion_length": 162.44296875,
"epoch": 7.188755020080321,
"grad_norm": 0.2556600570678711,
"kl": 0.005100478325039149,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5109375,
"reward_std": 0.19990214407444,
"rewards/acc_reward_func": 0.5109375,
"step": 1790
},
{
"completion_length": 172.34453125,
"epoch": 7.208835341365462,
"grad_norm": 0.22728115320205688,
"kl": 0.003885161271318793,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47109375,
"reward_std": 0.19129920601844788,
"rewards/acc_reward_func": 0.47109375,
"step": 1795
},
{
"completion_length": 170.40703125,
"epoch": 7.228915662650603,
"grad_norm": 0.17826801538467407,
"kl": 0.004675903636962175,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4421875,
"reward_std": 0.16262765228748322,
"rewards/acc_reward_func": 0.4421875,
"step": 1800
},
{
"completion_length": 179.565625,
"epoch": 7.2489959839357425,
"grad_norm": 0.20149245858192444,
"kl": 0.004339186474680901,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4546875,
"reward_std": 0.17435869872570037,
"rewards/acc_reward_func": 0.4546875,
"step": 1805
},
{
"completion_length": 166.63828125,
"epoch": 7.269076305220883,
"grad_norm": 0.20250743627548218,
"kl": 0.005552412709221244,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.47578125,
"reward_std": 0.17175332605838775,
"rewards/acc_reward_func": 0.47578125,
"step": 1810
},
{
"completion_length": 167.025,
"epoch": 7.289156626506024,
"grad_norm": 0.22346261143684387,
"kl": 0.004601765749976039,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49375,
"reward_std": 0.1652619868516922,
"rewards/acc_reward_func": 0.49375,
"step": 1815
},
{
"completion_length": 170.37109375,
"epoch": 7.309236947791165,
"grad_norm": 0.178205206990242,
"kl": 0.004340034676715732,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48515625,
"reward_std": 0.16681116968393325,
"rewards/acc_reward_func": 0.48515625,
"step": 1820
},
{
"completion_length": 168.8859375,
"epoch": 7.329317269076305,
"grad_norm": 0.1969699114561081,
"kl": 0.004392454726621508,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51171875,
"reward_std": 0.16091821789741517,
"rewards/acc_reward_func": 0.51171875,
"step": 1825
},
{
"completion_length": 165.66796875,
"epoch": 7.349397590361446,
"grad_norm": 0.20173302292823792,
"kl": 0.004101750953122973,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.53984375,
"reward_std": 0.18879991471767427,
"rewards/acc_reward_func": 0.53984375,
"step": 1830
},
{
"completion_length": 166.3125,
"epoch": 7.3694779116465865,
"grad_norm": 0.15750809013843536,
"kl": 0.004297500057145953,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48671875,
"reward_std": 0.16925580203533172,
"rewards/acc_reward_func": 0.48671875,
"step": 1835
},
{
"completion_length": 167.91171875,
"epoch": 7.389558232931727,
"grad_norm": 0.19564926624298096,
"kl": 0.004325853241607547,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4890625,
"reward_std": 0.16754734814167022,
"rewards/acc_reward_func": 0.4890625,
"step": 1840
},
{
"completion_length": 173.02578125,
"epoch": 7.409638554216867,
"grad_norm": 0.20001207292079926,
"kl": 0.004749092832207679,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48046875,
"reward_std": 0.19645450711250306,
"rewards/acc_reward_func": 0.48046875,
"step": 1845
},
{
"completion_length": 165.81640625,
"epoch": 7.429718875502008,
"grad_norm": 0.17301803827285767,
"kl": 0.0033940696623176335,
"learning_rate": 5e-06,
"loss": 0.0003,
"reward": 0.534375,
"reward_std": 0.17007530927658082,
"rewards/acc_reward_func": 0.534375,
"step": 1850
},
{
"completion_length": 162.24921875,
"epoch": 7.449799196787149,
"grad_norm": 0.25191670656204224,
"kl": 0.004400596721097827,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50390625,
"reward_std": 0.18351577818393708,
"rewards/acc_reward_func": 0.50390625,
"step": 1855
},
{
"completion_length": 169.48125,
"epoch": 7.469879518072289,
"grad_norm": 0.13689953088760376,
"kl": 0.0048028172459453344,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.515625,
"reward_std": 0.15137344598770142,
"rewards/acc_reward_func": 0.515625,
"step": 1860
},
{
"completion_length": 172.78125,
"epoch": 7.48995983935743,
"grad_norm": 0.1866185963153839,
"kl": 0.004676873050630093,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50078125,
"reward_std": 0.1989313393831253,
"rewards/acc_reward_func": 0.50078125,
"step": 1865
},
{
"completion_length": 164.3234375,
"epoch": 7.51004016064257,
"grad_norm": 0.23410245776176453,
"kl": 0.005325514962896705,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4703125,
"reward_std": 0.1925602823495865,
"rewards/acc_reward_func": 0.4703125,
"step": 1870
},
{
"completion_length": 170.21484375,
"epoch": 7.530120481927711,
"grad_norm": 0.19160370528697968,
"kl": 0.003813550900667906,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49453125,
"reward_std": 0.18627372831106187,
"rewards/acc_reward_func": 0.49453125,
"step": 1875
},
{
"completion_length": 171.9109375,
"epoch": 7.550200803212851,
"grad_norm": 0.19194425642490387,
"kl": 0.005130987428128719,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48046875,
"reward_std": 0.20168771743774414,
"rewards/acc_reward_func": 0.48046875,
"step": 1880
},
{
"completion_length": 165.7328125,
"epoch": 7.570281124497992,
"grad_norm": 0.1845492124557495,
"kl": 0.006337259523570537,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4859375,
"reward_std": 0.1606831043958664,
"rewards/acc_reward_func": 0.4859375,
"step": 1885
},
{
"completion_length": 164.1546875,
"epoch": 7.590361445783133,
"grad_norm": 0.22460918128490448,
"kl": 0.005079053528606892,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.55390625,
"reward_std": 0.16691578030586243,
"rewards/acc_reward_func": 0.55390625,
"step": 1890
},
{
"completion_length": 171.5453125,
"epoch": 7.610441767068274,
"grad_norm": 0.13173526525497437,
"kl": 0.004137717792764306,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50546875,
"reward_std": 0.18729869723320008,
"rewards/acc_reward_func": 0.50546875,
"step": 1895
},
{
"completion_length": 163.40859375,
"epoch": 7.6305220883534135,
"grad_norm": 0.20468363165855408,
"kl": 0.0035774966701865194,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51015625,
"reward_std": 0.16102381199598312,
"rewards/acc_reward_func": 0.51015625,
"step": 1900
},
{
"completion_length": 157.20234375,
"epoch": 7.650602409638554,
"grad_norm": 0.18477745354175568,
"kl": 0.00518039995804429,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5171875,
"reward_std": 0.1682313174009323,
"rewards/acc_reward_func": 0.5171875,
"step": 1905
},
{
"completion_length": 165.121875,
"epoch": 7.670682730923695,
"grad_norm": 0.24713313579559326,
"kl": 0.004455389454960823,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48203125,
"reward_std": 0.2017611101269722,
"rewards/acc_reward_func": 0.48203125,
"step": 1910
},
{
"completion_length": 158.24453125,
"epoch": 7.690763052208835,
"grad_norm": 0.27691611647605896,
"kl": 0.004452999541535973,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.52734375,
"reward_std": 0.15584605187177658,
"rewards/acc_reward_func": 0.52734375,
"step": 1915
},
{
"completion_length": 171.646875,
"epoch": 7.710843373493976,
"grad_norm": 0.17179369926452637,
"kl": 0.005658068554475903,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4984375,
"reward_std": 0.17356835007667543,
"rewards/acc_reward_func": 0.4984375,
"step": 1920
},
{
"completion_length": 165.35,
"epoch": 7.730923694779117,
"grad_norm": 0.18297934532165527,
"kl": 0.0038178108632564546,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51875,
"reward_std": 0.17515002489089965,
"rewards/acc_reward_func": 0.51875,
"step": 1925
},
{
"completion_length": 172.55625,
"epoch": 7.7510040160642575,
"grad_norm": 0.22155731916427612,
"kl": 0.004152493830770254,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51328125,
"reward_std": 0.22165172100067138,
"rewards/acc_reward_func": 0.51328125,
"step": 1930
},
{
"completion_length": 160.4171875,
"epoch": 7.771084337349397,
"grad_norm": 0.24871139228343964,
"kl": 0.004298145975917577,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50859375,
"reward_std": 0.154714697599411,
"rewards/acc_reward_func": 0.50859375,
"step": 1935
},
{
"completion_length": 171.86953125,
"epoch": 7.791164658634538,
"grad_norm": 0.19001755118370056,
"kl": 0.004053365485742688,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47890625,
"reward_std": 0.15913235396146774,
"rewards/acc_reward_func": 0.47890625,
"step": 1940
},
{
"completion_length": 167.07734375,
"epoch": 7.811244979919679,
"grad_norm": 0.16538827121257782,
"kl": 0.004332380229607224,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5328125,
"reward_std": 0.1584488719701767,
"rewards/acc_reward_func": 0.5328125,
"step": 1945
},
{
"completion_length": 170.06328125,
"epoch": 7.831325301204819,
"grad_norm": 0.2236621379852295,
"kl": 0.004883517092093825,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49765625,
"reward_std": 0.18454075157642363,
"rewards/acc_reward_func": 0.49765625,
"step": 1950
},
{
"completion_length": 169.584375,
"epoch": 7.85140562248996,
"grad_norm": 0.16362765431404114,
"kl": 0.004597957525402307,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5265625,
"reward_std": 0.18206597864627838,
"rewards/acc_reward_func": 0.5265625,
"step": 1955
},
{
"completion_length": 165.31953125,
"epoch": 7.871485943775101,
"grad_norm": 0.21278263628482819,
"kl": 0.0052021652925759556,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5703125,
"reward_std": 0.19598143100738524,
"rewards/acc_reward_func": 0.5703125,
"step": 1960
},
{
"completion_length": 162.621875,
"epoch": 7.891566265060241,
"grad_norm": 0.18768790364265442,
"kl": 0.005252905795350671,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5109375,
"reward_std": 0.1665752649307251,
"rewards/acc_reward_func": 0.5109375,
"step": 1965
},
{
"completion_length": 167.84765625,
"epoch": 7.911646586345381,
"grad_norm": 0.16724392771720886,
"kl": 0.005013928003609181,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.44921875,
"reward_std": 0.16862747073173523,
"rewards/acc_reward_func": 0.44921875,
"step": 1970
},
{
"completion_length": 169.7859375,
"epoch": 7.931726907630522,
"grad_norm": 0.21075892448425293,
"kl": 0.005731826461851597,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.49921875,
"reward_std": 0.16057721972465516,
"rewards/acc_reward_func": 0.49921875,
"step": 1975
},
{
"completion_length": 162.66484375,
"epoch": 7.951807228915663,
"grad_norm": 0.2669246196746826,
"kl": 0.004664321616292,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50390625,
"reward_std": 0.1739410638809204,
"rewards/acc_reward_func": 0.50390625,
"step": 1980
},
{
"completion_length": 163.11953125,
"epoch": 7.971887550200803,
"grad_norm": 0.266990602016449,
"kl": 0.0038902328815311193,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.525,
"reward_std": 0.16144144386053086,
"rewards/acc_reward_func": 0.525,
"step": 1985
},
{
"completion_length": 161.259375,
"epoch": 7.991967871485944,
"grad_norm": 0.20793935656547546,
"kl": 0.004720974247902632,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.525,
"reward_std": 0.1585804581642151,
"rewards/acc_reward_func": 0.525,
"step": 1990
},
{
"completion_length": 142.00937519073486,
"epoch": 8.012048192771084,
"grad_norm": 0.18195876479148865,
"kl": 0.004488107794895768,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51484375,
"reward_std": 0.157919442653656,
"rewards/acc_reward_func": 0.51484375,
"step": 1995
},
{
"completion_length": 168.15703125,
"epoch": 8.032128514056225,
"grad_norm": 0.16356365382671356,
"kl": 0.00401020054705441,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.53125,
"reward_std": 0.15263383090496063,
"rewards/acc_reward_func": 0.53125,
"step": 2000
},
{
"completion_length": 169.0984375,
"epoch": 8.052208835341366,
"grad_norm": 0.15103977918624878,
"kl": 0.005360491573810577,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.52734375,
"reward_std": 0.1702583134174347,
"rewards/acc_reward_func": 0.52734375,
"step": 2005
},
{
"completion_length": 172.23359375,
"epoch": 8.072289156626505,
"grad_norm": 0.17061328887939453,
"kl": 0.004422061378136277,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.440625,
"reward_std": 0.16981043815612792,
"rewards/acc_reward_func": 0.440625,
"step": 2010
},
{
"completion_length": 156.746875,
"epoch": 8.092369477911646,
"grad_norm": 0.19579732418060303,
"kl": 0.005747760739177465,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4703125,
"reward_std": 0.14048289954662324,
"rewards/acc_reward_func": 0.4703125,
"step": 2015
},
{
"completion_length": 178.1046875,
"epoch": 8.112449799196787,
"grad_norm": 0.20847676694393158,
"kl": 0.006182280369102955,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.51328125,
"reward_std": 0.1824621394276619,
"rewards/acc_reward_func": 0.51328125,
"step": 2020
},
{
"completion_length": 161.95234375,
"epoch": 8.132530120481928,
"grad_norm": 0.4776877164840698,
"kl": 0.004559037415310741,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5328125,
"reward_std": 0.16399539709091188,
"rewards/acc_reward_func": 0.5328125,
"step": 2025
},
{
"completion_length": 166.728125,
"epoch": 8.152610441767068,
"grad_norm": 0.17510586977005005,
"kl": 0.005545435380190611,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.55625,
"reward_std": 0.1712523579597473,
"rewards/acc_reward_func": 0.55625,
"step": 2030
},
{
"completion_length": 167.50859375,
"epoch": 8.17269076305221,
"grad_norm": 0.23873630166053772,
"kl": 0.007514993194490671,
"learning_rate": 5e-06,
"loss": 0.0008,
"reward": 0.4765625,
"reward_std": 0.2041398286819458,
"rewards/acc_reward_func": 0.4765625,
"step": 2035
},
{
"completion_length": 162.1515625,
"epoch": 8.19277108433735,
"grad_norm": 0.1659599393606186,
"kl": 0.0049521148204803465,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48515625,
"reward_std": 0.1511383280158043,
"rewards/acc_reward_func": 0.48515625,
"step": 2040
},
{
"completion_length": 171.9421875,
"epoch": 8.21285140562249,
"grad_norm": 0.16265946626663208,
"kl": 0.005077867861837148,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49453125,
"reward_std": 0.16176047623157502,
"rewards/acc_reward_func": 0.49453125,
"step": 2045
},
{
"completion_length": 157.54453125,
"epoch": 8.23293172690763,
"grad_norm": 0.41423535346984863,
"kl": 0.0043539782520383595,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51171875,
"reward_std": 0.1708886057138443,
"rewards/acc_reward_func": 0.51171875,
"step": 2050
},
{
"completion_length": 162.9234375,
"epoch": 8.25301204819277,
"grad_norm": 0.1766187697649002,
"kl": 0.005095082893967628,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51875,
"reward_std": 0.16586165279150009,
"rewards/acc_reward_func": 0.51875,
"step": 2055
},
{
"completion_length": 164.25859375,
"epoch": 8.273092369477911,
"grad_norm": 0.17100907862186432,
"kl": 0.006200623698532581,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.53515625,
"reward_std": 0.1657108411192894,
"rewards/acc_reward_func": 0.53515625,
"step": 2060
},
{
"completion_length": 178.175,
"epoch": 8.293172690763052,
"grad_norm": 0.19841331243515015,
"kl": 0.0059931413270533085,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4734375,
"reward_std": 0.1801495224237442,
"rewards/acc_reward_func": 0.4734375,
"step": 2065
},
{
"completion_length": 152.75859375,
"epoch": 8.313253012048193,
"grad_norm": 0.16599750518798828,
"kl": 0.0047016800846904514,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.6,
"reward_std": 0.1781999349594116,
"rewards/acc_reward_func": 0.6,
"step": 2070
},
{
"completion_length": 163.95703125,
"epoch": 8.333333333333334,
"grad_norm": 0.23247690498828888,
"kl": 0.004955739155411721,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.509375,
"reward_std": 0.15715982317924498,
"rewards/acc_reward_func": 0.509375,
"step": 2075
},
{
"completion_length": 176.896875,
"epoch": 8.353413654618475,
"grad_norm": 0.24125733971595764,
"kl": 0.0037874475121498106,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.49375,
"reward_std": 0.1779358506202698,
"rewards/acc_reward_func": 0.49375,
"step": 2080
},
{
"completion_length": 170.54765625,
"epoch": 8.373493975903614,
"grad_norm": 0.17222893238067627,
"kl": 0.0052736220881342884,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50390625,
"reward_std": 0.16901940405368804,
"rewards/acc_reward_func": 0.50390625,
"step": 2085
},
{
"completion_length": 185.83359375,
"epoch": 8.393574297188755,
"grad_norm": 0.20361952483654022,
"kl": 0.0035595998633652925,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47421875,
"reward_std": 0.1880659967660904,
"rewards/acc_reward_func": 0.47421875,
"step": 2090
},
{
"completion_length": 162.059375,
"epoch": 8.413654618473895,
"grad_norm": 0.18317574262619019,
"kl": 0.005798228364437818,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.55859375,
"reward_std": 0.17591487169265746,
"rewards/acc_reward_func": 0.55859375,
"step": 2095
},
{
"completion_length": 177.1671875,
"epoch": 8.433734939759036,
"grad_norm": 0.1929018348455429,
"kl": 0.004856448713690042,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5046875,
"reward_std": 0.17880854308605193,
"rewards/acc_reward_func": 0.5046875,
"step": 2100
},
{
"completion_length": 176.82265625,
"epoch": 8.453815261044177,
"grad_norm": 0.12986740469932556,
"kl": 0.004110977705568075,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50078125,
"reward_std": 0.17649128437042236,
"rewards/acc_reward_func": 0.50078125,
"step": 2105
},
{
"completion_length": 168.59140625,
"epoch": 8.473895582329318,
"grad_norm": 0.183445006608963,
"kl": 0.005346502503380179,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51953125,
"reward_std": 0.16657555997371673,
"rewards/acc_reward_func": 0.51953125,
"step": 2110
},
{
"completion_length": 162.40390625,
"epoch": 8.493975903614459,
"grad_norm": 0.2082839012145996,
"kl": 0.004679181473329663,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51875,
"reward_std": 0.16862600147724152,
"rewards/acc_reward_func": 0.51875,
"step": 2115
},
{
"completion_length": 164.0484375,
"epoch": 8.514056224899598,
"grad_norm": 0.20099391043186188,
"kl": 0.004028816474601626,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5078125,
"reward_std": 0.16499614715576172,
"rewards/acc_reward_func": 0.5078125,
"step": 2120
},
{
"completion_length": 175.53359375,
"epoch": 8.534136546184738,
"grad_norm": 0.20232976973056793,
"kl": 0.00470002549700439,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50859375,
"reward_std": 0.1925363451242447,
"rewards/acc_reward_func": 0.50859375,
"step": 2125
},
{
"completion_length": 178.6078125,
"epoch": 8.55421686746988,
"grad_norm": 0.1769324243068695,
"kl": 0.005247410014271736,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.47578125,
"reward_std": 0.2229382336139679,
"rewards/acc_reward_func": 0.47578125,
"step": 2130
},
{
"completion_length": 163.528125,
"epoch": 8.57429718875502,
"grad_norm": 0.2802134156227112,
"kl": 0.004398725088685751,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.54765625,
"reward_std": 0.18390424847602843,
"rewards/acc_reward_func": 0.54765625,
"step": 2135
},
{
"completion_length": 171.721875,
"epoch": 8.594377510040161,
"grad_norm": 0.18698906898498535,
"kl": 0.004597881296649575,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48046875,
"reward_std": 0.19990164637565613,
"rewards/acc_reward_func": 0.48046875,
"step": 2140
},
{
"completion_length": 166.28359375,
"epoch": 8.614457831325302,
"grad_norm": 0.2137596607208252,
"kl": 0.004542758595198393,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.50703125,
"reward_std": 0.16497241258621215,
"rewards/acc_reward_func": 0.50703125,
"step": 2145
},
{
"completion_length": 163.9171875,
"epoch": 8.634538152610443,
"grad_norm": 0.3083778917789459,
"kl": 0.004555982304736972,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4640625,
"reward_std": 0.19377221465110778,
"rewards/acc_reward_func": 0.4640625,
"step": 2150
},
{
"completion_length": 165.446875,
"epoch": 8.654618473895582,
"grad_norm": 0.18126215040683746,
"kl": 0.006295733619481325,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5046875,
"reward_std": 0.18124696910381316,
"rewards/acc_reward_func": 0.5046875,
"step": 2155
},
{
"completion_length": 163.82109375,
"epoch": 8.674698795180722,
"grad_norm": 0.20952458679676056,
"kl": 0.005493194237351418,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49453125,
"reward_std": 0.16941507160663605,
"rewards/acc_reward_func": 0.49453125,
"step": 2160
},
{
"completion_length": 171.4828125,
"epoch": 8.694779116465863,
"grad_norm": 0.22488608956336975,
"kl": 0.005020735878497362,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4875,
"reward_std": 0.18966611325740815,
"rewards/acc_reward_func": 0.4875,
"step": 2165
},
{
"completion_length": 168.55390625,
"epoch": 8.714859437751004,
"grad_norm": 0.1883411854505539,
"kl": 0.004444738104939461,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.51328125,
"reward_std": 0.18061638176441192,
"rewards/acc_reward_func": 0.51328125,
"step": 2170
},
{
"completion_length": 158.84375,
"epoch": 8.734939759036145,
"grad_norm": 0.1968211978673935,
"kl": 0.006332902424037457,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.465625,
"reward_std": 0.1959008514881134,
"rewards/acc_reward_func": 0.465625,
"step": 2175
},
{
"completion_length": 168.00546875,
"epoch": 8.755020080321286,
"grad_norm": 0.15639910101890564,
"kl": 0.004818708728998899,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5015625,
"reward_std": 0.15662813037633896,
"rewards/acc_reward_func": 0.5015625,
"step": 2180
},
{
"completion_length": 165.4875,
"epoch": 8.775100401606426,
"grad_norm": 0.10501320660114288,
"kl": 0.00518454764969647,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.459375,
"reward_std": 0.14335362315177919,
"rewards/acc_reward_func": 0.459375,
"step": 2185
},
{
"completion_length": 167.94296875,
"epoch": 8.795180722891565,
"grad_norm": 0.19659049808979034,
"kl": 0.0052264477126300335,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.4875,
"reward_std": 0.20181429982185364,
"rewards/acc_reward_func": 0.4875,
"step": 2190
},
{
"completion_length": 160.10625,
"epoch": 8.815261044176706,
"grad_norm": 0.15146048367023468,
"kl": 0.005564142344519496,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.553125,
"reward_std": 0.1699168175458908,
"rewards/acc_reward_func": 0.553125,
"step": 2195
},
{
"completion_length": 163.73203125,
"epoch": 8.835341365461847,
"grad_norm": 0.216526597738266,
"kl": 0.005784123670309782,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.546875,
"reward_std": 0.16578570008277893,
"rewards/acc_reward_func": 0.546875,
"step": 2200
},
{
"completion_length": 166.5453125,
"epoch": 8.855421686746988,
"grad_norm": 0.18292556703090668,
"kl": 0.004489894863218069,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.56484375,
"reward_std": 0.18732835054397584,
"rewards/acc_reward_func": 0.56484375,
"step": 2205
},
{
"completion_length": 163.53046875,
"epoch": 8.875502008032129,
"grad_norm": 0.2851000428199768,
"kl": 0.004820996290072798,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.53203125,
"reward_std": 0.18559517860412597,
"rewards/acc_reward_func": 0.53203125,
"step": 2210
},
{
"completion_length": 168.9859375,
"epoch": 8.89558232931727,
"grad_norm": 0.156977117061615,
"kl": 0.0042758449912071225,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5734375,
"reward_std": 0.1854059636592865,
"rewards/acc_reward_func": 0.5734375,
"step": 2215
},
{
"completion_length": 161.86640625,
"epoch": 8.91566265060241,
"grad_norm": 0.2522971034049988,
"kl": 0.006265667825937271,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.54296875,
"reward_std": 0.17101743817329407,
"rewards/acc_reward_func": 0.54296875,
"step": 2220
},
{
"completion_length": 174.11796875,
"epoch": 8.93574297188755,
"grad_norm": 0.21798557043075562,
"kl": 0.003508847579360008,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4421875,
"reward_std": 0.18943246006965636,
"rewards/acc_reward_func": 0.4421875,
"step": 2225
},
{
"completion_length": 163.63515625,
"epoch": 8.95582329317269,
"grad_norm": 0.2004886120557785,
"kl": 0.0062758251093328,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5109375,
"reward_std": 0.1985057294368744,
"rewards/acc_reward_func": 0.5109375,
"step": 2230
},
{
"completion_length": 165.74296875,
"epoch": 8.975903614457831,
"grad_norm": 0.1294558346271515,
"kl": 0.004269269946962595,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.546875,
"reward_std": 0.15892322957515717,
"rewards/acc_reward_func": 0.546875,
"step": 2235
},
{
"completion_length": 170.78515625,
"epoch": 8.995983935742972,
"grad_norm": 0.16309596598148346,
"kl": 0.0066596707329154015,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.53046875,
"reward_std": 0.17980958819389342,
"rewards/acc_reward_func": 0.53046875,
"step": 2240
},
{
"completion_length": 183.82723388671874,
"epoch": 9.016064257028113,
"grad_norm": 0.18240590393543243,
"kl": 0.00409495048224926,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5078125,
"reward_std": 0.17327906489372252,
"rewards/acc_reward_func": 0.5078125,
"step": 2245
},
{
"completion_length": 163.5921875,
"epoch": 9.036144578313253,
"grad_norm": 0.19653591513633728,
"kl": 0.004806891083717346,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.48046875,
"reward_std": 0.17293756902217866,
"rewards/acc_reward_func": 0.48046875,
"step": 2250
},
{
"completion_length": 168.98671875,
"epoch": 9.056224899598394,
"grad_norm": 0.16897033154964447,
"kl": 0.005633874516934157,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.484375,
"reward_std": 0.17367472350597382,
"rewards/acc_reward_func": 0.484375,
"step": 2255
},
{
"completion_length": 167.68046875,
"epoch": 9.076305220883533,
"grad_norm": 0.15388362109661102,
"kl": 0.005731765972450375,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.50703125,
"reward_std": 0.16397244334220887,
"rewards/acc_reward_func": 0.50703125,
"step": 2260
},
{
"completion_length": 167.546875,
"epoch": 9.096385542168674,
"grad_norm": 0.2113339751958847,
"kl": 0.0054182523861527445,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.58125,
"reward_std": 0.18346337378025054,
"rewards/acc_reward_func": 0.58125,
"step": 2265
},
{
"completion_length": 167.471875,
"epoch": 9.116465863453815,
"grad_norm": 0.16747760772705078,
"kl": 0.005155595624819398,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49609375,
"reward_std": 0.18549037277698516,
"rewards/acc_reward_func": 0.49609375,
"step": 2270
},
{
"completion_length": 166.6203125,
"epoch": 9.136546184738956,
"grad_norm": 0.19661560654640198,
"kl": 0.0053213945589959625,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.51171875,
"reward_std": 0.14984672516584396,
"rewards/acc_reward_func": 0.51171875,
"step": 2275
},
{
"completion_length": 177.2625,
"epoch": 9.156626506024097,
"grad_norm": 0.18499067425727844,
"kl": 0.0044435403309762474,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.47265625,
"reward_std": 0.17826109230518342,
"rewards/acc_reward_func": 0.47265625,
"step": 2280
},
{
"completion_length": 168.9625,
"epoch": 9.176706827309237,
"grad_norm": 0.1453307867050171,
"kl": 0.0044111269526183605,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.4953125,
"reward_std": 0.16639432013034822,
"rewards/acc_reward_func": 0.4953125,
"step": 2285
},
{
"completion_length": 164.628125,
"epoch": 9.196787148594378,
"grad_norm": 0.191897451877594,
"kl": 0.005432105902582407,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5765625,
"reward_std": 0.19979655146598815,
"rewards/acc_reward_func": 0.5765625,
"step": 2290
},
{
"completion_length": 170.79609375,
"epoch": 9.216867469879517,
"grad_norm": 0.1752876192331314,
"kl": 0.005702431686222554,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5078125,
"reward_std": 0.16036455929279328,
"rewards/acc_reward_func": 0.5078125,
"step": 2295
},
{
"completion_length": 165.3890625,
"epoch": 9.236947791164658,
"grad_norm": 0.19302144646644592,
"kl": 0.0056034672074019905,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.51953125,
"reward_std": 0.16439580023288727,
"rewards/acc_reward_func": 0.51953125,
"step": 2300
},
{
"completion_length": 164.69609375,
"epoch": 9.257028112449799,
"grad_norm": 0.20960208773612976,
"kl": 0.005645757727324963,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.52890625,
"reward_std": 0.1936678946018219,
"rewards/acc_reward_func": 0.52890625,
"step": 2305
},
{
"completion_length": 160.2625,
"epoch": 9.27710843373494,
"grad_norm": 0.20257046818733215,
"kl": 0.004606122244149446,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.578125,
"reward_std": 0.19579427540302277,
"rewards/acc_reward_func": 0.578125,
"step": 2310
},
{
"completion_length": 163.46796875,
"epoch": 9.29718875502008,
"grad_norm": 0.16352717578411102,
"kl": 0.005468207225203514,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5234375,
"reward_std": 0.13906579166650773,
"rewards/acc_reward_func": 0.5234375,
"step": 2315
},
{
"completion_length": 162.33203125,
"epoch": 9.317269076305221,
"grad_norm": 0.21218574047088623,
"kl": 0.006555710919201374,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.51640625,
"reward_std": 0.16097366213798522,
"rewards/acc_reward_func": 0.51640625,
"step": 2320
},
{
"completion_length": 171.3984375,
"epoch": 9.337349397590362,
"grad_norm": 0.19277319312095642,
"kl": 0.00576570238918066,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.48203125,
"reward_std": 0.18148907721042634,
"rewards/acc_reward_func": 0.48203125,
"step": 2325
},
{
"completion_length": 174.428125,
"epoch": 9.357429718875501,
"grad_norm": 0.17500832676887512,
"kl": 0.004855396132916212,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.471875,
"reward_std": 0.1707296222448349,
"rewards/acc_reward_func": 0.471875,
"step": 2330
},
{
"completion_length": 165.04453125,
"epoch": 9.377510040160642,
"grad_norm": 0.3611130714416504,
"kl": 0.006584520079195499,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.47890625,
"reward_std": 0.15434326231479645,
"rewards/acc_reward_func": 0.47890625,
"step": 2335
},
{
"completion_length": 167.3765625,
"epoch": 9.397590361445783,
"grad_norm": 0.20707273483276367,
"kl": 0.005232116673141718,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.53359375,
"reward_std": 0.18987955152988434,
"rewards/acc_reward_func": 0.53359375,
"step": 2340
},
{
"completion_length": 164.89921875,
"epoch": 9.417670682730924,
"grad_norm": 0.2210911512374878,
"kl": 0.004250374855473638,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5296875,
"reward_std": 0.16896798312664033,
"rewards/acc_reward_func": 0.5296875,
"step": 2345
},
{
"completion_length": 163.93359375,
"epoch": 9.437751004016064,
"grad_norm": 0.22661933302879333,
"kl": 0.005709053250029683,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.525,
"reward_std": 0.21452035903930664,
"rewards/acc_reward_func": 0.525,
"step": 2350
},
{
"completion_length": 170.8703125,
"epoch": 9.457831325301205,
"grad_norm": 0.1705859899520874,
"kl": 0.004198294645175338,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.50625,
"reward_std": 0.1844579130411148,
"rewards/acc_reward_func": 0.50625,
"step": 2355
},
{
"completion_length": 150.840625,
"epoch": 9.477911646586346,
"grad_norm": 0.19300290942192078,
"kl": 0.007862291485071182,
"learning_rate": 5e-06,
"loss": 0.0008,
"reward": 0.54765625,
"reward_std": 0.17046427428722383,
"rewards/acc_reward_func": 0.54765625,
"step": 2360
},
{
"completion_length": 169.303125,
"epoch": 9.497991967871485,
"grad_norm": 0.1754230111837387,
"kl": 0.006654448201879859,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.49375,
"reward_std": 0.16544548720121383,
"rewards/acc_reward_func": 0.49375,
"step": 2365
},
{
"completion_length": 173.57421875,
"epoch": 9.518072289156626,
"grad_norm": 0.17772729694843292,
"kl": 0.004821743769571185,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.49140625,
"reward_std": 0.1668173760175705,
"rewards/acc_reward_func": 0.49140625,
"step": 2370
},
{
"completion_length": 164.91640625,
"epoch": 9.538152610441767,
"grad_norm": 0.1230701357126236,
"kl": 0.0055628960952162744,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.496875,
"reward_std": 0.16751691251993178,
"rewards/acc_reward_func": 0.496875,
"step": 2375
},
{
"completion_length": 169.1609375,
"epoch": 9.558232931726907,
"grad_norm": 0.16143178939819336,
"kl": 0.0041476615704596044,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.528125,
"reward_std": 0.16299633979797362,
"rewards/acc_reward_func": 0.528125,
"step": 2380
},
{
"completion_length": 168.9625,
"epoch": 9.578313253012048,
"grad_norm": 0.2319881170988083,
"kl": 0.004491470381617546,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.52265625,
"reward_std": 0.17406990230083466,
"rewards/acc_reward_func": 0.52265625,
"step": 2385
},
{
"completion_length": 166.2421875,
"epoch": 9.598393574297189,
"grad_norm": 0.1952240914106369,
"kl": 0.006343753729015589,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.48359375,
"reward_std": 0.18732912838459015,
"rewards/acc_reward_func": 0.48359375,
"step": 2390
},
{
"completion_length": 163.60625,
"epoch": 9.61847389558233,
"grad_norm": 0.1586199551820755,
"kl": 0.005929756537079811,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.45859375,
"reward_std": 0.1640772521495819,
"rewards/acc_reward_func": 0.45859375,
"step": 2395
},
{
"completion_length": 179.434375,
"epoch": 9.638554216867469,
"grad_norm": 0.2399452179670334,
"kl": 0.0037348355166614057,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45625,
"reward_std": 0.15768255889415742,
"rewards/acc_reward_func": 0.45625,
"step": 2400
},
{
"completion_length": 172.32421875,
"epoch": 9.65863453815261,
"grad_norm": 0.20332050323486328,
"kl": 0.004412861214950681,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.54609375,
"reward_std": 0.2027107298374176,
"rewards/acc_reward_func": 0.54609375,
"step": 2405
},
{
"completion_length": 177.6375,
"epoch": 9.67871485943775,
"grad_norm": 0.17742620408535004,
"kl": 0.007015732675790786,
"learning_rate": 5e-06,
"loss": 0.0007,
"reward": 0.50625,
"reward_std": 0.21444422602653504,
"rewards/acc_reward_func": 0.50625,
"step": 2410
},
{
"completion_length": 166.58046875,
"epoch": 9.698795180722891,
"grad_norm": 0.2093236893415451,
"kl": 0.0038778461515903474,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.5125,
"reward_std": 0.19650691151618957,
"rewards/acc_reward_func": 0.5125,
"step": 2415
},
{
"completion_length": 169.15859375,
"epoch": 9.718875502008032,
"grad_norm": 0.1588141769170761,
"kl": 0.003884556284174323,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.55625,
"reward_std": 0.164942467212677,
"rewards/acc_reward_func": 0.55625,
"step": 2420
},
{
"completion_length": 168.29453125,
"epoch": 9.738955823293173,
"grad_norm": 0.2029934972524643,
"kl": 0.0038687960244715215,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.55390625,
"reward_std": 0.1814129412174225,
"rewards/acc_reward_func": 0.55390625,
"step": 2425
},
{
"completion_length": 159.940625,
"epoch": 9.759036144578314,
"grad_norm": 0.17102688550949097,
"kl": 0.006306747253984213,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5546875,
"reward_std": 0.17076005637645722,
"rewards/acc_reward_func": 0.5546875,
"step": 2430
},
{
"completion_length": 167.92265625,
"epoch": 9.779116465863455,
"grad_norm": 0.1867334544658661,
"kl": 0.007829534402117134,
"learning_rate": 5e-06,
"loss": 0.0008,
"reward": 0.50859375,
"reward_std": 0.18995470702648162,
"rewards/acc_reward_func": 0.50859375,
"step": 2435
},
{
"completion_length": 147.2453125,
"epoch": 9.799196787148594,
"grad_norm": 0.14552320539951324,
"kl": 0.006351995375007391,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.5515625,
"reward_std": 0.15960367023944855,
"rewards/acc_reward_func": 0.5515625,
"step": 2440
},
{
"completion_length": 166.71484375,
"epoch": 9.819277108433734,
"grad_norm": 0.19098646938800812,
"kl": 0.006422513630241156,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4953125,
"reward_std": 0.15442512482404708,
"rewards/acc_reward_func": 0.4953125,
"step": 2445
},
{
"completion_length": 165.13203125,
"epoch": 9.839357429718875,
"grad_norm": 0.27820637822151184,
"kl": 0.008421385521069169,
"learning_rate": 5e-06,
"loss": 0.0008,
"reward": 0.54296875,
"reward_std": 0.17557464838027953,
"rewards/acc_reward_func": 0.54296875,
"step": 2450
},
{
"completion_length": 159.103125,
"epoch": 9.859437751004016,
"grad_norm": 0.22965365648269653,
"kl": 0.00582457073032856,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.57421875,
"reward_std": 0.1618974894285202,
"rewards/acc_reward_func": 0.57421875,
"step": 2455
},
{
"completion_length": 162.40625,
"epoch": 9.879518072289157,
"grad_norm": 0.21567627787590027,
"kl": 0.006435764627531171,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.53984375,
"reward_std": 0.22751247882843018,
"rewards/acc_reward_func": 0.53984375,
"step": 2460
},
{
"completion_length": 165.2796875,
"epoch": 9.899598393574298,
"grad_norm": 0.14859847724437714,
"kl": 0.0051711639855057,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.521875,
"reward_std": 0.14700692594051362,
"rewards/acc_reward_func": 0.521875,
"step": 2465
},
{
"completion_length": 165.41796875,
"epoch": 9.919678714859439,
"grad_norm": 0.2174079269170761,
"kl": 0.004814123082906008,
"learning_rate": 5e-06,
"loss": 0.0005,
"reward": 0.5234375,
"reward_std": 0.15107814967632294,
"rewards/acc_reward_func": 0.5234375,
"step": 2470
},
{
"completion_length": 176.4578125,
"epoch": 9.939759036144578,
"grad_norm": 0.2032446563243866,
"kl": 0.005913135502487421,
"learning_rate": 5e-06,
"loss": 0.0006,
"reward": 0.4578125,
"reward_std": 0.16086679846048355,
"rewards/acc_reward_func": 0.4578125,
"step": 2475
},
{
"completion_length": 164.83515625,
"epoch": 9.959839357429718,
"grad_norm": 0.19223801791667938,
"kl": 0.003596154833212495,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.534375,
"reward_std": 0.16749543994665145,
"rewards/acc_reward_func": 0.534375,
"step": 2480
},
{
"completion_length": 169.6390625,
"epoch": 9.97991967871486,
"grad_norm": 0.20604898035526276,
"kl": 0.004213751200586558,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.48125,
"reward_std": 0.1420108899474144,
"rewards/acc_reward_func": 0.48125,
"step": 2485
},
{
"completion_length": 193.56361694335936,
"epoch": 10.0,
"grad_norm": 0.21623508632183075,
"kl": 0.003922218782827258,
"learning_rate": 5e-06,
"loss": 0.0004,
"reward": 0.45625,
"reward_std": 0.1598937392234802,
"rewards/acc_reward_func": 0.45625,
"step": 2490
},
{
"epoch": 10.0,
"step": 2490,
"total_flos": 0.0,
"train_loss": 0.0003452953377523063,
"train_runtime": 112376.2194,
"train_samples_per_second": 0.707,
"train_steps_per_second": 0.022
}
],
"logging_steps": 5,
"max_steps": 2490,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}