{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.403361344537815, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 208.0, "epoch": 0.008403361344537815, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1 }, { "completion_length": 256.0, "epoch": 0.01680672268907563, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 2 }, { "completion_length": 256.0, "epoch": 0.025210084033613446, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 3 }, { "completion_length": 256.0, "epoch": 0.03361344537815126, "grad_norm": 0.8838915228843689, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 4 }, { "completion_length": 256.0, "epoch": 0.04201680672268908, "grad_norm": 0.8301438093185425, "kl": 0.0009573494317010045, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 5 }, { "completion_length": 256.0, "epoch": 0.05042016806722689, "grad_norm": 1.043217658996582, "kl": 0.0007592927431687713, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 6 }, { "completion_length": 235.5, "epoch": 0.058823529411764705, "grad_norm": 1.012459635734558, "kl": 0.0005637629656121135, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 7 }, { "completion_length": 256.0, "epoch": 0.06722689075630252, "grad_norm": 0.005794348195195198, "kl": 0.0007739531574770808, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 8 }, { "completion_length": 256.0, "epoch": 0.07563025210084033, "grad_norm": 0.0030261946376413107, "kl": 0.00047754880506545305, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 9 }, { "completion_length": 154.0, "epoch": 0.08403361344537816, "grad_norm": 1.6739884614944458, "kl": 0.0011951741762459278, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 10 }, { "completion_length": 216.0, "epoch": 0.09243697478991597, "grad_norm": 1.0617121458053589, "kl": 0.0005866724532097578, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 11 }, { "completion_length": 256.0, "epoch": 0.10084033613445378, "grad_norm": 0.9380314350128174, "kl": 0.0006365124136209488, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 12 }, { "completion_length": 256.0, "epoch": 0.1092436974789916, "grad_norm": 0.004214159213006496, "kl": 0.0007441662019118667, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 13 }, { "completion_length": 182.0, "epoch": 0.11764705882352941, "grad_norm": 1.2295690774917603, "kl": 0.0007082950905896723, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 14 }, { "completion_length": 256.0, "epoch": 0.12605042016806722, "grad_norm": 0.004315641708672047, "kl": 0.0006666813278570771, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 15 }, { "completion_length": 256.0, "epoch": 0.13445378151260504, "grad_norm": 0.8514454364776611, "kl": 0.0004808601224794984, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 16 }, { "completion_length": 256.0, "epoch": 0.14285714285714285, "grad_norm": 0.0051048225723207, "kl": 0.0006351690972223878, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 17 }, { "completion_length": 256.0, "epoch": 0.15126050420168066, "grad_norm": 0.004074061755090952, "kl": 0.0006499768933281302, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 18 }, { "completion_length": 256.0, "epoch": 0.15966386554621848, "grad_norm": 0.004084853455424309, "kl": 0.0006889225915074348, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 19 }, { "completion_length": 148.0, "epoch": 0.16806722689075632, "grad_norm": 1.7451705932617188, "kl": 0.0006731421453878284, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 20 }, { "completion_length": 229.5, "epoch": 0.17647058823529413, "grad_norm": 1.0134456157684326, "kl": 0.0008167714113369584, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 21 }, { "completion_length": 95.0, "epoch": 0.18487394957983194, "grad_norm": 1.4376728534698486, "kl": 0.0003571448614820838, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 0.2175000011920929, "reward_std": 0.2241528332233429, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.2175000011920929, "step": 22 }, { "completion_length": 256.0, "epoch": 0.19327731092436976, "grad_norm": 0.9212502241134644, "kl": 0.0007788074435666203, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 23 }, { "completion_length": 256.0, "epoch": 0.20168067226890757, "grad_norm": 0.0038035286124795675, "kl": 0.0007109848083928227, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 24 }, { "completion_length": 191.0, "epoch": 0.21008403361344538, "grad_norm": 0.006888057105243206, "kl": 0.0004465555539354682, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 25 }, { "completion_length": 158.0, "epoch": 0.2184873949579832, "grad_norm": 1.6241445541381836, "kl": 0.0006565352086909115, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 26 }, { "completion_length": 256.0, "epoch": 0.226890756302521, "grad_norm": 0.0034175037872046232, "kl": 0.000486561912111938, "learning_rate": 1.3500000000000002e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 27 }, { "completion_length": 256.0, "epoch": 0.23529411764705882, "grad_norm": 0.0036285840906202793, "kl": 0.0006047994829714298, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 28 }, { "completion_length": 256.0, "epoch": 0.24369747899159663, "grad_norm": 0.0032581069972366095, "kl": 0.00038483343087136745, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 29 }, { "completion_length": 256.0, "epoch": 0.25210084033613445, "grad_norm": 0.007449345197528601, "kl": 0.000766490469686687, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 30 }, { "completion_length": 256.0, "epoch": 0.2605042016806723, "grad_norm": 0.004598978906869888, "kl": 0.0005899533862248063, "learning_rate": 1.5500000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 31 }, { "completion_length": 256.0, "epoch": 0.2689075630252101, "grad_norm": 0.9563409090042114, "kl": 0.0007002416532486677, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 32 }, { "completion_length": 256.0, "epoch": 0.2773109243697479, "grad_norm": 0.0032359708566218615, "kl": 0.0004646314773708582, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 33 }, { "completion_length": 256.0, "epoch": 0.2857142857142857, "grad_norm": 0.00397283211350441, "kl": 0.0006836783140897751, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 34 }, { "completion_length": 238.0, "epoch": 0.29411764705882354, "grad_norm": 0.00816441886126995, "kl": 0.0009063881007023156, "learning_rate": 1.75e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 35 }, { "completion_length": 256.0, "epoch": 0.3025210084033613, "grad_norm": 0.9932153224945068, "kl": 0.000791319995187223, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 36 }, { "completion_length": 256.0, "epoch": 0.31092436974789917, "grad_norm": 0.004532299004495144, "kl": 0.0007765735499560833, "learning_rate": 1.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 37 }, { "completion_length": 256.0, "epoch": 0.31932773109243695, "grad_norm": 0.004246938973665237, "kl": 0.0007269444176927209, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 38 }, { "completion_length": 161.5, "epoch": 0.3277310924369748, "grad_norm": 1.6179838180541992, "kl": 0.0008333936566486955, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 39 }, { "completion_length": 256.0, "epoch": 0.33613445378151263, "grad_norm": 0.003035498782992363, "kl": 0.0005902517586946487, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 40 }, { "completion_length": 256.0, "epoch": 0.3445378151260504, "grad_norm": 0.0025484724901616573, "kl": 0.0004038845654577017, "learning_rate": 2.05e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 41 }, { "completion_length": 256.0, "epoch": 0.35294117647058826, "grad_norm": 0.003505554748699069, "kl": 0.0006591258570551872, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 42 }, { "completion_length": 231.5, "epoch": 0.36134453781512604, "grad_norm": 0.0046354010701179504, "kl": 0.0007136644562706351, "learning_rate": 2.15e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 43 }, { "completion_length": 256.0, "epoch": 0.3697478991596639, "grad_norm": 0.0032716356217861176, "kl": 0.0006846879841759801, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 44 }, { "completion_length": 256.0, "epoch": 0.37815126050420167, "grad_norm": 0.004479558672755957, "kl": 0.0007645890582352877, "learning_rate": 2.25e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 45 }, { "completion_length": 256.0, "epoch": 0.3865546218487395, "grad_norm": 0.003661304945126176, "kl": 0.0005422246176749468, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 46 }, { "completion_length": 256.0, "epoch": 0.3949579831932773, "grad_norm": 0.8428875803947449, "kl": 0.0007787372451275587, "learning_rate": 2.35e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 47 }, { "completion_length": 100.5, "epoch": 0.40336134453781514, "grad_norm": 0.00735253794118762, "kl": 0.0009225605172105134, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 48 }, { "completion_length": 256.0, "epoch": 0.4117647058823529, "grad_norm": 0.8739237785339355, "kl": 0.0006543750641867518, "learning_rate": 2.4500000000000003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 49 }, { "completion_length": 256.0, "epoch": 0.42016806722689076, "grad_norm": 0.0060773007571697235, "kl": 0.000822792062535882, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 50 }, { "completion_length": 256.0, "epoch": 0.42857142857142855, "grad_norm": 0.010451748967170715, "kl": 0.0012671941658481956, "learning_rate": 2.55e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 51 }, { "completion_length": 99.5, "epoch": 0.4369747899159664, "grad_norm": 0.006612293887883425, "kl": 0.0008995666867122054, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 52 }, { "completion_length": 256.0, "epoch": 0.44537815126050423, "grad_norm": 0.9036616086959839, "kl": 0.0008495927322655916, "learning_rate": 2.6500000000000005e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 53 }, { "completion_length": 195.0, "epoch": 0.453781512605042, "grad_norm": 1.3959556818008423, "kl": 0.0010955859906971455, "learning_rate": 2.7000000000000004e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 54 }, { "completion_length": 256.0, "epoch": 0.46218487394957986, "grad_norm": 0.849948525428772, "kl": 0.0007333974353969097, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 55 }, { "completion_length": 225.0, "epoch": 0.47058823529411764, "grad_norm": 0.870661199092865, "kl": 0.000870745861902833, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 56 }, { "completion_length": 256.0, "epoch": 0.4789915966386555, "grad_norm": 0.0075195119716227055, "kl": 0.0012707834830507636, "learning_rate": 2.85e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 57 }, { "completion_length": 256.0, "epoch": 0.48739495798319327, "grad_norm": 0.0033428138121962547, "kl": 0.0005659679882228374, "learning_rate": 2.9e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 58 }, { "completion_length": 217.0, "epoch": 0.4957983193277311, "grad_norm": 0.010066932067275047, "kl": 0.001386146410368383, "learning_rate": 2.95e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 59 }, { "completion_length": 256.0, "epoch": 0.5042016806722689, "grad_norm": 0.005329888314008713, "kl": 0.000831706216558814, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 60 }, { "completion_length": 256.0, "epoch": 0.5126050420168067, "grad_norm": 0.01864616759121418, "kl": 0.002434937749058008, "learning_rate": 3.05e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 61 }, { "completion_length": 253.5, "epoch": 0.5210084033613446, "grad_norm": 0.950167715549469, "kl": 0.0012768663000315428, "learning_rate": 3.1000000000000004e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 62 }, { "completion_length": 256.0, "epoch": 0.5294117647058824, "grad_norm": 0.008330680429935455, "kl": 0.0012756225187331438, "learning_rate": 3.1500000000000003e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 63 }, { "completion_length": 140.5, "epoch": 0.5378151260504201, "grad_norm": 0.014831366017460823, "kl": 0.0031117983162403107, "learning_rate": 3.2000000000000003e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 64 }, { "completion_length": 256.0, "epoch": 0.5462184873949579, "grad_norm": 0.9908618927001953, "kl": 0.0014188946224749088, "learning_rate": 3.2500000000000002e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 65 }, { "completion_length": 256.0, "epoch": 0.5546218487394958, "grad_norm": 0.7165637016296387, "kl": 0.0006240149959921837, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 66 }, { "completion_length": 91.5, "epoch": 0.5630252100840336, "grad_norm": 0.018771149218082428, "kl": 0.0030818600207567215, "learning_rate": 3.3500000000000005e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 67 }, { "completion_length": 169.5, "epoch": 0.5714285714285714, "grad_norm": 1.1422396898269653, "kl": 0.001963505521416664, "learning_rate": 3.4000000000000005e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 68 }, { "completion_length": 225.5, "epoch": 0.5798319327731093, "grad_norm": 0.014802279882133007, "kl": 0.0021320898085832596, "learning_rate": 3.45e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 69 }, { "completion_length": 256.0, "epoch": 0.5882352941176471, "grad_norm": 0.9517803192138672, "kl": 0.0013911720598116517, "learning_rate": 3.5e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 70 }, { "completion_length": 256.0, "epoch": 0.5966386554621849, "grad_norm": 1.2495174407958984, "kl": 0.001323533128015697, "learning_rate": 3.5500000000000003e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 71 }, { "completion_length": 188.0, "epoch": 0.6050420168067226, "grad_norm": 1.1743896007537842, "kl": 0.002660756465047598, "learning_rate": 3.6000000000000003e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 72 }, { "completion_length": 256.0, "epoch": 0.6134453781512605, "grad_norm": 0.003697387408465147, "kl": 0.0006548548117280006, "learning_rate": 3.65e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 73 }, { "completion_length": 256.0, "epoch": 0.6218487394957983, "grad_norm": 0.8471294045448303, "kl": 0.0012757513904944062, "learning_rate": 3.7e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 74 }, { "completion_length": 256.0, "epoch": 0.6302521008403361, "grad_norm": 0.009078427217900753, "kl": 0.0013090419815853238, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 75 }, { "completion_length": 256.0, "epoch": 0.6386554621848739, "grad_norm": 0.9492903351783752, "kl": 0.002218120265752077, "learning_rate": 3.8000000000000005e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 76 }, { "completion_length": 256.0, "epoch": 0.6470588235294118, "grad_norm": 0.0035131191834807396, "kl": 0.000702213728800416, "learning_rate": 3.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 77 }, { "completion_length": 228.5, "epoch": 0.6554621848739496, "grad_norm": 0.005907819140702486, "kl": 0.0012794373324140906, "learning_rate": 3.900000000000001e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 78 }, { "completion_length": 256.0, "epoch": 0.6638655462184874, "grad_norm": 0.9454696178436279, "kl": 0.0018798538949340582, "learning_rate": 3.95e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 79 }, { "completion_length": 256.0, "epoch": 0.6722689075630253, "grad_norm": 0.011725543066859245, "kl": 0.0036349850706756115, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 80 }, { "completion_length": 182.0, "epoch": 0.680672268907563, "grad_norm": 0.0036761611700057983, "kl": 0.0005937532987445593, "learning_rate": 4.05e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 81 }, { "completion_length": 238.5, "epoch": 0.6890756302521008, "grad_norm": 0.00544555252417922, "kl": 0.0012453272938728333, "learning_rate": 4.1e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 82 }, { "completion_length": 256.0, "epoch": 0.6974789915966386, "grad_norm": 0.002836729632690549, "kl": 0.0004919321509078145, "learning_rate": 4.15e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 83 }, { "completion_length": 256.0, "epoch": 0.7058823529411765, "grad_norm": 0.002849327167496085, "kl": 0.00048355443868786097, "learning_rate": 4.2000000000000004e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 84 }, { "completion_length": 163.0, "epoch": 0.7142857142857143, "grad_norm": 0.023407526314258575, "kl": 0.008791845291852951, "learning_rate": 4.25e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 85 }, { "completion_length": 256.0, "epoch": 0.7226890756302521, "grad_norm": 0.003463078523054719, "kl": 0.0007299153367057443, "learning_rate": 4.3e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 86 }, { "completion_length": 256.0, "epoch": 0.7310924369747899, "grad_norm": 0.007253331132233143, "kl": 0.004142088815569878, "learning_rate": 4.350000000000001e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 87 }, { "completion_length": 256.0, "epoch": 0.7394957983193278, "grad_norm": 0.5496742129325867, "kl": 0.010586329735815525, "learning_rate": 4.4e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 88 }, { "completion_length": 256.0, "epoch": 0.7478991596638656, "grad_norm": 0.964024543762207, "kl": 0.0029028397984802723, "learning_rate": 4.450000000000001e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 89 }, { "completion_length": 83.0, "epoch": 0.7563025210084033, "grad_norm": 0.0122342174872756, "kl": 0.0071075791493058205, "learning_rate": 4.5e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 90 }, { "completion_length": 136.5, "epoch": 0.7647058823529411, "grad_norm": 0.004309786017984152, "kl": 0.002355900825932622, "learning_rate": 4.5500000000000005e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 91 }, { "completion_length": 256.0, "epoch": 0.773109243697479, "grad_norm": 0.009771923534572124, "kl": 0.000915266340598464, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 92 }, { "completion_length": 220.0, "epoch": 0.7815126050420168, "grad_norm": 0.004257082939147949, "kl": 0.00190926983486861, "learning_rate": 4.65e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 93 }, { "completion_length": 179.0, "epoch": 0.7899159663865546, "grad_norm": 0.9655229449272156, "kl": 0.005708057899028063, "learning_rate": 4.7e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 94 }, { "completion_length": 256.0, "epoch": 0.7983193277310925, "grad_norm": 0.006459425203502178, "kl": 0.0011771931312978268, "learning_rate": 4.75e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 95 }, { "completion_length": 225.5, "epoch": 0.8067226890756303, "grad_norm": 0.017570044845342636, "kl": 0.004837444052100182, "learning_rate": 4.800000000000001e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 96 }, { "completion_length": 256.0, "epoch": 0.8151260504201681, "grad_norm": 0.004950360860675573, "kl": 0.0007728399941697717, "learning_rate": 4.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 97 }, { "completion_length": 256.0, "epoch": 0.8235294117647058, "grad_norm": 0.005911249667406082, "kl": 0.001108302385546267, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 98 }, { "completion_length": 256.0, "epoch": 0.8319327731092437, "grad_norm": 0.0036189311649650335, "kl": 0.000541006913408637, "learning_rate": 4.95e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 99 }, { "completion_length": 256.0, "epoch": 0.8403361344537815, "grad_norm": 0.007687319535762072, "kl": 0.0007422353373840451, "learning_rate": 5e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 100 }, { "completion_length": 125.0, "epoch": 0.8487394957983193, "grad_norm": 0.00810989085584879, "kl": 0.0014798549236729741, "learning_rate": 4.999984769144476e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 101 }, { "completion_length": 165.0, "epoch": 0.8571428571428571, "grad_norm": 0.008330943062901497, "kl": 0.006028025411069393, "learning_rate": 4.999939076763487e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 102 }, { "completion_length": 223.5, "epoch": 0.865546218487395, "grad_norm": 0.006431542336940765, "kl": 0.003954974934458733, "learning_rate": 4.999862923413781e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 103 }, { "completion_length": 256.0, "epoch": 0.8739495798319328, "grad_norm": 0.003713998943567276, "kl": 0.0006493695545941591, "learning_rate": 4.999756310023261e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 104 }, { "completion_length": 235.0, "epoch": 0.8823529411764706, "grad_norm": 0.986025333404541, "kl": 0.005234128329902887, "learning_rate": 4.9996192378909785e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 105 }, { "completion_length": 256.0, "epoch": 0.8907563025210085, "grad_norm": 0.9308247566223145, "kl": 0.003449173178523779, "learning_rate": 4.999451708687114e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 106 }, { "completion_length": 256.0, "epoch": 0.8991596638655462, "grad_norm": 0.0033724121749401093, "kl": 0.0007107893470674753, "learning_rate": 4.9992537244529585e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 107 }, { "completion_length": 256.0, "epoch": 0.907563025210084, "grad_norm": 0.004898673389106989, "kl": 0.0008465611608698964, "learning_rate": 4.999025287600886e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 108 }, { "completion_length": 219.0, "epoch": 0.9159663865546218, "grad_norm": 0.020835796371102333, "kl": 0.007097202353179455, "learning_rate": 4.998766400914329e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 109 }, { "completion_length": 256.0, "epoch": 0.9243697478991597, "grad_norm": 0.0033773048780858517, "kl": 0.0006910396041348577, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 110 }, { "completion_length": 239.0, "epoch": 0.9327731092436975, "grad_norm": 0.0072030615992844105, "kl": 0.0014091921038925648, "learning_rate": 4.998157291026553e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 111 }, { "completion_length": 256.0, "epoch": 0.9411764705882353, "grad_norm": 0.9274163842201233, "kl": 0.0008682548068463802, "learning_rate": 4.997807075247147e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 112 }, { "completion_length": 248.5, "epoch": 0.9495798319327731, "grad_norm": 0.008699854835867882, "kl": 0.0038457466289401054, "learning_rate": 4.997426424476787e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 113 }, { "completion_length": 256.0, "epoch": 0.957983193277311, "grad_norm": 0.0041854046285152435, "kl": 0.000782247050665319, "learning_rate": 4.9970153433535855e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 114 }, { "completion_length": 256.0, "epoch": 0.9663865546218487, "grad_norm": 0.004786860663443804, "kl": 0.0008156679105013609, "learning_rate": 4.9965738368864345e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 115 }, { "completion_length": 256.0, "epoch": 0.9747899159663865, "grad_norm": 0.004513249732553959, "kl": 0.003867537248879671, "learning_rate": 4.996101910454953e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 116 }, { "completion_length": 256.0, "epoch": 0.9831932773109243, "grad_norm": 0.003847523359581828, "kl": 0.0024901253636926413, "learning_rate": 4.995599569809414e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 117 }, { "completion_length": 256.0, "epoch": 0.9915966386554622, "grad_norm": 0.004254134371876717, "kl": 0.0019215474603697658, "learning_rate": 4.9950668210706795e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 118 }, { "completion_length": 227.0, "epoch": 1.0, "grad_norm": 0.009334280155599117, "kl": 0.00401033041998744, "learning_rate": 4.994503670730126e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 119 }, { "completion_length": 256.0, "epoch": 1.0084033613445378, "grad_norm": 0.9668896198272705, "kl": 0.020097780972719193, "learning_rate": 4.993910125649561e-06, "loss": 0.0008, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 120 }, { "completion_length": 142.5, "epoch": 1.0168067226890756, "grad_norm": 0.007522458676248789, "kl": 0.00825230497866869, "learning_rate": 4.993286193061145e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 121 }, { "completion_length": 256.0, "epoch": 1.0252100840336134, "grad_norm": 0.0041074506007134914, "kl": 0.002673591487109661, "learning_rate": 4.992631880567301e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 122 }, { "completion_length": 200.0, "epoch": 1.0336134453781514, "grad_norm": 0.00902502704411745, "kl": 0.004741402808576822, "learning_rate": 4.991947196140619e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 123 }, { "completion_length": 256.0, "epoch": 1.0420168067226891, "grad_norm": 0.9168788194656372, "kl": 0.004580065608024597, "learning_rate": 4.9912321481237616e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 124 }, { "completion_length": 156.5, "epoch": 1.050420168067227, "grad_norm": 1.5969916582107544, "kl": 0.008216320537030697, "learning_rate": 4.990486745229364e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 125 }, { "completion_length": 256.0, "epoch": 1.0588235294117647, "grad_norm": 0.8538263440132141, "kl": 0.0009729882003739476, "learning_rate": 4.989710996539926e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 126 }, { "completion_length": 145.0, "epoch": 1.0672268907563025, "grad_norm": 0.00484515680000186, "kl": 0.001036220695823431, "learning_rate": 4.9889049115077e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 127 }, { "completion_length": 222.0, "epoch": 1.0756302521008403, "grad_norm": 0.011807057075202465, "kl": 0.010053349658846855, "learning_rate": 4.988068499954578e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 128 }, { "completion_length": 256.0, "epoch": 1.084033613445378, "grad_norm": 0.0037358258850872517, "kl": 0.0007541521918028593, "learning_rate": 4.987201772071971e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 129 }, { "completion_length": 256.0, "epoch": 1.092436974789916, "grad_norm": 0.006020596716552973, "kl": 0.005797234829515219, "learning_rate": 4.986304738420684e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 130 }, { "completion_length": 256.0, "epoch": 1.1008403361344539, "grad_norm": 0.8249835968017578, "kl": 0.005717151798307896, "learning_rate": 4.985377409930789e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 131 }, { "completion_length": 256.0, "epoch": 1.1092436974789917, "grad_norm": 0.00477702496573329, "kl": 0.0007496282923966646, "learning_rate": 4.984419797901491e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 132 }, { "completion_length": 256.0, "epoch": 1.1176470588235294, "grad_norm": 0.004853010643273592, "kl": 0.000954065821133554, "learning_rate": 4.983431914000991e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 133 }, { "completion_length": 256.0, "epoch": 1.1260504201680672, "grad_norm": 0.007084459997713566, "kl": 0.004249719902873039, "learning_rate": 4.9824137702663424e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 134 }, { "completion_length": 155.5, "epoch": 1.134453781512605, "grad_norm": 0.008030904456973076, "kl": 0.004385617095977068, "learning_rate": 4.981365379103306e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 135 }, { "completion_length": 256.0, "epoch": 1.1428571428571428, "grad_norm": 0.006705970503389835, "kl": 0.0021309468429535627, "learning_rate": 4.980286753286196e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 136 }, { "completion_length": 256.0, "epoch": 1.1512605042016806, "grad_norm": 0.003741542110219598, "kl": 0.0006714507471770048, "learning_rate": 4.979177905957726e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 137 }, { "completion_length": 91.5, "epoch": 1.1596638655462184, "grad_norm": 0.010227271355688572, "kl": 0.007595442235469818, "learning_rate": 4.978038850628855e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 138 }, { "completion_length": 149.5, "epoch": 1.1680672268907564, "grad_norm": 0.005960332229733467, "kl": 0.003082948736846447, "learning_rate": 4.9768696011786095e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 139 }, { "completion_length": 181.5, "epoch": 1.1764705882352942, "grad_norm": 0.011940766125917435, "kl": 0.004800532013177872, "learning_rate": 4.975670171853926e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 140 }, { "completion_length": 188.5, "epoch": 1.184873949579832, "grad_norm": 0.004682076629251242, "kl": 0.0019342084415256977, "learning_rate": 4.974440577269473e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 141 }, { "completion_length": 164.0, "epoch": 1.1932773109243697, "grad_norm": 0.014307919889688492, "kl": 0.01174528244882822, "learning_rate": 4.973180832407471e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 142 }, { "completion_length": 134.5, "epoch": 1.2016806722689075, "grad_norm": 0.013058997690677643, "kl": 0.009515737183392048, "learning_rate": 4.971890952617515e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 143 }, { "completion_length": 255.0, "epoch": 1.2100840336134453, "grad_norm": 0.00921135488897562, "kl": 0.004956630989909172, "learning_rate": 4.970570953616383e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 144 }, { "completion_length": 256.0, "epoch": 1.2184873949579833, "grad_norm": 0.005896175280213356, "kl": 0.0010380554012954235, "learning_rate": 4.9692208514878445e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 145 }, { "completion_length": 256.0, "epoch": 1.226890756302521, "grad_norm": 0.007672092877328396, "kl": 0.003832828253507614, "learning_rate": 4.96784066268247e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 146 }, { "completion_length": 256.0, "epoch": 1.2352941176470589, "grad_norm": 0.005527228582650423, "kl": 0.0018821400590240955, "learning_rate": 4.966430404017424e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 147 }, { "completion_length": 256.0, "epoch": 1.2436974789915967, "grad_norm": 0.006057245656847954, "kl": 0.0012837192043662071, "learning_rate": 4.964990092676263e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 148 }, { "completion_length": 249.0, "epoch": 1.2521008403361344, "grad_norm": 0.8715924024581909, "kl": 0.000737156136892736, "learning_rate": 4.963519746208726e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 149 }, { "completion_length": 109.0, "epoch": 1.2605042016806722, "grad_norm": 0.006599591579288244, "kl": 0.001600385643541813, "learning_rate": 4.962019382530521e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 150 }, { "completion_length": 256.0, "epoch": 1.26890756302521, "grad_norm": 0.009375480934977531, "kl": 0.0025813505053520203, "learning_rate": 4.960489019923105e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 151 }, { "completion_length": 245.5, "epoch": 1.2773109243697478, "grad_norm": 0.011371919885277748, "kl": 0.003889144165441394, "learning_rate": 4.958928677033465e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 152 }, { "completion_length": 256.0, "epoch": 1.2857142857142856, "grad_norm": 0.003929056227207184, "kl": 0.0007801703177392483, "learning_rate": 4.957338372873886e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 153 }, { "completion_length": 256.0, "epoch": 1.2941176470588236, "grad_norm": 0.0035867104306817055, "kl": 0.0007854284485802054, "learning_rate": 4.9557181268217225e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 154 }, { "completion_length": 256.0, "epoch": 1.3025210084033614, "grad_norm": 0.0065237972885370255, "kl": 0.0007837532320991158, "learning_rate": 4.9540679586191605e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 155 }, { "completion_length": 124.0, "epoch": 1.3109243697478992, "grad_norm": 0.005302933510392904, "kl": 0.001210737507790327, "learning_rate": 4.9523878883729794e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 156 }, { "completion_length": 130.5, "epoch": 1.319327731092437, "grad_norm": 0.014133527874946594, "kl": 0.0037793531082570553, "learning_rate": 4.9506779365543054e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 157 }, { "completion_length": 256.0, "epoch": 1.3277310924369747, "grad_norm": 0.0052296556532382965, "kl": 0.0008109417976811528, "learning_rate": 4.94893812399836e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 158 }, { "completion_length": 44.5, "epoch": 1.3361344537815127, "grad_norm": 0.04597881808876991, "kl": 0.008943047374486923, "learning_rate": 4.947168471904213e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 159 }, { "completion_length": 256.0, "epoch": 1.3445378151260505, "grad_norm": 0.0038872675504535437, "kl": 0.000858853105455637, "learning_rate": 4.9453690018345144e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 160 }, { "completion_length": 256.0, "epoch": 1.3529411764705883, "grad_norm": 0.8324688076972961, "kl": 0.0019200592068955302, "learning_rate": 4.9435397357152406e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 161 }, { "completion_length": 256.0, "epoch": 1.361344537815126, "grad_norm": 0.008330686017870903, "kl": 0.0015966518549248576, "learning_rate": 4.9416806958354206e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 162 }, { "completion_length": 256.0, "epoch": 1.3697478991596639, "grad_norm": 0.00630250945687294, "kl": 0.0008440567180514336, "learning_rate": 4.939791904846869e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 163 }, { "completion_length": 256.0, "epoch": 1.3781512605042017, "grad_norm": 0.003568010637536645, "kl": 0.0008867642609402537, "learning_rate": 4.937873385763909e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 164 }, { "completion_length": 145.0, "epoch": 1.3865546218487395, "grad_norm": 0.005690573249012232, "kl": 0.0008505560690537095, "learning_rate": 4.935925161963089e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 165 }, { "completion_length": 256.0, "epoch": 1.3949579831932772, "grad_norm": 0.006437583826482296, "kl": 0.0011363155208528042, "learning_rate": 4.933947257182901e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 166 }, { "completion_length": 256.0, "epoch": 1.403361344537815, "grad_norm": 0.005696263629943132, "kl": 0.0010357071878388524, "learning_rate": 4.9319396955234925e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 167 }, { "completion_length": 71.5, "epoch": 1.4117647058823528, "grad_norm": 0.012031813152134418, "kl": 0.0013488339027389884, "learning_rate": 4.9299025014463665e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 168 }, { "completion_length": 256.0, "epoch": 1.4201680672268908, "grad_norm": 0.8474714159965515, "kl": 0.0009324835846200585, "learning_rate": 4.92783569977409e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 169 }, { "completion_length": 256.0, "epoch": 1.4285714285714286, "grad_norm": 0.9582586884498596, "kl": 0.0013797767460346222, "learning_rate": 4.925739315689991e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 170 }, { "completion_length": 223.5, "epoch": 1.4369747899159664, "grad_norm": 1.0101722478866577, "kl": 0.0010548103600740433, "learning_rate": 4.923613374737848e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 171 }, { "completion_length": 256.0, "epoch": 1.4453781512605042, "grad_norm": 0.004653313662856817, "kl": 0.0007524277316406369, "learning_rate": 4.921457902821578e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 172 }, { "completion_length": 222.5, "epoch": 1.453781512605042, "grad_norm": 0.9772133231163025, "kl": 0.00135804433375597, "learning_rate": 4.9192729262049285e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 173 }, { "completion_length": 256.0, "epoch": 1.46218487394958, "grad_norm": 0.8014784455299377, "kl": 0.001044928445480764, "learning_rate": 4.917058471511149e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 174 }, { "completion_length": 221.0, "epoch": 1.4705882352941178, "grad_norm": 0.9333362579345703, "kl": 0.0015107585350051522, "learning_rate": 4.914814565722671e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 175 }, { "completion_length": 256.0, "epoch": 1.4789915966386555, "grad_norm": 0.0037342184223234653, "kl": 0.0007057149196043611, "learning_rate": 4.912541236180779e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 256.0, "epoch": 1.4873949579831933, "grad_norm": 0.00384446419775486, "kl": 0.0007572260219603777, "learning_rate": 4.910238510585275e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 177 }, { "completion_length": 125.0, "epoch": 1.495798319327731, "grad_norm": 0.009284799918532372, "kl": 0.0017010357696563005, "learning_rate": 4.907906416994146e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 178 }, { "completion_length": 256.0, "epoch": 1.504201680672269, "grad_norm": 0.006756384391337633, "kl": 0.002442682860419154, "learning_rate": 4.905544983823214e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 179 }, { "completion_length": 256.0, "epoch": 1.5126050420168067, "grad_norm": 0.004587305709719658, "kl": 0.0009264791151508689, "learning_rate": 4.903154239845798e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 180 }, { "completion_length": 256.0, "epoch": 1.5210084033613445, "grad_norm": 1.0707005262374878, "kl": 0.0029016206972301006, "learning_rate": 4.900734214192358e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 181 }, { "completion_length": 256.0, "epoch": 1.5294117647058822, "grad_norm": 0.012579025700688362, "kl": 0.004248331300914288, "learning_rate": 4.898284936350144e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 182 }, { "completion_length": 256.0, "epoch": 1.53781512605042, "grad_norm": 0.007560709025710821, "kl": 0.0021150950342416763, "learning_rate": 4.8958064361628334e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 183 }, { "completion_length": 256.0, "epoch": 1.5462184873949578, "grad_norm": 0.006023748777806759, "kl": 0.000939257675781846, "learning_rate": 4.893298743830168e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 184 }, { "completion_length": 256.0, "epoch": 1.5546218487394958, "grad_norm": 0.00577089237049222, "kl": 0.0008926928276196122, "learning_rate": 4.890761889907589e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 185 }, { "completion_length": 256.0, "epoch": 1.5630252100840336, "grad_norm": 0.007506064139306545, "kl": 0.0013733417727053165, "learning_rate": 4.888195905305859e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 186 }, { "completion_length": 246.0, "epoch": 1.5714285714285714, "grad_norm": 0.9005439877510071, "kl": 0.0025176748167723417, "learning_rate": 4.885600821290692e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 187 }, { "completion_length": 256.0, "epoch": 1.5798319327731094, "grad_norm": 0.004041940905153751, "kl": 0.0008455628994852304, "learning_rate": 4.882976669482368e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 256.0, "epoch": 1.5882352941176472, "grad_norm": 0.009158411994576454, "kl": 0.0010664876317605376, "learning_rate": 4.880323481855347e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 189 }, { "completion_length": 256.0, "epoch": 1.596638655462185, "grad_norm": 0.008501189760863781, "kl": 0.005526822526007891, "learning_rate": 4.8776412907378845e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 190 }, { "completion_length": 256.0, "epoch": 1.6050420168067228, "grad_norm": 0.9183163046836853, "kl": 0.003348825965076685, "learning_rate": 4.874930128811631e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 191 }, { "completion_length": 219.0, "epoch": 1.6134453781512605, "grad_norm": 0.004900599364191294, "kl": 0.0018337091896682978, "learning_rate": 4.8721900291112415e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 192 }, { "completion_length": 219.0, "epoch": 1.6218487394957983, "grad_norm": 0.010737612843513489, "kl": 0.005627624690532684, "learning_rate": 4.869421025023965e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 193 }, { "completion_length": 182.5, "epoch": 1.6302521008403361, "grad_norm": 0.01622971147298813, "kl": 0.005411152262240648, "learning_rate": 4.866623150289241e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 194 }, { "completion_length": 256.0, "epoch": 1.638655462184874, "grad_norm": 0.005806392058730125, "kl": 0.004783734679222107, "learning_rate": 4.863796438998293e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 195 }, { "completion_length": 85.0, "epoch": 1.6470588235294117, "grad_norm": 0.039722055196762085, "kl": 0.013465854339301586, "learning_rate": 4.860940925593703e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 196 }, { "completion_length": 256.0, "epoch": 1.6554621848739495, "grad_norm": 0.003573828609660268, "kl": 0.0006671528099104762, "learning_rate": 4.858056644869002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 197 }, { "completion_length": 256.0, "epoch": 1.6638655462184873, "grad_norm": 0.018704848363995552, "kl": 0.00550480792298913, "learning_rate": 4.855143631968242e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 198 }, { "completion_length": 256.0, "epoch": 1.6722689075630253, "grad_norm": 0.005803122650831938, "kl": 0.001172338379547, "learning_rate": 4.852201922385564e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 199 }, { "completion_length": 256.0, "epoch": 1.680672268907563, "grad_norm": 0.009645639918744564, "kl": 0.004893769975751638, "learning_rate": 4.849231551964771e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 200 }, { "completion_length": 256.0, "epoch": 1.6890756302521008, "grad_norm": 0.004433969967067242, "kl": 0.0009591968264430761, "learning_rate": 4.84623255689889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 201 }, { "completion_length": 173.0, "epoch": 1.6974789915966386, "grad_norm": 1.380650520324707, "kl": 0.00731604965403676, "learning_rate": 4.84320497372973e-06, "loss": 0.0003, "reward": -0.25949999690055847, "reward_std": 0.7205418348312378, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25949999690055847, "step": 202 }, { "completion_length": 127.0, "epoch": 1.7058823529411766, "grad_norm": 0.020572390407323837, "kl": 0.007145563140511513, "learning_rate": 4.840148839347434e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 203 }, { "completion_length": 256.0, "epoch": 1.7142857142857144, "grad_norm": 0.006618890445679426, "kl": 0.0010417441371828318, "learning_rate": 4.837064190990036e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 204 }, { "completion_length": 256.0, "epoch": 1.7226890756302522, "grad_norm": 0.004952332004904747, "kl": 0.0010001393966376781, "learning_rate": 4.833951066243004e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 205 }, { "completion_length": 164.0, "epoch": 1.73109243697479, "grad_norm": 1.577182412147522, "kl": 0.0038280219305306673, "learning_rate": 4.830809503038781e-06, "loss": 0.0002, "reward": 0.16049998998641968, "reward_std": 0.050204578787088394, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16049998998641968, "step": 206 }, { "completion_length": 171.0, "epoch": 1.7394957983193278, "grad_norm": 1.2453254461288452, "kl": 0.004065622575581074, "learning_rate": 4.8276395396563215e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 207 }, { "completion_length": 256.0, "epoch": 1.7478991596638656, "grad_norm": 0.9487254023551941, "kl": 0.0028025954961776733, "learning_rate": 4.824441214720629e-06, "loss": 0.0001, "reward": -0.3305000066757202, "reward_std": 0.820950984954834, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3305000066757202, "step": 208 }, { "completion_length": 256.0, "epoch": 1.7563025210084033, "grad_norm": 0.004560220055282116, "kl": 0.0008998264092952013, "learning_rate": 4.821214567202284e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 209 }, { "completion_length": 256.0, "epoch": 1.7647058823529411, "grad_norm": 0.0053642867133021355, "kl": 0.0009770356118679047, "learning_rate": 4.817959636416969e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 210 }, { "completion_length": 169.0, "epoch": 1.773109243697479, "grad_norm": 0.00861065462231636, "kl": 0.004959143232554197, "learning_rate": 4.814676462024988e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 211 }, { "completion_length": 256.0, "epoch": 1.7815126050420167, "grad_norm": 0.005447046831250191, "kl": 0.0009979407768696547, "learning_rate": 4.811365084030784e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 212 }, { "completion_length": 164.5, "epoch": 1.7899159663865545, "grad_norm": 0.007028386928141117, "kl": 0.0026942617259919643, "learning_rate": 4.808025542782453e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 213 }, { "completion_length": 256.0, "epoch": 1.7983193277310925, "grad_norm": 0.013715922832489014, "kl": 0.004401184152811766, "learning_rate": 4.804657878971252e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 214 }, { "completion_length": 110.0, "epoch": 1.8067226890756303, "grad_norm": 0.012042315676808357, "kl": 0.004624972119927406, "learning_rate": 4.801262133631101e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 215 }, { "completion_length": 243.5, "epoch": 1.815126050420168, "grad_norm": 0.9901986718177795, "kl": 0.004426907282322645, "learning_rate": 4.7978383481380865e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 216 }, { "completion_length": 179.5, "epoch": 1.8235294117647058, "grad_norm": 0.010182562284171581, "kl": 0.005482309497892857, "learning_rate": 4.794386564209953e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 217 }, { "completion_length": 223.5, "epoch": 1.8319327731092439, "grad_norm": 0.005752507597208023, "kl": 0.0014647431671619415, "learning_rate": 4.790906823905599e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 218 }, { "completion_length": 256.0, "epoch": 1.8403361344537816, "grad_norm": 0.8290576934814453, "kl": 0.003792291507124901, "learning_rate": 4.787399169624562e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 219 }, { "completion_length": 256.0, "epoch": 1.8487394957983194, "grad_norm": 0.0032124982681125402, "kl": 0.0006178023759275675, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 220 }, { "completion_length": 124.0, "epoch": 1.8571428571428572, "grad_norm": 0.014089931733906269, "kl": 0.005531516391783953, "learning_rate": 4.780300290430683e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 221 }, { "completion_length": 256.0, "epoch": 1.865546218487395, "grad_norm": 0.005224175285547972, "kl": 0.0010472533758729696, "learning_rate": 4.776709152015443e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 222 }, { "completion_length": 256.0, "epoch": 1.8739495798319328, "grad_norm": 0.0059339189901947975, "kl": 0.0009450206998735666, "learning_rate": 4.773090272617672e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 223 }, { "completion_length": 256.0, "epoch": 1.8823529411764706, "grad_norm": 0.01329121459275484, "kl": 0.004415084607899189, "learning_rate": 4.769443696332272e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 224 }, { "completion_length": 244.0, "epoch": 1.8907563025210083, "grad_norm": 0.004307709168642759, "kl": 0.0009177834144793451, "learning_rate": 4.765769467591626e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 225 }, { "completion_length": 256.0, "epoch": 1.8991596638655461, "grad_norm": 0.006044196896255016, "kl": 0.0011644138721749187, "learning_rate": 4.762067631165049e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 226 }, { "completion_length": 186.5, "epoch": 1.907563025210084, "grad_norm": 0.01879073679447174, "kl": 0.0029681730084121227, "learning_rate": 4.7583382321582525e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 227 }, { "completion_length": 256.0, "epoch": 1.9159663865546217, "grad_norm": 0.006566977594047785, "kl": 0.0032347175292670727, "learning_rate": 4.754581316012785e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 228 }, { "completion_length": 256.0, "epoch": 1.9243697478991597, "grad_norm": 0.8964810967445374, "kl": 0.003440819215029478, "learning_rate": 4.750796928505484e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 229 }, { "completion_length": 256.0, "epoch": 1.9327731092436975, "grad_norm": 0.006367870140820742, "kl": 0.0010411246912553906, "learning_rate": 4.746985115747918e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 230 }, { "completion_length": 256.0, "epoch": 1.9411764705882353, "grad_norm": 0.00598109420388937, "kl": 0.0020273823756724596, "learning_rate": 4.743145924185821e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 231 }, { "completion_length": 256.0, "epoch": 1.949579831932773, "grad_norm": 0.8848045468330383, "kl": 0.0023713926784694195, "learning_rate": 4.7392794005985324e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 232 }, { "completion_length": 256.0, "epoch": 1.957983193277311, "grad_norm": 0.8128588199615479, "kl": 0.0027044611051678658, "learning_rate": 4.735385592098421e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 233 }, { "completion_length": 222.0, "epoch": 1.9663865546218489, "grad_norm": 0.004925818648189306, "kl": 0.0024337400682270527, "learning_rate": 4.731464546130315e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 234 }, { "completion_length": 123.0, "epoch": 1.9747899159663866, "grad_norm": 0.016816632822155952, "kl": 0.008543828502297401, "learning_rate": 4.72751631047092e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 235 }, { "completion_length": 256.0, "epoch": 1.9831932773109244, "grad_norm": 0.8649259209632874, "kl": 0.0030287099070847034, "learning_rate": 4.723540933228245e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 236 }, { "completion_length": 256.0, "epoch": 1.9915966386554622, "grad_norm": 0.6898112893104553, "kl": 0.0008830466540530324, "learning_rate": 4.719538462841003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 237 }, { "completion_length": 146.5, "epoch": 2.0, "grad_norm": 0.005423458758741617, "kl": 0.002108775544911623, "learning_rate": 4.715508948078037e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 238 }, { "completion_length": 256.0, "epoch": 2.008403361344538, "grad_norm": 0.004916418343782425, "kl": 0.000963216763921082, "learning_rate": 4.71145243803771e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 239 }, { "completion_length": 256.0, "epoch": 2.0168067226890756, "grad_norm": 0.004709788132458925, "kl": 0.0009735514177009463, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 240 }, { "completion_length": 118.5, "epoch": 2.0252100840336134, "grad_norm": 0.012133480980992317, "kl": 0.008173803798854351, "learning_rate": 4.703258630162481e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 241 }, { "completion_length": 256.0, "epoch": 2.033613445378151, "grad_norm": 0.007062504068017006, "kl": 0.005827554501593113, "learning_rate": 4.699121432166542e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 242 }, { "completion_length": 107.0, "epoch": 2.042016806722689, "grad_norm": 0.008775112219154835, "kl": 0.0037312167696654797, "learning_rate": 4.6949574385699514e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 243 }, { "completion_length": 256.0, "epoch": 2.0504201680672267, "grad_norm": 0.006465710233896971, "kl": 0.00470481812953949, "learning_rate": 4.690766700109659e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 244 }, { "completion_length": 256.0, "epoch": 2.0588235294117645, "grad_norm": 0.004648490808904171, "kl": 0.000982738216407597, "learning_rate": 4.68654926784849e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 245 }, { "completion_length": 203.0, "epoch": 2.0672268907563027, "grad_norm": 0.007828323170542717, "kl": 0.0024520133156329393, "learning_rate": 4.682305193174524e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 246 }, { "completion_length": 256.0, "epoch": 2.0756302521008405, "grad_norm": 0.013207961805164814, "kl": 0.006465165875852108, "learning_rate": 4.6780345278004744e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 247 }, { "completion_length": 256.0, "epoch": 2.0840336134453783, "grad_norm": 0.011041865684092045, "kl": 0.00543825700879097, "learning_rate": 4.673737323763048e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 248 }, { "completion_length": 225.0, "epoch": 2.092436974789916, "grad_norm": 0.007988007739186287, "kl": 0.0037802893202751875, "learning_rate": 4.669413633422322e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 249 }, { "completion_length": 78.0, "epoch": 2.100840336134454, "grad_norm": 0.040553364902734756, "kl": 0.007264666259288788, "learning_rate": 4.665063509461098e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 250 }, { "completion_length": 256.0, "epoch": 2.1092436974789917, "grad_norm": 0.006546806078404188, "kl": 0.0009515463607385755, "learning_rate": 4.6606870048842626e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 251 }, { "completion_length": 256.0, "epoch": 2.1176470588235294, "grad_norm": 0.006159324664622545, "kl": 0.002381978090852499, "learning_rate": 4.656284173018144e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 252 }, { "completion_length": 256.0, "epoch": 2.1260504201680672, "grad_norm": 0.005558946635574102, "kl": 0.0009347390150651336, "learning_rate": 4.65185506750986e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 253 }, { "completion_length": 139.5, "epoch": 2.134453781512605, "grad_norm": 0.020493706688284874, "kl": 0.006739134434610605, "learning_rate": 4.6473997423266615e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 254 }, { "completion_length": 256.0, "epoch": 2.142857142857143, "grad_norm": 0.004329713527113199, "kl": 0.001024471246637404, "learning_rate": 4.642918251755281e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 255 }, { "completion_length": 256.0, "epoch": 2.1512605042016806, "grad_norm": 0.00672161253169179, "kl": 0.0017339837504550815, "learning_rate": 4.638410650401267e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 256 }, { "completion_length": 256.0, "epoch": 2.1596638655462184, "grad_norm": 0.924403190612793, "kl": 0.0015088937943801284, "learning_rate": 4.633876993188319e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 257 }, { "completion_length": 256.0, "epoch": 2.168067226890756, "grad_norm": 0.005911551415920258, "kl": 0.0013076518662273884, "learning_rate": 4.62931733535762e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 256.0, "epoch": 2.176470588235294, "grad_norm": 0.004104703664779663, "kl": 0.0008186419727280736, "learning_rate": 4.62473173246716e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 259 }, { "completion_length": 190.5, "epoch": 2.184873949579832, "grad_norm": 0.011577471159398556, "kl": 0.0030835571233183146, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 260 }, { "completion_length": 256.0, "epoch": 2.19327731092437, "grad_norm": 0.007146902848035097, "kl": 0.001160011044703424, "learning_rate": 4.6154829153189105e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 261 }, { "completion_length": 204.0, "epoch": 2.2016806722689077, "grad_norm": 0.017429562285542488, "kl": 0.0022080186754465103, "learning_rate": 4.610819813755038e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 262 }, { "completion_length": 247.5, "epoch": 2.2100840336134455, "grad_norm": 0.007932069711387157, "kl": 0.002166855614632368, "learning_rate": 4.60613099251787e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 263 }, { "completion_length": 256.0, "epoch": 2.2184873949579833, "grad_norm": 0.005104547832161188, "kl": 0.0009710314916446805, "learning_rate": 4.601416508739211e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 264 }, { "completion_length": 248.0, "epoch": 2.226890756302521, "grad_norm": 0.00961034931242466, "kl": 0.0016581231029704213, "learning_rate": 4.596676419863561e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 265 }, { "completion_length": 163.5, "epoch": 2.235294117647059, "grad_norm": 1.296470284461975, "kl": 0.0015804753638803959, "learning_rate": 4.591910783647405e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 266 }, { "completion_length": 241.0, "epoch": 2.2436974789915967, "grad_norm": 0.8834806084632874, "kl": 0.002288698684424162, "learning_rate": 4.587119658158517e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 267 }, { "completion_length": 193.5, "epoch": 2.2521008403361344, "grad_norm": 1.0448172092437744, "kl": 0.0014066891744732857, "learning_rate": 4.582303101775249e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 268 }, { "completion_length": 256.0, "epoch": 2.2605042016806722, "grad_norm": 0.004820965230464935, "kl": 0.0010580024681985378, "learning_rate": 4.577461173185821e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 269 }, { "completion_length": 256.0, "epoch": 2.26890756302521, "grad_norm": 0.0049538020975887775, "kl": 0.0011701802723109722, "learning_rate": 4.572593931387604e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 270 }, { "completion_length": 256.0, "epoch": 2.277310924369748, "grad_norm": 0.0066798035986721516, "kl": 0.0015597355086356401, "learning_rate": 4.567701435686405e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 271 }, { "completion_length": 93.0, "epoch": 2.2857142857142856, "grad_norm": 0.01923462003469467, "kl": 0.003633297048509121, "learning_rate": 4.562783745695738e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 272 }, { "completion_length": 256.0, "epoch": 2.2941176470588234, "grad_norm": 0.9482860565185547, "kl": 0.0019293668447062373, "learning_rate": 4.5578409213361055e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 273 }, { "completion_length": 118.0, "epoch": 2.302521008403361, "grad_norm": 1.9454665184020996, "kl": 0.0012456348631531, "learning_rate": 4.55287302283426e-06, "loss": 0.0, "reward": -0.016499996185302734, "reward_std": 0.3768879175186157, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.016499996185302734, "step": 274 }, { "completion_length": 256.0, "epoch": 2.310924369747899, "grad_norm": 0.005923879332840443, "kl": 0.0011328846449032426, "learning_rate": 4.54788011072248e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 275 }, { "completion_length": 92.0, "epoch": 2.3193277310924367, "grad_norm": 0.017477739602327347, "kl": 0.004191963467746973, "learning_rate": 4.542862245837821e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 276 }, { "completion_length": 256.0, "epoch": 2.327731092436975, "grad_norm": 0.8662137985229492, "kl": 0.001381626701913774, "learning_rate": 4.537819489321385e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 277 }, { "completion_length": 256.0, "epoch": 2.3361344537815127, "grad_norm": 0.0037669725716114044, "kl": 0.00095718028023839, "learning_rate": 4.5327519026175694e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 278 }, { "completion_length": 256.0, "epoch": 2.3445378151260505, "grad_norm": 0.7727221250534058, "kl": 0.0014364065136760473, "learning_rate": 4.527659547473317e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 279 }, { "completion_length": 256.0, "epoch": 2.3529411764705883, "grad_norm": 0.00593525031581521, "kl": 0.0012578320456668735, "learning_rate": 4.522542485937369e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 280 }, { "completion_length": 256.0, "epoch": 2.361344537815126, "grad_norm": 0.006361262407153845, "kl": 0.0012078359723091125, "learning_rate": 4.517400780359505e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 281 }, { "completion_length": 159.0, "epoch": 2.369747899159664, "grad_norm": 0.01598692685365677, "kl": 0.004868132993578911, "learning_rate": 4.512234493389785e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 282 }, { "completion_length": 256.0, "epoch": 2.3781512605042017, "grad_norm": 0.00504391361027956, "kl": 0.0015002915170043707, "learning_rate": 4.507043687977787e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 283 }, { "completion_length": 256.0, "epoch": 2.3865546218487395, "grad_norm": 0.007316946517676115, "kl": 0.0013453759020194411, "learning_rate": 4.501828427371834e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 284 }, { "completion_length": 256.0, "epoch": 2.3949579831932772, "grad_norm": 0.8927034735679626, "kl": 0.001895469962619245, "learning_rate": 4.496588775118232e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 285 }, { "completion_length": 256.0, "epoch": 2.403361344537815, "grad_norm": 0.004997409414499998, "kl": 0.0010235446970909834, "learning_rate": 4.491324795060491e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 286 }, { "completion_length": 182.5, "epoch": 2.411764705882353, "grad_norm": 1.085240125656128, "kl": 0.002495179418474436, "learning_rate": 4.4860365513385456e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 287 }, { "completion_length": 156.5, "epoch": 2.4201680672268906, "grad_norm": 0.011411869898438454, "kl": 0.0023628431372344494, "learning_rate": 4.4807241083879774e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 288 }, { "completion_length": 102.5, "epoch": 2.4285714285714284, "grad_norm": 0.03103415109217167, "kl": 0.00728704733774066, "learning_rate": 4.475387530939226e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 289 }, { "completion_length": 143.5, "epoch": 2.4369747899159666, "grad_norm": 0.007647826336324215, "kl": 0.001687856623902917, "learning_rate": 4.470026884016805e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 290 }, { "completion_length": 256.0, "epoch": 2.4453781512605044, "grad_norm": 0.006875835824757814, "kl": 0.0012205814709886909, "learning_rate": 4.464642232938505e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 291 }, { "completion_length": 256.0, "epoch": 2.453781512605042, "grad_norm": 0.0053266058675944805, "kl": 0.0011853022733703256, "learning_rate": 4.4592336433146e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 292 }, { "completion_length": 256.0, "epoch": 2.46218487394958, "grad_norm": 0.006170874461531639, "kl": 0.0012551668332889676, "learning_rate": 4.453801181047047e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 293 }, { "completion_length": 134.0, "epoch": 2.4705882352941178, "grad_norm": 0.007096804678440094, "kl": 0.0012923497706651688, "learning_rate": 4.448344912328686e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 294 }, { "completion_length": 256.0, "epoch": 2.4789915966386555, "grad_norm": 0.9318930506706238, "kl": 0.002471773186698556, "learning_rate": 4.442864903642428e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 295 }, { "completion_length": 250.0, "epoch": 2.4873949579831933, "grad_norm": 0.9217692613601685, "kl": 0.0022414210252463818, "learning_rate": 4.437361221760449e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 296 }, { "completion_length": 256.0, "epoch": 2.495798319327731, "grad_norm": 0.007443012669682503, "kl": 0.0013197270454838872, "learning_rate": 4.431833933743378e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 221.0, "epoch": 2.504201680672269, "grad_norm": 0.009797207079827785, "kl": 0.0030650878325104713, "learning_rate": 4.426283106939474e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 298 }, { "completion_length": 256.0, "epoch": 2.5126050420168067, "grad_norm": 0.0059292190708220005, "kl": 0.0011019123485311866, "learning_rate": 4.420708808983809e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 299 }, { "completion_length": 256.0, "epoch": 2.5210084033613445, "grad_norm": 0.010347111150622368, "kl": 0.003908773884177208, "learning_rate": 4.415111107797445e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 300 }, { "completion_length": 243.0, "epoch": 2.5294117647058822, "grad_norm": 0.006082999054342508, "kl": 0.0018603194039314985, "learning_rate": 4.409490071586606e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 301 }, { "completion_length": 252.5, "epoch": 2.53781512605042, "grad_norm": 0.011642835102975368, "kl": 0.0035670981742441654, "learning_rate": 4.403845768841842e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 302 }, { "completion_length": 256.0, "epoch": 2.546218487394958, "grad_norm": 0.004744979087263346, "kl": 0.001151197124272585, "learning_rate": 4.398178268337202e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 303 }, { "completion_length": 256.0, "epoch": 2.5546218487394956, "grad_norm": 0.005991446319967508, "kl": 0.0016701575368642807, "learning_rate": 4.3924876391293915e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 304 }, { "completion_length": 177.5, "epoch": 2.5630252100840334, "grad_norm": 1.2158524990081787, "kl": 0.004391872324049473, "learning_rate": 4.386773950556931e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 305 }, { "completion_length": 181.0, "epoch": 2.571428571428571, "grad_norm": 0.010774364694952965, "kl": 0.004015970975160599, "learning_rate": 4.381037272239311e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 306 }, { "completion_length": 241.5, "epoch": 2.5798319327731094, "grad_norm": 1.1785914897918701, "kl": 0.0034467417281121016, "learning_rate": 4.3752776740761495e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 307 }, { "completion_length": 185.5, "epoch": 2.588235294117647, "grad_norm": 1.245256781578064, "kl": 0.0024693934246897697, "learning_rate": 4.36949522624633e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 308 }, { "completion_length": 190.5, "epoch": 2.596638655462185, "grad_norm": 0.005182513035833836, "kl": 0.0018564693164080381, "learning_rate": 4.3636899992071555e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 309 }, { "completion_length": 236.0, "epoch": 2.6050420168067228, "grad_norm": 0.0078032417222857475, "kl": 0.003304337151348591, "learning_rate": 4.357862063693486e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 310 }, { "completion_length": 256.0, "epoch": 2.6134453781512605, "grad_norm": 0.007424879353493452, "kl": 0.0030658075120300055, "learning_rate": 4.352011490716875e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 311 }, { "completion_length": 256.0, "epoch": 2.6218487394957983, "grad_norm": 0.9187365174293518, "kl": 0.0011164260795339942, "learning_rate": 4.346138351564711e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 312 }, { "completion_length": 72.0, "epoch": 2.630252100840336, "grad_norm": 0.012908521108329296, "kl": 0.0052327243611216545, "learning_rate": 4.340242717799337e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 313 }, { "completion_length": 256.0, "epoch": 2.638655462184874, "grad_norm": 0.006098590791225433, "kl": 0.001321260817348957, "learning_rate": 4.334324661257191e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 314 }, { "completion_length": 256.0, "epoch": 2.6470588235294117, "grad_norm": 0.008011511527001858, "kl": 0.004073180258274078, "learning_rate": 4.328384254047927e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 315 }, { "completion_length": 256.0, "epoch": 2.6554621848739495, "grad_norm": 0.005408373195677996, "kl": 0.0011219182051718235, "learning_rate": 4.322421568553529e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 316 }, { "completion_length": 256.0, "epoch": 2.6638655462184873, "grad_norm": 0.009479737840592861, "kl": 0.004906356334686279, "learning_rate": 4.316436677427441e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 317 }, { "completion_length": 136.0, "epoch": 2.6722689075630255, "grad_norm": 0.012419478967785835, "kl": 0.005350068211555481, "learning_rate": 4.3104296535936695e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 318 }, { "completion_length": 256.0, "epoch": 2.6806722689075633, "grad_norm": 0.8488293886184692, "kl": 0.0031672129407525063, "learning_rate": 4.3044005702459055e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 319 }, { "completion_length": 55.5, "epoch": 2.689075630252101, "grad_norm": 0.01818913221359253, "kl": 0.0020805918611586094, "learning_rate": 4.2983495008466285e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 320 }, { "completion_length": 116.0, "epoch": 2.697478991596639, "grad_norm": 0.011956282891333103, "kl": 0.007011523470282555, "learning_rate": 4.2922765191262075e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 321 }, { "completion_length": 256.0, "epoch": 2.7058823529411766, "grad_norm": 0.0041866665706038475, "kl": 0.0009950622916221619, "learning_rate": 4.286181699082008e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 322 }, { "completion_length": 158.0, "epoch": 2.7142857142857144, "grad_norm": 0.013078922405838966, "kl": 0.00418114522472024, "learning_rate": 4.280065114977492e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 323 }, { "completion_length": 256.0, "epoch": 2.722689075630252, "grad_norm": 0.010862285271286964, "kl": 0.0022238311357796192, "learning_rate": 4.273926841341303e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 324 }, { "completion_length": 229.0, "epoch": 2.73109243697479, "grad_norm": 0.9844045639038086, "kl": 0.00439240038394928, "learning_rate": 4.267766952966369e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 325 }, { "completion_length": 218.5, "epoch": 2.7394957983193278, "grad_norm": 0.01354632806032896, "kl": 0.004831475205719471, "learning_rate": 4.261585524908987e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 326 }, { "completion_length": 163.5, "epoch": 2.7478991596638656, "grad_norm": 1.4198167324066162, "kl": 0.003538253717124462, "learning_rate": 4.255382632487907e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 327 }, { "completion_length": 256.0, "epoch": 2.7563025210084033, "grad_norm": 0.006598836742341518, "kl": 0.0013419209280982614, "learning_rate": 4.249158351283414e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 328 }, { "completion_length": 256.0, "epoch": 2.764705882352941, "grad_norm": 0.004856065381318331, "kl": 0.0010341902961954474, "learning_rate": 4.242912757136412e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 329 }, { "completion_length": 256.0, "epoch": 2.773109243697479, "grad_norm": 0.010164064355194569, "kl": 0.0033468115143477917, "learning_rate": 4.236645926147493e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 330 }, { "completion_length": 252.5, "epoch": 2.7815126050420167, "grad_norm": 0.8806132078170776, "kl": 0.003796371165663004, "learning_rate": 4.230357934676017e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 331 }, { "completion_length": 256.0, "epoch": 2.7899159663865545, "grad_norm": 0.008999250829219818, "kl": 0.003320937743410468, "learning_rate": 4.224048859339175e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 332 }, { "completion_length": 185.5, "epoch": 2.7983193277310923, "grad_norm": 1.0830488204956055, "kl": 0.00378972920589149, "learning_rate": 4.217718777011058e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 333 }, { "completion_length": 57.5, "epoch": 2.80672268907563, "grad_norm": 2.2401227951049805, "kl": 0.007417288143187761, "learning_rate": 4.211367764821722e-06, "loss": 0.0003, "reward": 0.23749999701976776, "reward_std": 0.01767767407000065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.23749999701976776, "step": 334 }, { "completion_length": 220.0, "epoch": 2.815126050420168, "grad_norm": 1.0498411655426025, "kl": 0.0032221656292676926, "learning_rate": 4.204995900156247e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 335 }, { "completion_length": 256.0, "epoch": 2.8235294117647056, "grad_norm": 0.9649432897567749, "kl": 0.0030688131228089333, "learning_rate": 4.198603260653792e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 336 }, { "completion_length": 233.5, "epoch": 2.831932773109244, "grad_norm": 0.009273418225347996, "kl": 0.0027417270466685295, "learning_rate": 4.192189924206652e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 337 }, { "completion_length": 256.0, "epoch": 2.8403361344537816, "grad_norm": 0.9031000733375549, "kl": 0.0012246679980307817, "learning_rate": 4.185755968959308e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 338 }, { "completion_length": 173.0, "epoch": 2.8487394957983194, "grad_norm": 1.549201488494873, "kl": 0.00690976157784462, "learning_rate": 4.179301473307476e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 339 }, { "completion_length": 256.0, "epoch": 2.857142857142857, "grad_norm": 0.005585837177932262, "kl": 0.001547823310829699, "learning_rate": 4.172826515897146e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 340 }, { "completion_length": 256.0, "epoch": 2.865546218487395, "grad_norm": 0.9775158166885376, "kl": 0.007837183773517609, "learning_rate": 4.166331175623631e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 341 }, { "completion_length": 192.5, "epoch": 2.8739495798319328, "grad_norm": 0.007147891912609339, "kl": 0.0019490179838612676, "learning_rate": 4.159815531630604e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 342 }, { "completion_length": 90.5, "epoch": 2.8823529411764706, "grad_norm": 0.008082506246864796, "kl": 0.0026914807967841625, "learning_rate": 4.15327966330913e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 343 }, { "completion_length": 145.5, "epoch": 2.8907563025210083, "grad_norm": 0.010312359780073166, "kl": 0.0023588715121150017, "learning_rate": 4.146723650296701e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 344 }, { "completion_length": 97.5, "epoch": 2.899159663865546, "grad_norm": 0.009897630661725998, "kl": 0.006416564807295799, "learning_rate": 4.140147572476269e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 345 }, { "completion_length": 256.0, "epoch": 2.907563025210084, "grad_norm": 0.008187379688024521, "kl": 0.0016987619455903769, "learning_rate": 4.133551509975264e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 346 }, { "completion_length": 256.0, "epoch": 2.9159663865546217, "grad_norm": 0.005051834043115377, "kl": 0.0010223834542557597, "learning_rate": 4.126935543164628e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 347 }, { "completion_length": 256.0, "epoch": 2.92436974789916, "grad_norm": 0.006105738691985607, "kl": 0.0012232668232172728, "learning_rate": 4.120299752657828e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 348 }, { "completion_length": 256.0, "epoch": 2.9327731092436977, "grad_norm": 0.004115545656532049, "kl": 0.000886144582182169, "learning_rate": 4.113644219309877e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 349 }, { "completion_length": 220.0, "epoch": 2.9411764705882355, "grad_norm": 0.00585637241601944, "kl": 0.00310779782012105, "learning_rate": 4.106969024216348e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 350 }, { "completion_length": 246.5, "epoch": 2.9495798319327733, "grad_norm": 0.9735901951789856, "kl": 0.00537056103348732, "learning_rate": 4.1002742487123896e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 351 }, { "completion_length": 256.0, "epoch": 2.957983193277311, "grad_norm": 0.00575208431109786, "kl": 0.0010723159648478031, "learning_rate": 4.093559974371725e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 352 }, { "completion_length": 220.5, "epoch": 2.966386554621849, "grad_norm": 0.010382087901234627, "kl": 0.0023396199103444815, "learning_rate": 4.086826283005669e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 353 }, { "completion_length": 256.0, "epoch": 2.9747899159663866, "grad_norm": 0.005074927117675543, "kl": 0.0014040363021194935, "learning_rate": 4.080073256662128e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 354 }, { "completion_length": 256.0, "epoch": 2.9831932773109244, "grad_norm": 0.0057141841389238834, "kl": 0.0020395773462951183, "learning_rate": 4.073300977624594e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 355 }, { "completion_length": 256.0, "epoch": 2.991596638655462, "grad_norm": 0.8314705491065979, "kl": 0.0037925192154943943, "learning_rate": 4.066509528411151e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 356 }, { "completion_length": 224.5, "epoch": 3.0, "grad_norm": 0.008335424587130547, "kl": 0.0069442018866539, "learning_rate": 4.059698991773466e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 357 }, { "completion_length": 256.0, "epoch": 3.008403361344538, "grad_norm": 0.004900103900581598, "kl": 0.001880184281617403, "learning_rate": 4.052869450695776e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 358 }, { "completion_length": 256.0, "epoch": 3.0168067226890756, "grad_norm": 0.00499701127409935, "kl": 0.0013233657227829099, "learning_rate": 4.046020988393886e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 359 }, { "completion_length": 256.0, "epoch": 3.0252100840336134, "grad_norm": 0.005984700284898281, "kl": 0.002488317433744669, "learning_rate": 4.039153688314146e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 360 }, { "completion_length": 253.0, "epoch": 3.033613445378151, "grad_norm": 0.005614330992102623, "kl": 0.003915461245924234, "learning_rate": 4.032267634132442e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 361 }, { "completion_length": 77.0, "epoch": 3.042016806722689, "grad_norm": 0.010190341621637344, "kl": 0.0030109998770058155, "learning_rate": 4.02536290975317e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 362 }, { "completion_length": 256.0, "epoch": 3.0504201680672267, "grad_norm": 0.007005635648965836, "kl": 0.007471464108675718, "learning_rate": 4.018439599308217e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 363 }, { "completion_length": 237.5, "epoch": 3.0588235294117645, "grad_norm": 1.1555765867233276, "kl": 0.009077747352421284, "learning_rate": 4.011497787155938e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 364 }, { "completion_length": 187.5, "epoch": 3.0672268907563027, "grad_norm": 0.010233343578875065, "kl": 0.005797446705400944, "learning_rate": 4.0045375578801216e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 365 }, { "completion_length": 256.0, "epoch": 3.0756302521008405, "grad_norm": 0.005784002598375082, "kl": 0.0013660314725711942, "learning_rate": 3.997558996288965e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 366 }, { "completion_length": 107.5, "epoch": 3.0840336134453783, "grad_norm": 0.010106592439115047, "kl": 0.003698749002069235, "learning_rate": 3.9905621874140396e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 367 }, { "completion_length": 256.0, "epoch": 3.092436974789916, "grad_norm": 0.006695791147649288, "kl": 0.0014633414102718234, "learning_rate": 3.983547216509254e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 368 }, { "completion_length": 247.0, "epoch": 3.100840336134454, "grad_norm": 0.9652962684631348, "kl": 0.007674430496990681, "learning_rate": 3.976514169049814e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 369 }, { "completion_length": 225.5, "epoch": 3.1092436974789917, "grad_norm": 0.009551841765642166, "kl": 0.004749645013362169, "learning_rate": 3.969463130731183e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 370 }, { "completion_length": 256.0, "epoch": 3.1176470588235294, "grad_norm": 0.01000471506267786, "kl": 0.007583713624626398, "learning_rate": 3.96239418746804e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 371 }, { "completion_length": 177.5, "epoch": 3.1260504201680672, "grad_norm": 0.005635623820126057, "kl": 0.005622419528663158, "learning_rate": 3.955307425393224e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 372 }, { "completion_length": 161.0, "epoch": 3.134453781512605, "grad_norm": 0.011093245819211006, "kl": 0.006594919599592686, "learning_rate": 3.948202930856697e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 373 }, { "completion_length": 256.0, "epoch": 3.142857142857143, "grad_norm": 0.0072677480056881905, "kl": 0.0015927801141515374, "learning_rate": 3.941080790424483e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 374 }, { "completion_length": 256.0, "epoch": 3.1512605042016806, "grad_norm": 0.7831045389175415, "kl": 0.0011670574313029647, "learning_rate": 3.933941090877615e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 375 }, { "completion_length": 63.0, "epoch": 3.1596638655462184, "grad_norm": 0.007736688945442438, "kl": 0.0054659973829984665, "learning_rate": 3.92678391921108e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 376 }, { "completion_length": 256.0, "epoch": 3.168067226890756, "grad_norm": 0.005385331343859434, "kl": 0.0012579853646457195, "learning_rate": 3.9196093626327535e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 377 }, { "completion_length": 174.5, "epoch": 3.176470588235294, "grad_norm": 0.012397095561027527, "kl": 0.007383386138826609, "learning_rate": 3.912417508562345e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 378 }, { "completion_length": 157.0, "epoch": 3.184873949579832, "grad_norm": 1.410620093345642, "kl": 0.008962561376392841, "learning_rate": 3.905208444630326e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 379 }, { "completion_length": 162.0, "epoch": 3.19327731092437, "grad_norm": 0.012630729004740715, "kl": 0.010116135701537132, "learning_rate": 3.897982258676867e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 380 }, { "completion_length": 256.0, "epoch": 3.2016806722689077, "grad_norm": 0.004417209420353174, "kl": 0.0010214921785518527, "learning_rate": 3.890739038750763e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 381 }, { "completion_length": 256.0, "epoch": 3.2100840336134455, "grad_norm": 0.007314969785511494, "kl": 0.001297884969972074, "learning_rate": 3.88347887310836e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 382 }, { "completion_length": 256.0, "epoch": 3.2184873949579833, "grad_norm": 0.00696180434897542, "kl": 0.0029381189960986376, "learning_rate": 3.876201850212489e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 383 }, { "completion_length": 256.0, "epoch": 3.226890756302521, "grad_norm": 0.00670697120949626, "kl": 0.004512769635766745, "learning_rate": 3.868908058731376e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 384 }, { "completion_length": 93.5, "epoch": 3.235294117647059, "grad_norm": 0.016166705638170242, "kl": 0.0022817079443484545, "learning_rate": 3.861597587537568e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 385 }, { "completion_length": 256.0, "epoch": 3.2436974789915967, "grad_norm": 0.011589192785322666, "kl": 0.009432366117835045, "learning_rate": 3.85427052570685e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 386 }, { "completion_length": 256.0, "epoch": 3.2521008403361344, "grad_norm": 0.005171815864741802, "kl": 0.0015070427907630801, "learning_rate": 3.846926962517158e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 387 }, { "completion_length": 222.5, "epoch": 3.2605042016806722, "grad_norm": 0.009549574926495552, "kl": 0.0056788683868944645, "learning_rate": 3.839566987447492e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 388 }, { "completion_length": 256.0, "epoch": 3.26890756302521, "grad_norm": 0.005340673960745335, "kl": 0.0012753112241625786, "learning_rate": 3.832190690176825e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 389 }, { "completion_length": 256.0, "epoch": 3.277310924369748, "grad_norm": 1.028187870979309, "kl": 0.004712908994406462, "learning_rate": 3.824798160583012e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 390 }, { "completion_length": 256.0, "epoch": 3.2857142857142856, "grad_norm": 0.9399821758270264, "kl": 0.006183180026710033, "learning_rate": 3.817389488741694e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 391 }, { "completion_length": 148.5, "epoch": 3.2941176470588234, "grad_norm": 0.01059084851294756, "kl": 0.003190993797034025, "learning_rate": 3.8099647649251984e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 392 }, { "completion_length": 256.0, "epoch": 3.302521008403361, "grad_norm": 0.005274099763482809, "kl": 0.0028215793427079916, "learning_rate": 3.802524079601442e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 393 }, { "completion_length": 105.0, "epoch": 3.310924369747899, "grad_norm": 0.012682311236858368, "kl": 0.011069456115365028, "learning_rate": 3.795067523432826e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 394 }, { "completion_length": 256.0, "epoch": 3.3193277310924367, "grad_norm": 0.009523636661469936, "kl": 0.0014646160416305065, "learning_rate": 3.787595187275136e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 395 }, { "completion_length": 97.0, "epoch": 3.327731092436975, "grad_norm": 0.007293744012713432, "kl": 0.0013781224843114614, "learning_rate": 3.780107162176429e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 396 }, { "completion_length": 256.0, "epoch": 3.3361344537815127, "grad_norm": 0.006446072366088629, "kl": 0.0012894963147118688, "learning_rate": 3.772603539375929e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 397 }, { "completion_length": 152.5, "epoch": 3.3445378151260505, "grad_norm": 0.008547332137823105, "kl": 0.00517616281285882, "learning_rate": 3.7650844103029093e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 398 }, { "completion_length": 146.0, "epoch": 3.3529411764705883, "grad_norm": 0.005650275852531195, "kl": 0.004115119110792875, "learning_rate": 3.7575498665755884e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 399 }, { "completion_length": 221.0, "epoch": 3.361344537815126, "grad_norm": 1.0115008354187012, "kl": 0.0045968578197062016, "learning_rate": 3.7500000000000005e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 400 }, { "completion_length": 256.0, "epoch": 3.369747899159664, "grad_norm": 0.004271083511412144, "kl": 0.0019594451878219843, "learning_rate": 3.742434902568889e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 401 }, { "completion_length": 97.5, "epoch": 3.3781512605042017, "grad_norm": 0.01183786615729332, "kl": 0.007584511302411556, "learning_rate": 3.7348546664605777e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 402 }, { "completion_length": 256.0, "epoch": 3.3865546218487395, "grad_norm": 0.8068878054618835, "kl": 0.009242486208677292, "learning_rate": 3.7272593840378526e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 403 }, { "completion_length": 256.0, "epoch": 3.3949579831932772, "grad_norm": 0.006462923716753721, "kl": 0.004626350477337837, "learning_rate": 3.7196491478468322e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 404 }, { "completion_length": 195.5, "epoch": 3.403361344537815, "grad_norm": 0.009624393656849861, "kl": 0.002902559470385313, "learning_rate": 3.7120240506158433e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 405 }, { "completion_length": 256.0, "epoch": 3.411764705882353, "grad_norm": 0.006253241095691919, "kl": 0.0011607762426137924, "learning_rate": 3.7043841852542884e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 406 }, { "completion_length": 186.5, "epoch": 3.4201680672268906, "grad_norm": 0.011465064249932766, "kl": 0.007122526876628399, "learning_rate": 3.6967296448515176e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 407 }, { "completion_length": 174.5, "epoch": 3.4285714285714284, "grad_norm": 0.016065042465925217, "kl": 0.004176133777946234, "learning_rate": 3.689060522675689e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 408 }, { "completion_length": 120.0, "epoch": 3.4369747899159666, "grad_norm": 0.008138079196214676, "kl": 0.006820288486778736, "learning_rate": 3.6813769121726356e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 409 }, { "completion_length": 256.0, "epoch": 3.4453781512605044, "grad_norm": 0.006991416681557894, "kl": 0.005011546425521374, "learning_rate": 3.6736789069647273e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 410 }, { "completion_length": 183.5, "epoch": 3.453781512605042, "grad_norm": 0.01055091992020607, "kl": 0.007287868298590183, "learning_rate": 3.6659666008497287e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 411 }, { "completion_length": 241.5, "epoch": 3.46218487394958, "grad_norm": 0.010439754463732243, "kl": 0.006538182031363249, "learning_rate": 3.658240087799655e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 412 }, { "completion_length": 256.0, "epoch": 3.4705882352941178, "grad_norm": 0.00570059334859252, "kl": 0.005376460961997509, "learning_rate": 3.6504994619596295e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 413 }, { "completion_length": 256.0, "epoch": 3.4789915966386555, "grad_norm": 0.8112176060676575, "kl": 0.0014454604825004935, "learning_rate": 3.642744817646736e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 414 }, { "completion_length": 256.0, "epoch": 3.4873949579831933, "grad_norm": 0.008428104221820831, "kl": 0.005492928437888622, "learning_rate": 3.634976249348867e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 415 }, { "completion_length": 214.0, "epoch": 3.495798319327731, "grad_norm": 0.012927563861012459, "kl": 0.007722257170826197, "learning_rate": 3.627193851723577e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 416 }, { "completion_length": 256.0, "epoch": 3.504201680672269, "grad_norm": 0.007967905141413212, "kl": 0.0013451453996822238, "learning_rate": 3.6193977195969243e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 417 }, { "completion_length": 250.0, "epoch": 3.5126050420168067, "grad_norm": 0.010402580723166466, "kl": 0.006082219537347555, "learning_rate": 3.611587947962319e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 418 }, { "completion_length": 256.0, "epoch": 3.5210084033613445, "grad_norm": 0.8342515230178833, "kl": 0.004308309871703386, "learning_rate": 3.6037646319793635e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 419 }, { "completion_length": 256.0, "epoch": 3.5294117647058822, "grad_norm": 0.014491788111627102, "kl": 0.007393305655568838, "learning_rate": 3.595927866972694e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 420 }, { "completion_length": 256.0, "epoch": 3.53781512605042, "grad_norm": 0.004937848076224327, "kl": 0.0013074050657451153, "learning_rate": 3.5880777484308193e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 421 }, { "completion_length": 256.0, "epoch": 3.546218487394958, "grad_norm": 0.004944907035678625, "kl": 0.0010391217656433582, "learning_rate": 3.5802143720049565e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 422 }, { "completion_length": 190.5, "epoch": 3.5546218487394956, "grad_norm": 0.007325359620153904, "kl": 0.004195088520646095, "learning_rate": 3.5723378335078653e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 423 }, { "completion_length": 256.0, "epoch": 3.5630252100840334, "grad_norm": 0.005978034809231758, "kl": 0.0013448785757645965, "learning_rate": 3.564448228912682e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 424 }, { "completion_length": 160.0, "epoch": 3.571428571428571, "grad_norm": 0.00782028865069151, "kl": 0.0029302944894880056, "learning_rate": 3.556545654351749e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 425 }, { "completion_length": 256.0, "epoch": 3.5798319327731094, "grad_norm": 0.007717370055615902, "kl": 0.0014613766688853502, "learning_rate": 3.5486302061154433e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 426 }, { "completion_length": 256.0, "epoch": 3.588235294117647, "grad_norm": 0.005369290243834257, "kl": 0.0011799318017438054, "learning_rate": 3.5407019806510035e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 427 }, { "completion_length": 132.5, "epoch": 3.596638655462185, "grad_norm": 0.009358669631183147, "kl": 0.004041342996060848, "learning_rate": 3.532761074561355e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 428 }, { "completion_length": 256.0, "epoch": 3.6050420168067228, "grad_norm": 0.012556117959320545, "kl": 0.0017709294334053993, "learning_rate": 3.524807584603932e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 429 }, { "completion_length": 256.0, "epoch": 3.6134453781512605, "grad_norm": 0.006893865764141083, "kl": 0.002710767788812518, "learning_rate": 3.516841607689501e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 430 }, { "completion_length": 227.5, "epoch": 3.6218487394957983, "grad_norm": 0.01461038924753666, "kl": 0.0056550586596131325, "learning_rate": 3.5088632408809757e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 431 }, { "completion_length": 59.0, "epoch": 3.630252100840336, "grad_norm": 0.029620766639709473, "kl": 0.007676382549107075, "learning_rate": 3.5008725813922383e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 432 }, { "completion_length": 256.0, "epoch": 3.638655462184874, "grad_norm": 0.8965004086494446, "kl": 0.003964036703109741, "learning_rate": 3.4928697265869516e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 433 }, { "completion_length": 256.0, "epoch": 3.6470588235294117, "grad_norm": 0.007352401502430439, "kl": 0.003299277974292636, "learning_rate": 3.4848547739773782e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 434 }, { "completion_length": 168.5, "epoch": 3.6554621848739495, "grad_norm": 1.317054271697998, "kl": 0.00451608095318079, "learning_rate": 3.476827821223184e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 435 }, { "completion_length": 256.0, "epoch": 3.6638655462184873, "grad_norm": 0.7991107702255249, "kl": 0.0038162926211953163, "learning_rate": 3.4687889661302577e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 436 }, { "completion_length": 256.0, "epoch": 3.6722689075630255, "grad_norm": 0.013884101063013077, "kl": 0.004812534898519516, "learning_rate": 3.460738306649509e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 437 }, { "completion_length": 256.0, "epoch": 3.6806722689075633, "grad_norm": 1.0757262706756592, "kl": 0.00270366994664073, "learning_rate": 3.452675940875686e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 438 }, { "completion_length": 256.0, "epoch": 3.689075630252101, "grad_norm": 0.006082956679165363, "kl": 0.0011224534828215837, "learning_rate": 3.4446019670461684e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 439 }, { "completion_length": 256.0, "epoch": 3.697478991596639, "grad_norm": 0.01014415267854929, "kl": 0.004205282777547836, "learning_rate": 3.436516483539781e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 440 }, { "completion_length": 256.0, "epoch": 3.7058823529411766, "grad_norm": 0.891716718673706, "kl": 0.0021522974129766226, "learning_rate": 3.4284195888755877e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 441 }, { "completion_length": 256.0, "epoch": 3.7142857142857144, "grad_norm": 0.011942795477807522, "kl": 0.005065535195171833, "learning_rate": 3.4203113817116955e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 442 }, { "completion_length": 223.5, "epoch": 3.722689075630252, "grad_norm": 0.9813386797904968, "kl": 0.002765173790976405, "learning_rate": 3.412191960844049e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 443 }, { "completion_length": 256.0, "epoch": 3.73109243697479, "grad_norm": 0.006678609177470207, "kl": 0.0018052314408123493, "learning_rate": 3.4040614252052305e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 444 }, { "completion_length": 53.5, "epoch": 3.7394957983193278, "grad_norm": 0.04845777899026871, "kl": 0.009280219674110413, "learning_rate": 3.39591987386325e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 445 }, { "completion_length": 207.5, "epoch": 3.7478991596638656, "grad_norm": 1.0265278816223145, "kl": 0.0033423197455704212, "learning_rate": 3.387767406020343e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 446 }, { "completion_length": 173.0, "epoch": 3.7563025210084033, "grad_norm": 0.012940281070768833, "kl": 0.0037863338366150856, "learning_rate": 3.3796041210117545e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 447 }, { "completion_length": 256.0, "epoch": 3.764705882352941, "grad_norm": 0.006528416648507118, "kl": 0.001202148268930614, "learning_rate": 3.3714301183045382e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 448 }, { "completion_length": 256.0, "epoch": 3.773109243697479, "grad_norm": 0.007893442176282406, "kl": 0.0012523537734523416, "learning_rate": 3.3632454974963368e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 449 }, { "completion_length": 181.0, "epoch": 3.7815126050420167, "grad_norm": 1.0977665185928345, "kl": 0.0030348082073032856, "learning_rate": 3.3550503583141726e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 450 }, { "completion_length": 239.0, "epoch": 3.7899159663865545, "grad_norm": 0.009191147983074188, "kl": 0.003200841136276722, "learning_rate": 3.346844800613229e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 451 }, { "completion_length": 256.0, "epoch": 3.7983193277310923, "grad_norm": 0.005020318552851677, "kl": 0.0013238872634246945, "learning_rate": 3.338628924375638e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 452 }, { "completion_length": 256.0, "epoch": 3.80672268907563, "grad_norm": 0.005834945011883974, "kl": 0.0015783096896484494, "learning_rate": 3.3304028297092583e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 453 }, { "completion_length": 168.5, "epoch": 3.815126050420168, "grad_norm": 0.018046200275421143, "kl": 0.003341714618727565, "learning_rate": 3.3221666168464584e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 454 }, { "completion_length": 256.0, "epoch": 3.8235294117647056, "grad_norm": 0.004400371573865414, "kl": 0.0010116810444742441, "learning_rate": 3.313920386142892e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 455 }, { "completion_length": 256.0, "epoch": 3.831932773109244, "grad_norm": 0.006444923579692841, "kl": 0.0015354679198935628, "learning_rate": 3.3056642380762783e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 456 }, { "completion_length": 210.5, "epoch": 3.8403361344537816, "grad_norm": 0.811423122882843, "kl": 0.0009610719862394035, "learning_rate": 3.2973982732451753e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 457 }, { "completion_length": 256.0, "epoch": 3.8487394957983194, "grad_norm": 0.005560672841966152, "kl": 0.0011815006146207452, "learning_rate": 3.2891225923677565e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 458 }, { "completion_length": 256.0, "epoch": 3.857142857142857, "grad_norm": 0.007573983166366816, "kl": 0.002982199192047119, "learning_rate": 3.280837296280582e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 459 }, { "completion_length": 179.5, "epoch": 3.865546218487395, "grad_norm": 0.005451290402561426, "kl": 0.0014059357345104218, "learning_rate": 3.272542485937369e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 460 }, { "completion_length": 256.0, "epoch": 3.8739495798319328, "grad_norm": 0.006299573462456465, "kl": 0.0014008242869749665, "learning_rate": 3.2642382624077647e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 461 }, { "completion_length": 256.0, "epoch": 3.8823529411764706, "grad_norm": 0.006671134382486343, "kl": 0.0014036985812708735, "learning_rate": 3.2559247268761117e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 462 }, { "completion_length": 254.5, "epoch": 3.8907563025210083, "grad_norm": 1.005020260810852, "kl": 0.0029663098976016045, "learning_rate": 3.247601980640217e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 463 }, { "completion_length": 111.5, "epoch": 3.899159663865546, "grad_norm": 0.01182496640831232, "kl": 0.0036092600785195827, "learning_rate": 3.2392701251101172e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 464 }, { "completion_length": 99.5, "epoch": 3.907563025210084, "grad_norm": 0.0145535534247756, "kl": 0.004007051698863506, "learning_rate": 3.230929261806842e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 465 }, { "completion_length": 256.0, "epoch": 3.9159663865546217, "grad_norm": 0.00581360375508666, "kl": 0.0015754429623484612, "learning_rate": 3.222579492361179e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 466 }, { "completion_length": 256.0, "epoch": 3.92436974789916, "grad_norm": 0.00808662734925747, "kl": 0.003097210079431534, "learning_rate": 3.214220918512434e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 467 }, { "completion_length": 256.0, "epoch": 3.9327731092436977, "grad_norm": 0.007082223892211914, "kl": 0.001682876143604517, "learning_rate": 3.205853642107192e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 468 }, { "completion_length": 256.0, "epoch": 3.9411764705882355, "grad_norm": 0.8255936503410339, "kl": 0.0018416057573631406, "learning_rate": 3.1974777650980737e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 469 }, { "completion_length": 256.0, "epoch": 3.9495798319327733, "grad_norm": 0.9889159798622131, "kl": 0.004538574256002903, "learning_rate": 3.189093389542498e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 470 }, { "completion_length": 256.0, "epoch": 3.957983193277311, "grad_norm": 0.004752832930535078, "kl": 0.0013536623446270823, "learning_rate": 3.180700617601436e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 471 }, { "completion_length": 256.0, "epoch": 3.966386554621849, "grad_norm": 0.6684840321540833, "kl": 0.0020658092107623816, "learning_rate": 3.1722995515381644e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 472 }, { "completion_length": 256.0, "epoch": 3.9747899159663866, "grad_norm": 0.011976254172623158, "kl": 0.004110087640583515, "learning_rate": 3.1638902937170224e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 473 }, { "completion_length": 256.0, "epoch": 3.9831932773109244, "grad_norm": 0.005737712606787682, "kl": 0.0015543876215815544, "learning_rate": 3.155472946602162e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 474 }, { "completion_length": 160.5, "epoch": 3.991596638655462, "grad_norm": 1.8022584915161133, "kl": 0.003239160403609276, "learning_rate": 3.147047612756302e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 475 }, { "completion_length": 256.0, "epoch": 4.0, "grad_norm": 0.004591153468936682, "kl": 0.0011422380339354277, "learning_rate": 3.1386143948394764e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 476 }, { "completion_length": 256.0, "epoch": 4.008403361344538, "grad_norm": 0.5948164463043213, "kl": 0.0022058887407183647, "learning_rate": 3.130173395607785e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 477 }, { "completion_length": 136.5, "epoch": 4.016806722689076, "grad_norm": 0.013069775886833668, "kl": 0.0021158354356884956, "learning_rate": 3.121724717912138e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 478 }, { "completion_length": 256.0, "epoch": 4.025210084033613, "grad_norm": 0.005377096123993397, "kl": 0.0015631868736818433, "learning_rate": 3.1132684646970068e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 479 }, { "completion_length": 256.0, "epoch": 4.033613445378151, "grad_norm": 0.004411065485328436, "kl": 0.0022693309001624584, "learning_rate": 3.1048047389991693e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 480 }, { "completion_length": 224.5, "epoch": 4.042016806722689, "grad_norm": 0.0049795713275671005, "kl": 0.0024725371040403843, "learning_rate": 3.0963336439464527e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 481 }, { "completion_length": 234.5, "epoch": 4.050420168067227, "grad_norm": 0.012814347632229328, "kl": 0.006012073718011379, "learning_rate": 3.087855282756475e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 482 }, { "completion_length": 256.0, "epoch": 4.0588235294117645, "grad_norm": 0.006354542449116707, "kl": 0.0015080994926393032, "learning_rate": 3.079369758735393e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 483 }, { "completion_length": 229.0, "epoch": 4.067226890756302, "grad_norm": 0.015650996938347816, "kl": 0.007206182926893234, "learning_rate": 3.0708771752766397e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 484 }, { "completion_length": 193.5, "epoch": 4.07563025210084, "grad_norm": 1.0958764553070068, "kl": 0.005706274416297674, "learning_rate": 3.062377635859663e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 485 }, { "completion_length": 105.5, "epoch": 4.084033613445378, "grad_norm": 0.011874957010149956, "kl": 0.0028147497214376926, "learning_rate": 3.053871244048669e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 486 }, { "completion_length": 256.0, "epoch": 4.092436974789916, "grad_norm": 0.9047056436538696, "kl": 0.004364797379821539, "learning_rate": 3.045358103491357e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 487 }, { "completion_length": 256.0, "epoch": 4.100840336134453, "grad_norm": 0.9570077657699585, "kl": 0.003153773257508874, "learning_rate": 3.0368383179176584e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 488 }, { "completion_length": 256.0, "epoch": 4.109243697478991, "grad_norm": 0.005621154326945543, "kl": 0.0014768983237445354, "learning_rate": 3.0283119911384724e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 489 }, { "completion_length": 256.0, "epoch": 4.117647058823529, "grad_norm": 0.004762676078826189, "kl": 0.0011970241321250796, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 256.0, "epoch": 4.126050420168067, "grad_norm": 0.009406670928001404, "kl": 0.004937203601002693, "learning_rate": 3.0112401296044756e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 491 }, { "completion_length": 253.0, "epoch": 4.1344537815126055, "grad_norm": 0.7030842900276184, "kl": 0.0013840115861967206, "learning_rate": 3.002694802864912e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 492 }, { "completion_length": 175.5, "epoch": 4.142857142857143, "grad_norm": 1.375542402267456, "kl": 0.00493666622787714, "learning_rate": 2.9941433509478157e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 493 }, { "completion_length": 220.0, "epoch": 4.151260504201681, "grad_norm": 0.016102584078907967, "kl": 0.006187045015394688, "learning_rate": 2.98558587804993e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 494 }, { "completion_length": 196.5, "epoch": 4.159663865546219, "grad_norm": 0.008582793176174164, "kl": 0.003388018812984228, "learning_rate": 2.9770224884413625e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 495 }, { "completion_length": 256.0, "epoch": 4.168067226890757, "grad_norm": 0.007323360536247492, "kl": 0.005748100578784943, "learning_rate": 2.9684532864643123e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 496 }, { "completion_length": 222.5, "epoch": 4.176470588235294, "grad_norm": 0.006812721956521273, "kl": 0.0031685088761150837, "learning_rate": 2.9598783765318005e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 497 }, { "completion_length": 256.0, "epoch": 4.184873949579832, "grad_norm": 0.009013836272060871, "kl": 0.0017818395281210542, "learning_rate": 2.9512978631264006e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 498 }, { "completion_length": 256.0, "epoch": 4.19327731092437, "grad_norm": 0.012076469138264656, "kl": 0.002936260076239705, "learning_rate": 2.942711850798959e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 499 }, { "completion_length": 239.5, "epoch": 4.201680672268908, "grad_norm": 0.008779402822256088, "kl": 0.005604768171906471, "learning_rate": 2.9341204441673267e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 500 }, { "completion_length": 76.5, "epoch": 4.2100840336134455, "grad_norm": 0.02335500344634056, "kl": 0.009651167318224907, "learning_rate": 2.9255237479150815e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 501 }, { "completion_length": 130.5, "epoch": 4.218487394957983, "grad_norm": 1.2524582147598267, "kl": 0.005804387852549553, "learning_rate": 2.9169218667902562e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 502 }, { "completion_length": 256.0, "epoch": 4.226890756302521, "grad_norm": 0.006882464978843927, "kl": 0.0027065572794526815, "learning_rate": 2.908314905604056e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 503 }, { "completion_length": 62.0, "epoch": 4.235294117647059, "grad_norm": 0.01219162717461586, "kl": 0.0034767217002809048, "learning_rate": 2.8997029692295875e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 504 }, { "completion_length": 256.0, "epoch": 4.243697478991597, "grad_norm": 0.006074296776205301, "kl": 0.0036840885877609253, "learning_rate": 2.8910861626005774e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 505 }, { "completion_length": 256.0, "epoch": 4.2521008403361344, "grad_norm": 0.8462468981742859, "kl": 0.0046837469562888145, "learning_rate": 2.8824645907100957e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 506 }, { "completion_length": 86.5, "epoch": 4.260504201680672, "grad_norm": 0.007137172389775515, "kl": 0.002282999688759446, "learning_rate": 2.8738383586092745e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 507 }, { "completion_length": 256.0, "epoch": 4.26890756302521, "grad_norm": 0.016919517889618874, "kl": 0.0027719922363758087, "learning_rate": 2.8652075714060296e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 508 }, { "completion_length": 256.0, "epoch": 4.277310924369748, "grad_norm": 0.008852407336235046, "kl": 0.0017030982999131083, "learning_rate": 2.8565723342637797e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 509 }, { "completion_length": 85.5, "epoch": 4.285714285714286, "grad_norm": 0.011606723070144653, "kl": 0.0017054345225915313, "learning_rate": 2.847932752400164e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 510 }, { "completion_length": 240.5, "epoch": 4.294117647058823, "grad_norm": 0.011789804324507713, "kl": 0.005251291673630476, "learning_rate": 2.8392889310857615e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 511 }, { "completion_length": 256.0, "epoch": 4.302521008403361, "grad_norm": 0.0073866331949830055, "kl": 0.00388048542663455, "learning_rate": 2.8306409756428067e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 512 }, { "completion_length": 256.0, "epoch": 4.310924369747899, "grad_norm": 0.005215913988649845, "kl": 0.0012249015271663666, "learning_rate": 2.8219889914439073e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 513 }, { "completion_length": 256.0, "epoch": 4.319327731092437, "grad_norm": 0.007592489477247, "kl": 0.0016890396364033222, "learning_rate": 2.813333083910761e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 514 }, { "completion_length": 205.0, "epoch": 4.3277310924369745, "grad_norm": 1.0085983276367188, "kl": 0.0014712801203131676, "learning_rate": 2.804673358512869e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 515 }, { "completion_length": 242.0, "epoch": 4.336134453781512, "grad_norm": 0.00572913559153676, "kl": 0.0018727960996329784, "learning_rate": 2.7960099207662535e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 516 }, { "completion_length": 256.0, "epoch": 4.34453781512605, "grad_norm": 0.011275004595518112, "kl": 0.0018798578530550003, "learning_rate": 2.7873428762321667e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 517 }, { "completion_length": 256.0, "epoch": 4.352941176470588, "grad_norm": 0.9920996427536011, "kl": 0.003402052214369178, "learning_rate": 2.778672330515814e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 518 }, { "completion_length": 256.0, "epoch": 4.361344537815126, "grad_norm": 0.010556753724813461, "kl": 0.0022839251905679703, "learning_rate": 2.769998389265057e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 519 }, { "completion_length": 92.0, "epoch": 4.369747899159664, "grad_norm": 0.011728877201676369, "kl": 0.00521480618044734, "learning_rate": 2.761321158169134e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 520 }, { "completion_length": 256.0, "epoch": 4.378151260504202, "grad_norm": 0.003867239924147725, "kl": 0.002509140409529209, "learning_rate": 2.752640742957366e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 521 }, { "completion_length": 125.0, "epoch": 4.38655462184874, "grad_norm": 0.012611414305865765, "kl": 0.00456651346758008, "learning_rate": 2.743957249397874e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 522 }, { "completion_length": 225.0, "epoch": 4.394957983193278, "grad_norm": 0.007873370312154293, "kl": 0.0036875163204967976, "learning_rate": 2.7352707832962865e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 523 }, { "completion_length": 256.0, "epoch": 4.4033613445378155, "grad_norm": 0.9747622609138489, "kl": 0.0020197294652462006, "learning_rate": 2.726581450494451e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 524 }, { "completion_length": 256.0, "epoch": 4.411764705882353, "grad_norm": 1.0448710918426514, "kl": 0.003962778020650148, "learning_rate": 2.717889356869146e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 525 }, { "completion_length": 72.0, "epoch": 4.420168067226891, "grad_norm": 0.01841016113758087, "kl": 0.007685003336519003, "learning_rate": 2.70919460833079e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 526 }, { "completion_length": 256.0, "epoch": 4.428571428571429, "grad_norm": 0.006075122393667698, "kl": 0.003358443733304739, "learning_rate": 2.700497310822147e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 527 }, { "completion_length": 256.0, "epoch": 4.436974789915967, "grad_norm": 0.005677506327629089, "kl": 0.002728064078837633, "learning_rate": 2.6917975703170466e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 528 }, { "completion_length": 218.5, "epoch": 4.445378151260504, "grad_norm": 0.008011857978999615, "kl": 0.004073669668287039, "learning_rate": 2.6830954928190795e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 529 }, { "completion_length": 193.0, "epoch": 4.453781512605042, "grad_norm": 0.008171777240931988, "kl": 0.003784970613196492, "learning_rate": 2.6743911843603134e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 530 }, { "completion_length": 256.0, "epoch": 4.46218487394958, "grad_norm": 0.005833589006215334, "kl": 0.0014463107800111175, "learning_rate": 2.6656847510000013e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 531 }, { "completion_length": 256.0, "epoch": 4.470588235294118, "grad_norm": 0.8191768527030945, "kl": 0.0032632295042276382, "learning_rate": 2.6569762988232838e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 532 }, { "completion_length": 256.0, "epoch": 4.4789915966386555, "grad_norm": 0.0072319842875003815, "kl": 0.0016874033026397228, "learning_rate": 2.6482659339399047e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 533 }, { "completion_length": 256.0, "epoch": 4.487394957983193, "grad_norm": 0.0052573406137526035, "kl": 0.0014978109393268824, "learning_rate": 2.63955376248291e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 534 }, { "completion_length": 256.0, "epoch": 4.495798319327731, "grad_norm": 0.8704063296318054, "kl": 0.0018389747710898519, "learning_rate": 2.6308398906073603e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 535 }, { "completion_length": 256.0, "epoch": 4.504201680672269, "grad_norm": 0.005076680798083544, "kl": 0.0013596423668786883, "learning_rate": 2.6221244244890336e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 536 }, { "completion_length": 226.0, "epoch": 4.512605042016807, "grad_norm": 0.010387287475168705, "kl": 0.0037782860454171896, "learning_rate": 2.613407470323134e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 537 }, { "completion_length": 173.0, "epoch": 4.5210084033613445, "grad_norm": 0.007874002680182457, "kl": 0.0031023588962852955, "learning_rate": 2.604689134322999e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 538 }, { "completion_length": 197.5, "epoch": 4.529411764705882, "grad_norm": 0.007560485042631626, "kl": 0.003182163927704096, "learning_rate": 2.5959695227188e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 539 }, { "completion_length": 149.0, "epoch": 4.53781512605042, "grad_norm": 0.9829975366592407, "kl": 0.005002903752028942, "learning_rate": 2.587248741756253e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 540 }, { "completion_length": 256.0, "epoch": 4.546218487394958, "grad_norm": 0.008099050261080265, "kl": 0.0012714212061837316, "learning_rate": 2.578526897695321e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 541 }, { "completion_length": 256.0, "epoch": 4.554621848739496, "grad_norm": 0.00789720006287098, "kl": 0.0017992565408349037, "learning_rate": 2.569804096808923e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 542 }, { "completion_length": 256.0, "epoch": 4.563025210084033, "grad_norm": 0.00541194761171937, "kl": 0.0014942132402211428, "learning_rate": 2.5610804453816333e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 543 }, { "completion_length": 256.0, "epoch": 4.571428571428571, "grad_norm": 0.9269772171974182, "kl": 0.003507905174046755, "learning_rate": 2.5523560497083927e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 544 }, { "completion_length": 132.5, "epoch": 4.579831932773109, "grad_norm": 0.010006463155150414, "kl": 0.003050783183425665, "learning_rate": 2.543631016093209e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 545 }, { "completion_length": 130.0, "epoch": 4.588235294117647, "grad_norm": 0.015947597101330757, "kl": 0.003334134817123413, "learning_rate": 2.5349054508478636e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 546 }, { "completion_length": 256.0, "epoch": 4.5966386554621845, "grad_norm": 0.7701818346977234, "kl": 0.0018773985793814063, "learning_rate": 2.526179460290615e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 547 }, { "completion_length": 256.0, "epoch": 4.605042016806722, "grad_norm": 0.004993731621652842, "kl": 0.0010455910814926028, "learning_rate": 2.517453150744904e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 548 }, { "completion_length": 194.5, "epoch": 4.61344537815126, "grad_norm": 0.00883491337299347, "kl": 0.0054036714136600494, "learning_rate": 2.5087266285380597e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 549 }, { "completion_length": 256.0, "epoch": 4.621848739495798, "grad_norm": 0.9423441886901855, "kl": 0.004415466915816069, "learning_rate": 2.5e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 550 }, { "completion_length": 165.0, "epoch": 4.630252100840336, "grad_norm": 0.015781870111823082, "kl": 0.006485165562480688, "learning_rate": 2.4912733714619415e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 551 }, { "completion_length": 224.5, "epoch": 4.6386554621848735, "grad_norm": 0.012608792632818222, "kl": 0.003756849095225334, "learning_rate": 2.482546849255096e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 552 }, { "completion_length": 256.0, "epoch": 4.647058823529412, "grad_norm": 0.00909358263015747, "kl": 0.0042377253994345665, "learning_rate": 2.4738205397093863e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 553 }, { "completion_length": 221.5, "epoch": 4.65546218487395, "grad_norm": 0.9097227454185486, "kl": 0.003059991169720888, "learning_rate": 2.4650945491521372e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 554 }, { "completion_length": 256.0, "epoch": 4.663865546218488, "grad_norm": 0.9847775101661682, "kl": 0.003004086669534445, "learning_rate": 2.4563689839067913e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 555 }, { "completion_length": 256.0, "epoch": 4.6722689075630255, "grad_norm": 0.0057894145138561726, "kl": 0.00207483465783298, "learning_rate": 2.447643950291608e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 556 }, { "completion_length": 256.0, "epoch": 4.680672268907563, "grad_norm": 0.005377934779971838, "kl": 0.0012246626429259777, "learning_rate": 2.4389195546183676e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 557 }, { "completion_length": 201.0, "epoch": 4.689075630252101, "grad_norm": 0.00967450626194477, "kl": 0.00491796201094985, "learning_rate": 2.4301959031910785e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 558 }, { "completion_length": 256.0, "epoch": 4.697478991596639, "grad_norm": 0.007963428273797035, "kl": 0.004405993968248367, "learning_rate": 2.4214731023046795e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 559 }, { "completion_length": 256.0, "epoch": 4.705882352941177, "grad_norm": 0.006179140880703926, "kl": 0.0015872953226789832, "learning_rate": 2.4127512582437486e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 560 }, { "completion_length": 247.0, "epoch": 4.714285714285714, "grad_norm": 0.8192089200019836, "kl": 0.0043661887757480145, "learning_rate": 2.4040304772812002e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 561 }, { "completion_length": 256.0, "epoch": 4.722689075630252, "grad_norm": 0.008218436501920223, "kl": 0.001822912017814815, "learning_rate": 2.3953108656770018e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 562 }, { "completion_length": 256.0, "epoch": 4.73109243697479, "grad_norm": 0.005964998155832291, "kl": 0.0014693336561322212, "learning_rate": 2.3865925296768658e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 563 }, { "completion_length": 256.0, "epoch": 4.739495798319328, "grad_norm": 0.006740473210811615, "kl": 0.0026240209117531776, "learning_rate": 2.377875575510967e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 564 }, { "completion_length": 222.5, "epoch": 4.7478991596638656, "grad_norm": 0.013996557332575321, "kl": 0.009519457817077637, "learning_rate": 2.3691601093926406e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 565 }, { "completion_length": 256.0, "epoch": 4.756302521008403, "grad_norm": 0.0056951311416924, "kl": 0.0015081402380019426, "learning_rate": 2.3604462375170905e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 566 }, { "completion_length": 235.5, "epoch": 4.764705882352941, "grad_norm": 0.008997224271297455, "kl": 0.003933947999030352, "learning_rate": 2.3517340660600965e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 567 }, { "completion_length": 256.0, "epoch": 4.773109243697479, "grad_norm": 0.00782682653516531, "kl": 0.002357708290219307, "learning_rate": 2.3430237011767166e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 568 }, { "completion_length": 81.0, "epoch": 4.781512605042017, "grad_norm": 0.010526585392653942, "kl": 0.0018760713282972574, "learning_rate": 2.3343152490000004e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 569 }, { "completion_length": 208.0, "epoch": 4.7899159663865545, "grad_norm": 0.012593524530529976, "kl": 0.007487342227250338, "learning_rate": 2.325608815639687e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 570 }, { "completion_length": 256.0, "epoch": 4.798319327731092, "grad_norm": 0.007433582097291946, "kl": 0.0013662229757755995, "learning_rate": 2.3169045071809217e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 571 }, { "completion_length": 256.0, "epoch": 4.80672268907563, "grad_norm": 0.004651958122849464, "kl": 0.0013910499401390553, "learning_rate": 2.3082024296829538e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 572 }, { "completion_length": 256.0, "epoch": 4.815126050420168, "grad_norm": 0.005414290819317102, "kl": 0.001637004897929728, "learning_rate": 2.2995026891778533e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 573 }, { "completion_length": 215.0, "epoch": 4.823529411764706, "grad_norm": 0.008105294778943062, "kl": 0.002776593901216984, "learning_rate": 2.290805391669212e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 574 }, { "completion_length": 256.0, "epoch": 4.831932773109243, "grad_norm": 0.005041723605245352, "kl": 0.001295585767365992, "learning_rate": 2.2821106431308546e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 575 }, { "completion_length": 256.0, "epoch": 4.840336134453781, "grad_norm": 0.9494178891181946, "kl": 0.003161403816193342, "learning_rate": 2.2734185495055503e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 576 }, { "completion_length": 256.0, "epoch": 4.848739495798319, "grad_norm": 0.0093673225492239, "kl": 0.004909573122859001, "learning_rate": 2.2647292167037143e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 577 }, { "completion_length": 162.0, "epoch": 4.857142857142857, "grad_norm": 0.011539540253579617, "kl": 0.00792689435184002, "learning_rate": 2.256042750602127e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 578 }, { "completion_length": 47.0, "epoch": 4.865546218487395, "grad_norm": 0.011545287445187569, "kl": 0.004370143637061119, "learning_rate": 2.2473592570426343e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 579 }, { "completion_length": 149.0, "epoch": 4.873949579831933, "grad_norm": 0.019975053146481514, "kl": 0.009017436765134335, "learning_rate": 2.238678841830867e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 580 }, { "completion_length": 163.5, "epoch": 4.882352941176471, "grad_norm": 0.00912118423730135, "kl": 0.0029539535753428936, "learning_rate": 2.230001610734943e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 581 }, { "completion_length": 256.0, "epoch": 4.890756302521009, "grad_norm": 0.007083150092512369, "kl": 0.0018709826981648803, "learning_rate": 2.2213276694841866e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 582 }, { "completion_length": 256.0, "epoch": 4.899159663865547, "grad_norm": 0.004667225759476423, "kl": 0.001464409870095551, "learning_rate": 2.212657123767834e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 583 }, { "completion_length": 256.0, "epoch": 4.907563025210084, "grad_norm": 0.005719732493162155, "kl": 0.001290852203965187, "learning_rate": 2.2039900792337477e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 584 }, { "completion_length": 238.5, "epoch": 4.915966386554622, "grad_norm": 0.009425394237041473, "kl": 0.004328674636781216, "learning_rate": 2.195326641487132e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 585 }, { "completion_length": 256.0, "epoch": 4.92436974789916, "grad_norm": 0.005701694171875715, "kl": 0.0012399954721331596, "learning_rate": 2.186666916089239e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 586 }, { "completion_length": 177.0, "epoch": 4.932773109243698, "grad_norm": 1.0734918117523193, "kl": 0.002691796747967601, "learning_rate": 2.1780110085560935e-06, "loss": 0.0001, "reward": 0.13099999725818634, "reward_std": 0.16829141974449158, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13099999725818634, "step": 587 }, { "completion_length": 256.0, "epoch": 4.9411764705882355, "grad_norm": 0.004647921770811081, "kl": 0.0013063537189736962, "learning_rate": 2.1693590243571937e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 588 }, { "completion_length": 256.0, "epoch": 4.949579831932773, "grad_norm": 0.005380407907068729, "kl": 0.0011306003434583545, "learning_rate": 2.1607110689142393e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 589 }, { "completion_length": 107.0, "epoch": 4.957983193277311, "grad_norm": 1.5422990322113037, "kl": 0.005496771540492773, "learning_rate": 2.1520672475998374e-06, "loss": 0.0002, "reward": 0.011500000953674316, "reward_std": 0.3372899293899536, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011500000953674316, "step": 590 }, { "completion_length": 256.0, "epoch": 4.966386554621849, "grad_norm": 0.7495124340057373, "kl": 0.002157171256840229, "learning_rate": 2.143427665736221e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 591 }, { "completion_length": 256.0, "epoch": 4.974789915966387, "grad_norm": 0.005112155806273222, "kl": 0.0013939437922090292, "learning_rate": 2.134792428593971e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 592 }, { "completion_length": 256.0, "epoch": 4.983193277310924, "grad_norm": 0.9338951706886292, "kl": 0.0017637673299759626, "learning_rate": 2.1261616413907267e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 593 }, { "completion_length": 256.0, "epoch": 4.991596638655462, "grad_norm": 0.009793954901397228, "kl": 0.003739282488822937, "learning_rate": 2.117535409289905e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 594 }, { "completion_length": 256.0, "epoch": 5.0, "grad_norm": 0.778442919254303, "kl": 0.0019971742294728756, "learning_rate": 2.1089138373994226e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 595 }, { "completion_length": 183.5, "epoch": 5.008403361344538, "grad_norm": 1.1267075538635254, "kl": 0.0035133552737534046, "learning_rate": 2.1002970307704134e-06, "loss": 0.0001, "reward": 0.05350000038743019, "reward_std": 0.10111626982688904, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05350000038743019, "step": 596 }, { "completion_length": 81.5, "epoch": 5.016806722689076, "grad_norm": 0.010034611448645592, "kl": 0.0018179346807301044, "learning_rate": 2.0916850943959453e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 597 }, { "completion_length": 192.5, "epoch": 5.025210084033613, "grad_norm": 0.014215071685612202, "kl": 0.00495352316647768, "learning_rate": 2.0830781332097446e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 598 }, { "completion_length": 256.0, "epoch": 5.033613445378151, "grad_norm": 0.007996601983904839, "kl": 0.002420376054942608, "learning_rate": 2.0744762520849193e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 599 }, { "completion_length": 256.0, "epoch": 5.042016806722689, "grad_norm": 0.9023677706718445, "kl": 0.0030358454678207636, "learning_rate": 2.0658795558326745e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 600 }, { "completion_length": 256.0, "epoch": 5.050420168067227, "grad_norm": 0.007396819535642862, "kl": 0.002313095610588789, "learning_rate": 2.0572881492010423e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 601 }, { "completion_length": 106.0, "epoch": 5.0588235294117645, "grad_norm": 1.4781310558319092, "kl": 0.004530521109700203, "learning_rate": 2.0487021368736002e-06, "loss": 0.0002, "reward": -0.036500006914138794, "reward_std": 0.405172199010849, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.036500006914138794, "step": 602 }, { "completion_length": 147.0, "epoch": 5.067226890756302, "grad_norm": 1.5048121213912964, "kl": 0.0028237123042345047, "learning_rate": 2.0401216234682e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 603 }, { "completion_length": 256.0, "epoch": 5.07563025210084, "grad_norm": 0.9168094992637634, "kl": 0.002460700459778309, "learning_rate": 2.031546713535688e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 604 }, { "completion_length": 256.0, "epoch": 5.084033613445378, "grad_norm": 0.0052018435671925545, "kl": 0.0012284009717404842, "learning_rate": 2.022977511558638e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 605 }, { "completion_length": 256.0, "epoch": 5.092436974789916, "grad_norm": 0.00673392228782177, "kl": 0.0020043575204908848, "learning_rate": 2.0144141219500707e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 606 }, { "completion_length": 174.0, "epoch": 5.100840336134453, "grad_norm": 0.006595225539058447, "kl": 0.0011504953727126122, "learning_rate": 2.0058566490521848e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 607 }, { "completion_length": 256.0, "epoch": 5.109243697478991, "grad_norm": 0.007908586412668228, "kl": 0.002720660762861371, "learning_rate": 1.997305197135089e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 608 }, { "completion_length": 114.0, "epoch": 5.117647058823529, "grad_norm": 0.01399979181587696, "kl": 0.0044239601120352745, "learning_rate": 1.9887598703955244e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 609 }, { "completion_length": 256.0, "epoch": 5.126050420168067, "grad_norm": 0.010668579488992691, "kl": 0.003608780214563012, "learning_rate": 1.9802207729556023e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 610 }, { "completion_length": 256.0, "epoch": 5.1344537815126055, "grad_norm": 0.6893883943557739, "kl": 0.0010929639684036374, "learning_rate": 1.971688008861529e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 611 }, { "completion_length": 256.0, "epoch": 5.142857142857143, "grad_norm": 0.00859519187361002, "kl": 0.0022062319330871105, "learning_rate": 1.963161682082342e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 612 }, { "completion_length": 256.0, "epoch": 5.151260504201681, "grad_norm": 0.010591890662908554, "kl": 0.001303770812228322, "learning_rate": 1.9546418965086444e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 613 }, { "completion_length": 256.0, "epoch": 5.159663865546219, "grad_norm": 0.005686614662408829, "kl": 0.0012636040337383747, "learning_rate": 1.946128755951332e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 614 }, { "completion_length": 256.0, "epoch": 5.168067226890757, "grad_norm": 0.010728970170021057, "kl": 0.0033416524529457092, "learning_rate": 1.937622364140338e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 615 }, { "completion_length": 124.0, "epoch": 5.176470588235294, "grad_norm": 0.009243376553058624, "kl": 0.0029700498562306166, "learning_rate": 1.9291228247233607e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 616 }, { "completion_length": 256.0, "epoch": 5.184873949579832, "grad_norm": 0.007049089763313532, "kl": 0.0016103996895253658, "learning_rate": 1.9206302412646074e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 617 }, { "completion_length": 254.0, "epoch": 5.19327731092437, "grad_norm": 0.008066593669354916, "kl": 0.0027826607692986727, "learning_rate": 1.912144717243525e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 618 }, { "completion_length": 256.0, "epoch": 5.201680672268908, "grad_norm": 0.00692775147035718, "kl": 0.0014902278780937195, "learning_rate": 1.9036663560535484e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 619 }, { "completion_length": 256.0, "epoch": 5.2100840336134455, "grad_norm": 0.004956497810781002, "kl": 0.0014455565251410007, "learning_rate": 1.895195261000831e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 620 }, { "completion_length": 256.0, "epoch": 5.218487394957983, "grad_norm": 0.00778032885864377, "kl": 0.002033608965575695, "learning_rate": 1.8867315353029937e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 621 }, { "completion_length": 61.0, "epoch": 5.226890756302521, "grad_norm": 0.01777837611734867, "kl": 0.004060863517224789, "learning_rate": 1.8782752820878636e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 622 }, { "completion_length": 256.0, "epoch": 5.235294117647059, "grad_norm": 0.005578409414738417, "kl": 0.0013669562758877873, "learning_rate": 1.8698266043922159e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 623 }, { "completion_length": 256.0, "epoch": 5.243697478991597, "grad_norm": 0.010048930533230305, "kl": 0.0016810063971206546, "learning_rate": 1.8613856051605242e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 624 }, { "completion_length": 256.0, "epoch": 5.2521008403361344, "grad_norm": 0.0064218840561807156, "kl": 0.0015595563454553485, "learning_rate": 1.852952387243698e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 625 }, { "completion_length": 256.0, "epoch": 5.260504201680672, "grad_norm": 0.00970874261111021, "kl": 0.00171267322730273, "learning_rate": 1.8445270533978387e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 626 }, { "completion_length": 256.0, "epoch": 5.26890756302521, "grad_norm": 0.8007743954658508, "kl": 0.002233549254015088, "learning_rate": 1.836109706282978e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 627 }, { "completion_length": 256.0, "epoch": 5.277310924369748, "grad_norm": 0.006997249089181423, "kl": 0.0015015705721452832, "learning_rate": 1.827700448461836e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 628 }, { "completion_length": 256.0, "epoch": 5.285714285714286, "grad_norm": 0.0077733504585921764, "kl": 0.0017355125164613128, "learning_rate": 1.8192993823985643e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 629 }, { "completion_length": 256.0, "epoch": 5.294117647058823, "grad_norm": 0.9654385447502136, "kl": 0.001992179546505213, "learning_rate": 1.8109066104575023e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 630 }, { "completion_length": 185.5, "epoch": 5.302521008403361, "grad_norm": 0.9311289191246033, "kl": 0.0022277659736573696, "learning_rate": 1.8025222349019273e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 631 }, { "completion_length": 256.0, "epoch": 5.310924369747899, "grad_norm": 1.0901347398757935, "kl": 0.002340996637940407, "learning_rate": 1.7941463578928088e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 632 }, { "completion_length": 216.0, "epoch": 5.319327731092437, "grad_norm": 0.9156938791275024, "kl": 0.0022040936164557934, "learning_rate": 1.7857790814875665e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 633 }, { "completion_length": 227.0, "epoch": 5.3277310924369745, "grad_norm": 0.007956058718264103, "kl": 0.0018517525168135762, "learning_rate": 1.7774205076388207e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 634 }, { "completion_length": 256.0, "epoch": 5.336134453781512, "grad_norm": 0.8314154148101807, "kl": 0.0011289010290056467, "learning_rate": 1.7690707381931585e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 635 }, { "completion_length": 256.0, "epoch": 5.34453781512605, "grad_norm": 0.8889763951301575, "kl": 0.0030732210725545883, "learning_rate": 1.7607298748898844e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 636 }, { "completion_length": 141.0, "epoch": 5.352941176470588, "grad_norm": 0.007906573824584484, "kl": 0.0014149026246741414, "learning_rate": 1.7523980193597837e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 637 }, { "completion_length": 256.0, "epoch": 5.361344537815126, "grad_norm": 0.006772981956601143, "kl": 0.0013478193432092667, "learning_rate": 1.744075273123889e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 638 }, { "completion_length": 256.0, "epoch": 5.369747899159664, "grad_norm": 0.008181373588740826, "kl": 0.0017395949689671397, "learning_rate": 1.735761737592236e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 639 }, { "completion_length": 256.0, "epoch": 5.378151260504202, "grad_norm": 0.005370739381760359, "kl": 0.001072113518603146, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 640 }, { "completion_length": 232.5, "epoch": 5.38655462184874, "grad_norm": 0.9679915308952332, "kl": 0.0021699154749512672, "learning_rate": 1.7191627037194187e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 641 }, { "completion_length": 201.5, "epoch": 5.394957983193278, "grad_norm": 1.1151145696640015, "kl": 0.0028039617463946342, "learning_rate": 1.7108774076322443e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 642 }, { "completion_length": 255.0, "epoch": 5.4033613445378155, "grad_norm": 1.0353115797042847, "kl": 0.00225728377699852, "learning_rate": 1.702601726754825e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 643 }, { "completion_length": 256.0, "epoch": 5.411764705882353, "grad_norm": 0.006345115136355162, "kl": 0.00175963225774467, "learning_rate": 1.6943357619237227e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 644 }, { "completion_length": 256.0, "epoch": 5.420168067226891, "grad_norm": 0.003881037700921297, "kl": 0.0009053158573806286, "learning_rate": 1.686079613857109e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 645 }, { "completion_length": 256.0, "epoch": 5.428571428571429, "grad_norm": 0.888401448726654, "kl": 0.0035264804027974606, "learning_rate": 1.677833383153542e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 646 }, { "completion_length": 256.0, "epoch": 5.436974789915967, "grad_norm": 0.9069778323173523, "kl": 0.0028541414067149162, "learning_rate": 1.6695971702907425e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 647 }, { "completion_length": 256.0, "epoch": 5.445378151260504, "grad_norm": 0.010582579299807549, "kl": 0.002708235289901495, "learning_rate": 1.661371075624363e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 648 }, { "completion_length": 228.0, "epoch": 5.453781512605042, "grad_norm": 0.013263936154544353, "kl": 0.004309320822358131, "learning_rate": 1.6531551993867717e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 649 }, { "completion_length": 256.0, "epoch": 5.46218487394958, "grad_norm": 1.0819220542907715, "kl": 0.0037744459696114063, "learning_rate": 1.6449496416858285e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 650 }, { "completion_length": 256.0, "epoch": 5.470588235294118, "grad_norm": 0.006522004958242178, "kl": 0.0014624681789427996, "learning_rate": 1.6367545025036634e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 651 }, { "completion_length": 256.0, "epoch": 5.4789915966386555, "grad_norm": 0.7281799912452698, "kl": 0.0016807356150820851, "learning_rate": 1.6285698816954626e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 652 }, { "completion_length": 188.5, "epoch": 5.487394957983193, "grad_norm": 1.1055647134780884, "kl": 0.004313978832215071, "learning_rate": 1.6203958789882457e-06, "loss": 0.0002, "reward": 0.02499999850988388, "reward_std": 0.1414213627576828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02499999850988388, "step": 653 }, { "completion_length": 256.0, "epoch": 5.495798319327731, "grad_norm": 0.006589329335838556, "kl": 0.0013943819794803858, "learning_rate": 1.612232593979658e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 654 }, { "completion_length": 164.0, "epoch": 5.504201680672269, "grad_norm": 1.3845067024230957, "kl": 0.003736726939678192, "learning_rate": 1.6040801261367494e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 655 }, { "completion_length": 205.5, "epoch": 5.512605042016807, "grad_norm": 1.0448923110961914, "kl": 0.00400229636579752, "learning_rate": 1.5959385747947697e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 656 }, { "completion_length": 256.0, "epoch": 5.5210084033613445, "grad_norm": 0.006103715393692255, "kl": 0.001473315292969346, "learning_rate": 1.5878080391559507e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 657 }, { "completion_length": 256.0, "epoch": 5.529411764705882, "grad_norm": 0.005370413884520531, "kl": 0.0012742540566250682, "learning_rate": 1.5796886182883053e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 658 }, { "completion_length": 226.0, "epoch": 5.53781512605042, "grad_norm": 0.8645743727684021, "kl": 0.00478063989430666, "learning_rate": 1.5715804111244138e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 659 }, { "completion_length": 190.5, "epoch": 5.546218487394958, "grad_norm": 0.012256038375198841, "kl": 0.0029752724803984165, "learning_rate": 1.56348351646022e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 660 }, { "completion_length": 256.0, "epoch": 5.554621848739496, "grad_norm": 0.008987871930003166, "kl": 0.0017543898429721594, "learning_rate": 1.5553980329538326e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 661 }, { "completion_length": 151.0, "epoch": 5.563025210084033, "grad_norm": 0.013877416029572487, "kl": 0.003725190181285143, "learning_rate": 1.547324059124315e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 662 }, { "completion_length": 256.0, "epoch": 5.571428571428571, "grad_norm": 0.9429757595062256, "kl": 0.003095134161412716, "learning_rate": 1.539261693350491e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 663 }, { "completion_length": 229.0, "epoch": 5.579831932773109, "grad_norm": 0.008015704341232777, "kl": 0.0036908253096044064, "learning_rate": 1.5312110338697427e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 664 }, { "completion_length": 256.0, "epoch": 5.588235294117647, "grad_norm": 0.7633675932884216, "kl": 0.0033783838152885437, "learning_rate": 1.5231721787768162e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 665 }, { "completion_length": 256.0, "epoch": 5.5966386554621845, "grad_norm": 0.0047802249900996685, "kl": 0.0011932000052183867, "learning_rate": 1.5151452260226224e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 666 }, { "completion_length": 256.0, "epoch": 5.605042016806722, "grad_norm": 0.005264324601739645, "kl": 0.0013470638077706099, "learning_rate": 1.5071302734130488e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 667 }, { "completion_length": 256.0, "epoch": 5.61344537815126, "grad_norm": 0.015357961878180504, "kl": 0.005388913210481405, "learning_rate": 1.4991274186077632e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 668 }, { "completion_length": 92.5, "epoch": 5.621848739495798, "grad_norm": 1.4165841341018677, "kl": 0.003169140312820673, "learning_rate": 1.491136759119025e-06, "loss": 0.0001, "reward": 0.10300000011920929, "reward_std": 0.2078893929719925, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10300000011920929, "step": 669 }, { "completion_length": 153.0, "epoch": 5.630252100840336, "grad_norm": 0.01995934173464775, "kl": 0.012218874879181385, "learning_rate": 1.4831583923105e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 670 }, { "completion_length": 196.0, "epoch": 5.6386554621848735, "grad_norm": 0.01334353070706129, "kl": 0.003748738905414939, "learning_rate": 1.4751924153960681e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 671 }, { "completion_length": 218.5, "epoch": 5.647058823529412, "grad_norm": 0.007993188686668873, "kl": 0.0033607950899749994, "learning_rate": 1.467238925438646e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 672 }, { "completion_length": 131.0, "epoch": 5.65546218487395, "grad_norm": 0.009187704883515835, "kl": 0.0036307224072515965, "learning_rate": 1.4592980193489975e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 673 }, { "completion_length": 256.0, "epoch": 5.663865546218488, "grad_norm": 0.0057126060128211975, "kl": 0.0013398557202890515, "learning_rate": 1.4513697938845571e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 674 }, { "completion_length": 256.0, "epoch": 5.6722689075630255, "grad_norm": 0.8608607053756714, "kl": 0.004331071395426989, "learning_rate": 1.443454345648252e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 675 }, { "completion_length": 115.5, "epoch": 5.680672268907563, "grad_norm": 0.03891675919294357, "kl": 0.003764514811336994, "learning_rate": 1.4355517710873184e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 676 }, { "completion_length": 226.0, "epoch": 5.689075630252101, "grad_norm": 1.0531610250473022, "kl": 0.005441954359412193, "learning_rate": 1.4276621664921358e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 677 }, { "completion_length": 256.0, "epoch": 5.697478991596639, "grad_norm": 0.005322211422026157, "kl": 0.001616867957636714, "learning_rate": 1.419785627995044e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 678 }, { "completion_length": 231.5, "epoch": 5.705882352941177, "grad_norm": 0.011647414416074753, "kl": 0.0055820532143116, "learning_rate": 1.4119222515691817e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 679 }, { "completion_length": 256.0, "epoch": 5.714285714285714, "grad_norm": 0.00758923776447773, "kl": 0.00217791018076241, "learning_rate": 1.4040721330273063e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 680 }, { "completion_length": 57.0, "epoch": 5.722689075630252, "grad_norm": 0.032685499638319016, "kl": 0.009352599270641804, "learning_rate": 1.3962353680206372e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 681 }, { "completion_length": 256.0, "epoch": 5.73109243697479, "grad_norm": 0.006033693440258503, "kl": 0.0013116300106048584, "learning_rate": 1.388412052037682e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 682 }, { "completion_length": 256.0, "epoch": 5.739495798319328, "grad_norm": 0.00581104913726449, "kl": 0.0014421817613765597, "learning_rate": 1.380602280403076e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 683 }, { "completion_length": 256.0, "epoch": 5.7478991596638656, "grad_norm": 0.005783306434750557, "kl": 0.0015577011508867145, "learning_rate": 1.3728061482764238e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 684 }, { "completion_length": 256.0, "epoch": 5.756302521008403, "grad_norm": 0.007445093709975481, "kl": 0.004359540995210409, "learning_rate": 1.3650237506511333e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 685 }, { "completion_length": 91.0, "epoch": 5.764705882352941, "grad_norm": 0.009693341329693794, "kl": 0.0033527310006320477, "learning_rate": 1.3572551823532654e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 686 }, { "completion_length": 160.5, "epoch": 5.773109243697479, "grad_norm": 0.006293878424912691, "kl": 0.0018183618085458875, "learning_rate": 1.349500538040371e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 687 }, { "completion_length": 214.5, "epoch": 5.781512605042017, "grad_norm": 0.9091009497642517, "kl": 0.001669886289164424, "learning_rate": 1.3417599122003464e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 688 }, { "completion_length": 256.0, "epoch": 5.7899159663865545, "grad_norm": 0.005193948280066252, "kl": 0.0012114944402128458, "learning_rate": 1.3340333991502723e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 689 }, { "completion_length": 166.5, "epoch": 5.798319327731092, "grad_norm": 0.026883814483880997, "kl": 0.01199945155531168, "learning_rate": 1.3263210930352737e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 690 }, { "completion_length": 240.0, "epoch": 5.80672268907563, "grad_norm": 0.7627358436584473, "kl": 0.003694333368912339, "learning_rate": 1.3186230878273654e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 691 }, { "completion_length": 256.0, "epoch": 5.815126050420168, "grad_norm": 0.004555881954729557, "kl": 0.0012313422048464417, "learning_rate": 1.3109394773243117e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 692 }, { "completion_length": 256.0, "epoch": 5.823529411764706, "grad_norm": 0.0075421710498631, "kl": 0.0017848407151177526, "learning_rate": 1.3032703551484832e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 693 }, { "completion_length": 256.0, "epoch": 5.831932773109243, "grad_norm": 0.008081227540969849, "kl": 0.001596082467585802, "learning_rate": 1.2956158147457116e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 694 }, { "completion_length": 256.0, "epoch": 5.840336134453781, "grad_norm": 0.008879891596734524, "kl": 0.0040673753246665, "learning_rate": 1.2879759493841577e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 695 }, { "completion_length": 256.0, "epoch": 5.848739495798319, "grad_norm": 0.00888338778167963, "kl": 0.0022750969510525465, "learning_rate": 1.280350852153168e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 696 }, { "completion_length": 256.0, "epoch": 5.857142857142857, "grad_norm": 0.006257389206439257, "kl": 0.0026615536771714687, "learning_rate": 1.272740615962148e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 697 }, { "completion_length": 226.0, "epoch": 5.865546218487395, "grad_norm": 0.007410340942442417, "kl": 0.0032301319297403097, "learning_rate": 1.2651453335394232e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 698 }, { "completion_length": 107.0, "epoch": 5.873949579831933, "grad_norm": 1.191349983215332, "kl": 0.005140444729477167, "learning_rate": 1.2575650974311118e-06, "loss": 0.0002, "reward": -0.024000002071261406, "reward_std": 0.07778174430131912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.024000002071261406, "step": 699 }, { "completion_length": 256.0, "epoch": 5.882352941176471, "grad_norm": 0.008943052031099796, "kl": 0.002110084518790245, "learning_rate": 1.2500000000000007e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 700 }, { "completion_length": 256.0, "epoch": 5.890756302521009, "grad_norm": 0.008983226493000984, "kl": 0.005151739344000816, "learning_rate": 1.2424501334244124e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 701 }, { "completion_length": 118.5, "epoch": 5.899159663865547, "grad_norm": 0.025301320478320122, "kl": 0.0027322620153427124, "learning_rate": 1.234915589697091e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 702 }, { "completion_length": 256.0, "epoch": 5.907563025210084, "grad_norm": 0.004267112351953983, "kl": 0.002253488637506962, "learning_rate": 1.2273964606240718e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 703 }, { "completion_length": 218.0, "epoch": 5.915966386554622, "grad_norm": 0.01178926695138216, "kl": 0.003751843236386776, "learning_rate": 1.2198928378235717e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 704 }, { "completion_length": 190.5, "epoch": 5.92436974789916, "grad_norm": 1.1801326274871826, "kl": 0.00372373778373003, "learning_rate": 1.2124048127248644e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 705 }, { "completion_length": 117.5, "epoch": 5.932773109243698, "grad_norm": 0.012497540563344955, "kl": 0.008952782489359379, "learning_rate": 1.204932476567175e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 706 }, { "completion_length": 256.0, "epoch": 5.9411764705882355, "grad_norm": 0.004102770239114761, "kl": 0.0012483976315706968, "learning_rate": 1.19747592039856e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 707 }, { "completion_length": 256.0, "epoch": 5.949579831932773, "grad_norm": 0.9020516276359558, "kl": 0.0023504141718149185, "learning_rate": 1.1900352350748026e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 708 }, { "completion_length": 256.0, "epoch": 5.957983193277311, "grad_norm": 0.006130140274763107, "kl": 0.0014110131887719035, "learning_rate": 1.1826105112583061e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 709 }, { "completion_length": 185.5, "epoch": 5.966386554621849, "grad_norm": 0.00854600127786398, "kl": 0.003469187766313553, "learning_rate": 1.1752018394169882e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 710 }, { "completion_length": 256.0, "epoch": 5.974789915966387, "grad_norm": 0.004993244539946318, "kl": 0.0013796681305393577, "learning_rate": 1.1678093098231748e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 711 }, { "completion_length": 256.0, "epoch": 5.983193277310924, "grad_norm": 0.014543093740940094, "kl": 0.00401286268606782, "learning_rate": 1.160433012552508e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 712 }, { "completion_length": 116.0, "epoch": 5.991596638655462, "grad_norm": 0.01835561916232109, "kl": 0.007430621888488531, "learning_rate": 1.1530730374828422e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 713 }, { "completion_length": 256.0, "epoch": 6.0, "grad_norm": 0.9012296199798584, "kl": 0.0021608606912195683, "learning_rate": 1.1457294742931508e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 714 }, { "completion_length": 221.5, "epoch": 6.008403361344538, "grad_norm": 1.1278198957443237, "kl": 0.004194601904600859, "learning_rate": 1.1384024124624324e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 715 }, { "completion_length": 60.5, "epoch": 6.016806722689076, "grad_norm": 0.01712752878665924, "kl": 0.003639211179688573, "learning_rate": 1.1310919412686248e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 716 }, { "completion_length": 256.0, "epoch": 6.025210084033613, "grad_norm": 0.8987089395523071, "kl": 0.0019052221905440092, "learning_rate": 1.1237981497875112e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 717 }, { "completion_length": 256.0, "epoch": 6.033613445378151, "grad_norm": 0.9605427980422974, "kl": 0.0033167132642120123, "learning_rate": 1.11652112689164e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 718 }, { "completion_length": 256.0, "epoch": 6.042016806722689, "grad_norm": 0.0068783825263381, "kl": 0.002976843621581793, "learning_rate": 1.109260961249238e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 719 }, { "completion_length": 193.0, "epoch": 6.050420168067227, "grad_norm": 0.008896666578948498, "kl": 0.004134940914809704, "learning_rate": 1.1020177413231334e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 720 }, { "completion_length": 256.0, "epoch": 6.0588235294117645, "grad_norm": 0.005991390440613031, "kl": 0.001516838907264173, "learning_rate": 1.0947915553696742e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 721 }, { "completion_length": 256.0, "epoch": 6.067226890756302, "grad_norm": 0.005408760160207748, "kl": 0.00116773194167763, "learning_rate": 1.0875824914376555e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 722 }, { "completion_length": 256.0, "epoch": 6.07563025210084, "grad_norm": 0.006918430794030428, "kl": 0.002318293321877718, "learning_rate": 1.0803906373672477e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 723 }, { "completion_length": 194.0, "epoch": 6.084033613445378, "grad_norm": 0.013505814597010612, "kl": 0.004727458581328392, "learning_rate": 1.073216080788921e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 724 }, { "completion_length": 155.0, "epoch": 6.092436974789916, "grad_norm": 0.010015221312642097, "kl": 0.003654142376035452, "learning_rate": 1.0660589091223854e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 725 }, { "completion_length": 256.0, "epoch": 6.100840336134453, "grad_norm": 0.7654206156730652, "kl": 0.0014363096561282873, "learning_rate": 1.0589192095755172e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 726 }, { "completion_length": 256.0, "epoch": 6.109243697478991, "grad_norm": 0.009250784292817116, "kl": 0.004437427967786789, "learning_rate": 1.0517970691433035e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 727 }, { "completion_length": 256.0, "epoch": 6.117647058823529, "grad_norm": 0.010434146970510483, "kl": 0.004598037805408239, "learning_rate": 1.0446925746067768e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 728 }, { "completion_length": 114.0, "epoch": 6.126050420168067, "grad_norm": 0.012809588573873043, "kl": 0.002113214461132884, "learning_rate": 1.0376058125319614e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 729 }, { "completion_length": 172.5, "epoch": 6.1344537815126055, "grad_norm": 1.1865767240524292, "kl": 0.005900771357119083, "learning_rate": 1.0305368692688175e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 730 }, { "completion_length": 256.0, "epoch": 6.142857142857143, "grad_norm": 0.007890144363045692, "kl": 0.0031150872819125652, "learning_rate": 1.0234858309501864e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 731 }, { "completion_length": 83.0, "epoch": 6.151260504201681, "grad_norm": 0.015904506668448448, "kl": 0.0027061104774475098, "learning_rate": 1.0164527834907468e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 732 }, { "completion_length": 256.0, "epoch": 6.159663865546219, "grad_norm": 0.9100740551948547, "kl": 0.0029932598117738962, "learning_rate": 1.0094378125859602e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 733 }, { "completion_length": 216.0, "epoch": 6.168067226890757, "grad_norm": 0.012830119580030441, "kl": 0.00440669059753418, "learning_rate": 1.0024410037110358e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 734 }, { "completion_length": 217.0, "epoch": 6.176470588235294, "grad_norm": 0.009386016055941582, "kl": 0.0046503255143761635, "learning_rate": 9.95462442119879e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 735 }, { "completion_length": 256.0, "epoch": 6.184873949579832, "grad_norm": 0.9780017137527466, "kl": 0.005229763686656952, "learning_rate": 9.88502212844063e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 736 }, { "completion_length": 215.5, "epoch": 6.19327731092437, "grad_norm": 0.8275498151779175, "kl": 0.00303781614638865, "learning_rate": 9.815604006917839e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 737 }, { "completion_length": 256.0, "epoch": 6.201680672268908, "grad_norm": 0.8428306579589844, "kl": 0.0013902327045798302, "learning_rate": 9.746370902468311e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 738 }, { "completion_length": 95.0, "epoch": 6.2100840336134455, "grad_norm": 0.01699717715382576, "kl": 0.0033557408023625612, "learning_rate": 9.677323658675594e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 739 }, { "completion_length": 238.0, "epoch": 6.218487394957983, "grad_norm": 0.005434748250991106, "kl": 0.002309538424015045, "learning_rate": 9.608463116858544e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 740 }, { "completion_length": 256.0, "epoch": 6.226890756302521, "grad_norm": 0.007232552859932184, "kl": 0.0015264039393514395, "learning_rate": 9.53979011606115e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 741 }, { "completion_length": 256.0, "epoch": 6.235294117647059, "grad_norm": 0.009089554660022259, "kl": 0.004168184474110603, "learning_rate": 9.471305493042243e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 742 }, { "completion_length": 256.0, "epoch": 6.243697478991597, "grad_norm": 0.8439703583717346, "kl": 0.0026104371063411236, "learning_rate": 9.403010082265351e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 743 }, { "completion_length": 256.0, "epoch": 6.2521008403361344, "grad_norm": 0.005178522784262896, "kl": 0.0012348935706540942, "learning_rate": 9.334904715888496e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 744 }, { "completion_length": 256.0, "epoch": 6.260504201680672, "grad_norm": 0.9164646863937378, "kl": 0.00483954232186079, "learning_rate": 9.266990223754069e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 745 }, { "completion_length": 256.0, "epoch": 6.26890756302521, "grad_norm": 0.005140118766576052, "kl": 0.0017012034077197313, "learning_rate": 9.199267433378728e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 746 }, { "completion_length": 256.0, "epoch": 6.277310924369748, "grad_norm": 0.006662223022431135, "kl": 0.001744074048474431, "learning_rate": 9.131737169943314e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 747 }, { "completion_length": 256.0, "epoch": 6.285714285714286, "grad_norm": 0.8268362879753113, "kl": 0.001963095273822546, "learning_rate": 9.064400256282757e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 748 }, { "completion_length": 256.0, "epoch": 6.294117647058823, "grad_norm": 0.005923266988247633, "kl": 0.0011791144497692585, "learning_rate": 8.99725751287611e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 749 }, { "completion_length": 256.0, "epoch": 6.302521008403361, "grad_norm": 0.007620212621986866, "kl": 0.001782333361916244, "learning_rate": 8.930309757836517e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 750 }, { "completion_length": 45.5, "epoch": 6.310924369747899, "grad_norm": 0.03757859766483307, "kl": 0.007096529006958008, "learning_rate": 8.863557806901233e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 751 }, { "completion_length": 106.5, "epoch": 6.319327731092437, "grad_norm": 0.01423617359250784, "kl": 0.003934054635465145, "learning_rate": 8.797002473421729e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 752 }, { "completion_length": 256.0, "epoch": 6.3277310924369745, "grad_norm": 0.007662131916731596, "kl": 0.0038692676462233067, "learning_rate": 8.73064456835373e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 753 }, { "completion_length": 256.0, "epoch": 6.336134453781512, "grad_norm": 0.006066367495805025, "kl": 0.0018577268347144127, "learning_rate": 8.664484900247363e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 754 }, { "completion_length": 209.0, "epoch": 6.34453781512605, "grad_norm": 0.007878031581640244, "kl": 0.0019280521664768457, "learning_rate": 8.598524275237321e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 755 }, { "completion_length": 256.0, "epoch": 6.352941176470588, "grad_norm": 0.006384390406310558, "kl": 0.0018315539928153157, "learning_rate": 8.532763497032987e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 756 }, { "completion_length": 240.5, "epoch": 6.361344537815126, "grad_norm": 0.00816658977419138, "kl": 0.002657170407474041, "learning_rate": 8.467203366908708e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 757 }, { "completion_length": 216.0, "epoch": 6.369747899159664, "grad_norm": 0.015445982106029987, "kl": 0.005547353066504002, "learning_rate": 8.40184468369396e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 758 }, { "completion_length": 256.0, "epoch": 6.378151260504202, "grad_norm": 0.006266208831220865, "kl": 0.0014345430536195636, "learning_rate": 8.336688243763691e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 759 }, { "completion_length": 256.0, "epoch": 6.38655462184874, "grad_norm": 0.005002750549465418, "kl": 0.0015214410377666354, "learning_rate": 8.271734841028553e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 760 }, { "completion_length": 256.0, "epoch": 6.394957983193278, "grad_norm": 0.0058407532051205635, "kl": 0.0016842943150550127, "learning_rate": 8.206985266925249e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 761 }, { "completion_length": 256.0, "epoch": 6.4033613445378155, "grad_norm": 0.006668625865131617, "kl": 0.001604369841516018, "learning_rate": 8.142440310406923e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 762 }, { "completion_length": 256.0, "epoch": 6.411764705882353, "grad_norm": 0.8945080041885376, "kl": 0.002713439054787159, "learning_rate": 8.078100757933486e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 763 }, { "completion_length": 204.0, "epoch": 6.420168067226891, "grad_norm": 1.0291050672531128, "kl": 0.0041989427991211414, "learning_rate": 8.013967393462094e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 764 }, { "completion_length": 213.0, "epoch": 6.428571428571429, "grad_norm": 0.012050893157720566, "kl": 0.0039482107385993, "learning_rate": 7.950040998437541e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 765 }, { "completion_length": 256.0, "epoch": 6.436974789915967, "grad_norm": 0.008341366425156593, "kl": 0.003175565507262945, "learning_rate": 7.886322351782782e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 766 }, { "completion_length": 71.5, "epoch": 6.445378151260504, "grad_norm": 0.012267528101801872, "kl": 0.0022559938952326775, "learning_rate": 7.822812229889429e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 767 }, { "completion_length": 256.0, "epoch": 6.453781512605042, "grad_norm": 0.006133852526545525, "kl": 0.002749292878434062, "learning_rate": 7.759511406608255e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 768 }, { "completion_length": 256.0, "epoch": 6.46218487394958, "grad_norm": 0.004883850924670696, "kl": 0.0014348834520205855, "learning_rate": 7.696420653239834e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 769 }, { "completion_length": 256.0, "epoch": 6.470588235294118, "grad_norm": 0.00618027476593852, "kl": 0.0017550690099596977, "learning_rate": 7.633540738525066e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 770 }, { "completion_length": 178.0, "epoch": 6.4789915966386555, "grad_norm": 0.009770109318196774, "kl": 0.0030032857321202755, "learning_rate": 7.57087242863589e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 771 }, { "completion_length": 124.5, "epoch": 6.487394957983193, "grad_norm": 0.011353318579494953, "kl": 0.0020142176654189825, "learning_rate": 7.508416487165862e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 772 }, { "completion_length": 256.0, "epoch": 6.495798319327731, "grad_norm": 0.003665080526843667, "kl": 0.000846462557092309, "learning_rate": 7.44617367512094e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 773 }, { "completion_length": 249.0, "epoch": 6.504201680672269, "grad_norm": 0.006633428856730461, "kl": 0.0027980273589491844, "learning_rate": 7.384144750910133e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 774 }, { "completion_length": 187.5, "epoch": 6.512605042016807, "grad_norm": 0.009850937873125076, "kl": 0.004101440776139498, "learning_rate": 7.322330470336314e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 775 }, { "completion_length": 204.0, "epoch": 6.5210084033613445, "grad_norm": 0.005315656773746014, "kl": 0.0019136992050334811, "learning_rate": 7.260731586586983e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 776 }, { "completion_length": 256.0, "epoch": 6.529411764705882, "grad_norm": 0.005549764260649681, "kl": 0.0016751977382227778, "learning_rate": 7.199348850225091e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 777 }, { "completion_length": 185.0, "epoch": 6.53781512605042, "grad_norm": 1.4883253574371338, "kl": 0.007286296226084232, "learning_rate": 7.138183009179922e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 778 }, { "completion_length": 226.5, "epoch": 6.546218487394958, "grad_norm": 1.000180959701538, "kl": 0.0024688635021448135, "learning_rate": 7.077234808737932e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 779 }, { "completion_length": 256.0, "epoch": 6.554621848739496, "grad_norm": 0.8772112727165222, "kl": 0.0036038514226675034, "learning_rate": 7.016504991533727e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 780 }, { "completion_length": 147.5, "epoch": 6.563025210084033, "grad_norm": 0.01524580828845501, "kl": 0.005266121588647366, "learning_rate": 6.955994297540947e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 781 }, { "completion_length": 256.0, "epoch": 6.571428571428571, "grad_norm": 0.005076350644230843, "kl": 0.0017252396792173386, "learning_rate": 6.895703464063319e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 782 }, { "completion_length": 256.0, "epoch": 6.579831932773109, "grad_norm": 0.006800441071391106, "kl": 0.001671045320108533, "learning_rate": 6.835633225725604e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 783 }, { "completion_length": 256.0, "epoch": 6.588235294117647, "grad_norm": 0.005922805052250624, "kl": 0.001341227674856782, "learning_rate": 6.775784314464717e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 784 }, { "completion_length": 256.0, "epoch": 6.5966386554621845, "grad_norm": 0.007290327455848455, "kl": 0.0019139719661325216, "learning_rate": 6.716157459520739e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 785 }, { "completion_length": 256.0, "epoch": 6.605042016806722, "grad_norm": 1.1342507600784302, "kl": 0.0042749736458063126, "learning_rate": 6.656753387428089e-07, "loss": 0.0002, "reward": -0.2939999997615814, "reward_std": 0.5925554633140564, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2939999997615814, "step": 786 }, { "completion_length": 158.0, "epoch": 6.61344537815126, "grad_norm": 0.015359540469944477, "kl": 0.004113453906029463, "learning_rate": 6.597572822006643e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 787 }, { "completion_length": 256.0, "epoch": 6.621848739495798, "grad_norm": 0.8589726686477661, "kl": 0.003585189115256071, "learning_rate": 6.538616484352902e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 788 }, { "completion_length": 140.0, "epoch": 6.630252100840336, "grad_norm": 0.016509469598531723, "kl": 0.003742868546396494, "learning_rate": 6.479885092831251e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 789 }, { "completion_length": 256.0, "epoch": 6.6386554621848735, "grad_norm": 0.009591222740709782, "kl": 0.004702563397586346, "learning_rate": 6.421379363065142e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 790 }, { "completion_length": 256.0, "epoch": 6.647058823529412, "grad_norm": 0.008564326912164688, "kl": 0.0022036409936845303, "learning_rate": 6.363100007928447e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 791 }, { "completion_length": 256.0, "epoch": 6.65546218487395, "grad_norm": 0.009090062230825424, "kl": 0.0037582223303616047, "learning_rate": 6.305047737536707e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 792 }, { "completion_length": 173.0, "epoch": 6.663865546218488, "grad_norm": 1.1992688179016113, "kl": 0.002320108935236931, "learning_rate": 6.247223259238511e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 793 }, { "completion_length": 166.5, "epoch": 6.6722689075630255, "grad_norm": 0.007426231633871794, "kl": 0.001896584639325738, "learning_rate": 6.189627277606894e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 794 }, { "completion_length": 256.0, "epoch": 6.680672268907563, "grad_norm": 0.00796377845108509, "kl": 0.002106861677020788, "learning_rate": 6.1322604944307e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 795 }, { "completion_length": 256.0, "epoch": 6.689075630252101, "grad_norm": 0.007140349131077528, "kl": 0.0033823708072304726, "learning_rate": 6.075123608706093e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 796 }, { "completion_length": 188.0, "epoch": 6.697478991596639, "grad_norm": 1.3269833326339722, "kl": 0.00577466981485486, "learning_rate": 6.01821731662798e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 797 }, { "completion_length": 256.0, "epoch": 6.705882352941177, "grad_norm": 0.006082034669816494, "kl": 0.0015925714978948236, "learning_rate": 5.961542311581586e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 798 }, { "completion_length": 235.0, "epoch": 6.714285714285714, "grad_norm": 1.1020289659500122, "kl": 0.004151785746216774, "learning_rate": 5.905099284133953e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 799 }, { "completion_length": 201.0, "epoch": 6.722689075630252, "grad_norm": 1.3012391328811646, "kl": 0.004908351227641106, "learning_rate": 5.848888922025553e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 800 }, { "completion_length": 256.0, "epoch": 6.73109243697479, "grad_norm": 0.007310961838811636, "kl": 0.001577533781528473, "learning_rate": 5.792911910161922e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 801 }, { "completion_length": 256.0, "epoch": 6.739495798319328, "grad_norm": 0.005833889357745647, "kl": 0.0013196519576013088, "learning_rate": 5.737168930605272e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 802 }, { "completion_length": 227.5, "epoch": 6.7478991596638656, "grad_norm": 0.006547561846673489, "kl": 0.002823244547471404, "learning_rate": 5.681660662566225e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 803 }, { "completion_length": 215.0, "epoch": 6.756302521008403, "grad_norm": 0.940949022769928, "kl": 0.0028834554832428694, "learning_rate": 5.626387782395512e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 804 }, { "completion_length": 208.5, "epoch": 6.764705882352941, "grad_norm": 0.007036335300654173, "kl": 0.0023760178592056036, "learning_rate": 5.571350963575728e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 805 }, { "completion_length": 256.0, "epoch": 6.773109243697479, "grad_norm": 0.011280566453933716, "kl": 0.004278623033314943, "learning_rate": 5.516550876713142e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 806 }, { "completion_length": 200.5, "epoch": 6.781512605042017, "grad_norm": 0.01363059040158987, "kl": 0.005083064083009958, "learning_rate": 5.461988189529529e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 807 }, { "completion_length": 256.0, "epoch": 6.7899159663865545, "grad_norm": 0.007506730034947395, "kl": 0.0018291235901415348, "learning_rate": 5.407663566854008e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 808 }, { "completion_length": 256.0, "epoch": 6.798319327731092, "grad_norm": 0.006049768067896366, "kl": 0.001213733688928187, "learning_rate": 5.353577670614951e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 809 }, { "completion_length": 256.0, "epoch": 6.80672268907563, "grad_norm": 0.00714668445289135, "kl": 0.003204375971108675, "learning_rate": 5.299731159831953e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 810 }, { "completion_length": 104.5, "epoch": 6.815126050420168, "grad_norm": 0.009213129989802837, "kl": 0.0033985283225774765, "learning_rate": 5.24612469060774e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 811 }, { "completion_length": 222.5, "epoch": 6.823529411764706, "grad_norm": 0.011363384313881397, "kl": 0.0020898596849292517, "learning_rate": 5.192758916120236e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 812 }, { "completion_length": 163.5, "epoch": 6.831932773109243, "grad_norm": 1.4704203605651855, "kl": 0.004970216657966375, "learning_rate": 5.139634486614544e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 813 }, { "completion_length": 256.0, "epoch": 6.840336134453781, "grad_norm": 0.00781720969825983, "kl": 0.002430472057312727, "learning_rate": 5.086752049395094e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 814 }, { "completion_length": 256.0, "epoch": 6.848739495798319, "grad_norm": 0.005107533652335405, "kl": 0.001449321280233562, "learning_rate": 5.034112248817685e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 815 }, { "completion_length": 256.0, "epoch": 6.857142857142857, "grad_norm": 0.007801196537911892, "kl": 0.0032830918207764626, "learning_rate": 4.981715726281666e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 816 }, { "completion_length": 228.0, "epoch": 6.865546218487395, "grad_norm": 0.009398819878697395, "kl": 0.0036698374897241592, "learning_rate": 4.929563120222142e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 817 }, { "completion_length": 256.0, "epoch": 6.873949579831933, "grad_norm": 0.005455057602375746, "kl": 0.0015282605309039354, "learning_rate": 4.87765506610215e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 818 }, { "completion_length": 256.0, "epoch": 6.882352941176471, "grad_norm": 0.009603669866919518, "kl": 0.0042014578357338905, "learning_rate": 4.825992196404958e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 819 }, { "completion_length": 256.0, "epoch": 6.890756302521009, "grad_norm": 0.007294767070561647, "kl": 0.0016039509791880846, "learning_rate": 4.774575140626317e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 820 }, { "completion_length": 256.0, "epoch": 6.899159663865547, "grad_norm": 0.004842757247388363, "kl": 0.001981245819479227, "learning_rate": 4.7234045252668393e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 821 }, { "completion_length": 256.0, "epoch": 6.907563025210084, "grad_norm": 0.004924348555505276, "kl": 0.001481986022554338, "learning_rate": 4.672480973824312e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 822 }, { "completion_length": 187.5, "epoch": 6.915966386554622, "grad_norm": 1.2070057392120361, "kl": 0.006619948893785477, "learning_rate": 4.6218051067861423e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 823 }, { "completion_length": 148.5, "epoch": 6.92436974789916, "grad_norm": 0.008655420504510403, "kl": 0.002734950976446271, "learning_rate": 4.5713775416217884e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 824 }, { "completion_length": 154.5, "epoch": 6.932773109243698, "grad_norm": 0.01674504019320011, "kl": 0.007207968272268772, "learning_rate": 4.5211988927752026e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 825 }, { "completion_length": 256.0, "epoch": 6.9411764705882355, "grad_norm": 0.005762606859207153, "kl": 0.0013286244357004762, "learning_rate": 4.4712697716573994e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 826 }, { "completion_length": 256.0, "epoch": 6.949579831932773, "grad_norm": 0.00646938243880868, "kl": 0.0013432218693196774, "learning_rate": 4.421590786638952e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 827 }, { "completion_length": 234.0, "epoch": 6.957983193277311, "grad_norm": 0.9113871455192566, "kl": 0.003751195967197418, "learning_rate": 4.372162543042624e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 828 }, { "completion_length": 256.0, "epoch": 6.966386554621849, "grad_norm": 0.00773683050647378, "kl": 0.0019388979999348521, "learning_rate": 4.3229856431359516e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 829 }, { "completion_length": 256.0, "epoch": 6.974789915966387, "grad_norm": 0.005338478367775679, "kl": 0.001396447536535561, "learning_rate": 4.27406068612396e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 830 }, { "completion_length": 256.0, "epoch": 6.983193277310924, "grad_norm": 0.0065873656421899796, "kl": 0.0021328995935618877, "learning_rate": 4.225388268141797e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 831 }, { "completion_length": 72.0, "epoch": 6.991596638655462, "grad_norm": 0.009879650548100471, "kl": 0.002757261972874403, "learning_rate": 4.1769689822475147e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 832 }, { "completion_length": 136.0, "epoch": 7.0, "grad_norm": 0.9775848984718323, "kl": 0.00801941193640232, "learning_rate": 4.12880341841484e-07, "loss": 0.0003, "reward": -0.18900001049041748, "reward_std": 0.6208397746086121, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18900001049041748, "step": 833 }, { "completion_length": 242.5, "epoch": 7.008403361344538, "grad_norm": 0.006945634260773659, "kl": 0.0021713541354984045, "learning_rate": 4.0808921635259595e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 834 }, { "completion_length": 256.0, "epoch": 7.016806722689076, "grad_norm": 0.0071526821702718735, "kl": 0.0014991157222539186, "learning_rate": 4.033235801364402e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 835 }, { "completion_length": 226.5, "epoch": 7.025210084033613, "grad_norm": 1.1307116746902466, "kl": 0.003096789587289095, "learning_rate": 3.9858349126078945e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 836 }, { "completion_length": 208.5, "epoch": 7.033613445378151, "grad_norm": 0.007204082328826189, "kl": 0.002395703922957182, "learning_rate": 3.938690074821314e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 837 }, { "completion_length": 256.0, "epoch": 7.042016806722689, "grad_norm": 0.005168038886040449, "kl": 0.001182298525236547, "learning_rate": 3.891801862449629e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 838 }, { "completion_length": 256.0, "epoch": 7.050420168067227, "grad_norm": 0.009011463262140751, "kl": 0.004783933982253075, "learning_rate": 3.8451708468109026e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 839 }, { "completion_length": 198.0, "epoch": 7.0588235294117645, "grad_norm": 1.0867758989334106, "kl": 0.002387113869190216, "learning_rate": 3.798797596089351e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 840 }, { "completion_length": 256.0, "epoch": 7.067226890756302, "grad_norm": 0.008707946166396141, "kl": 0.004143321420997381, "learning_rate": 3.7526826753284065e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 841 }, { "completion_length": 256.0, "epoch": 7.07563025210084, "grad_norm": 0.005323693156242371, "kl": 0.0012886356562376022, "learning_rate": 3.7068266464238085e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 842 }, { "completion_length": 256.0, "epoch": 7.084033613445378, "grad_norm": 0.0058458768762648106, "kl": 0.0014633372193202376, "learning_rate": 3.661230068116811e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 843 }, { "completion_length": 208.5, "epoch": 7.092436974789916, "grad_norm": 0.011365764774382114, "kl": 0.004584240727126598, "learning_rate": 3.615893495987335e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 844 }, { "completion_length": 256.0, "epoch": 7.100840336134453, "grad_norm": 0.6882547736167908, "kl": 0.002714487724006176, "learning_rate": 3.5708174824471947e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 845 }, { "completion_length": 162.0, "epoch": 7.109243697478991, "grad_norm": 0.015195288695394993, "kl": 0.005958933383226395, "learning_rate": 3.5260025767333894e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 846 }, { "completion_length": 256.0, "epoch": 7.117647058823529, "grad_norm": 0.004124050959944725, "kl": 0.0011034641647711396, "learning_rate": 3.481449324901412e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 847 }, { "completion_length": 216.5, "epoch": 7.126050420168067, "grad_norm": 0.011551975272595882, "kl": 0.0035149413160979748, "learning_rate": 3.4371582698185636e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 848 }, { "completion_length": 112.5, "epoch": 7.1344537815126055, "grad_norm": 0.014387990348041058, "kl": 0.005169201642274857, "learning_rate": 3.393129951157384e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 849 }, { "completion_length": 160.0, "epoch": 7.142857142857143, "grad_norm": 0.01080798078328371, "kl": 0.003539423691108823, "learning_rate": 3.3493649053890325e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 850 }, { "completion_length": 256.0, "epoch": 7.151260504201681, "grad_norm": 0.008672610856592655, "kl": 0.0016550447326153517, "learning_rate": 3.3058636657767927e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 851 }, { "completion_length": 73.0, "epoch": 7.159663865546219, "grad_norm": 1.9830690622329712, "kl": 0.00827146228402853, "learning_rate": 3.262626762369525e-07, "loss": 0.0003, "reward": 0.21149998903274536, "reward_std": 0.0544472262263298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.21149998903274536, "step": 852 }, { "completion_length": 256.0, "epoch": 7.168067226890757, "grad_norm": 0.8415823578834534, "kl": 0.005483152344822884, "learning_rate": 3.219654721995266e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 853 }, { "completion_length": 256.0, "epoch": 7.176470588235294, "grad_norm": 0.012542271055281162, "kl": 0.0029310660902410746, "learning_rate": 3.176948068254762e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 854 }, { "completion_length": 73.5, "epoch": 7.184873949579832, "grad_norm": 0.01396624930202961, "kl": 0.0034931343980133533, "learning_rate": 3.134507321515107e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 855 }, { "completion_length": 209.0, "epoch": 7.19327731092437, "grad_norm": 1.236088752746582, "kl": 0.002833005739375949, "learning_rate": 3.092332998903416e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 856 }, { "completion_length": 256.0, "epoch": 7.201680672268908, "grad_norm": 0.006115151569247246, "kl": 0.00146389938890934, "learning_rate": 3.050425614300487e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 857 }, { "completion_length": 256.0, "epoch": 7.2100840336134455, "grad_norm": 0.0075344243086874485, "kl": 0.001952876802533865, "learning_rate": 3.0087856783345916e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 858 }, { "completion_length": 256.0, "epoch": 7.218487394957983, "grad_norm": 0.006570841185748577, "kl": 0.001895903260447085, "learning_rate": 2.967413698375196e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 859 }, { "completion_length": 256.0, "epoch": 7.226890756302521, "grad_norm": 0.9664897322654724, "kl": 0.0015864297747612, "learning_rate": 2.9263101785268253e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 860 }, { "completion_length": 239.5, "epoch": 7.235294117647059, "grad_norm": 0.012150914408266544, "kl": 0.004403703846037388, "learning_rate": 2.8854756196229017e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 861 }, { "completion_length": 251.5, "epoch": 7.243697478991597, "grad_norm": 0.008573394268751144, "kl": 0.0030458783730864525, "learning_rate": 2.844910519219632e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 862 }, { "completion_length": 256.0, "epoch": 7.2521008403361344, "grad_norm": 0.009591581299901009, "kl": 0.0031162111554294825, "learning_rate": 2.8046153715899695e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 863 }, { "completion_length": 256.0, "epoch": 7.260504201680672, "grad_norm": 0.007741772104054689, "kl": 0.00208290945738554, "learning_rate": 2.764590667717562e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 864 }, { "completion_length": 256.0, "epoch": 7.26890756302521, "grad_norm": 0.006048483308404684, "kl": 0.0018178317695856094, "learning_rate": 2.7248368952908055e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 865 }, { "completion_length": 256.0, "epoch": 7.277310924369748, "grad_norm": 0.8954395651817322, "kl": 0.004095052368938923, "learning_rate": 2.6853545386968607e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 866 }, { "completion_length": 256.0, "epoch": 7.285714285714286, "grad_norm": 0.009550808928906918, "kl": 0.004435448907315731, "learning_rate": 2.6461440790157974e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 867 }, { "completion_length": 256.0, "epoch": 7.294117647058823, "grad_norm": 0.012647325173020363, "kl": 0.005678193643689156, "learning_rate": 2.6072059940146775e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 868 }, { "completion_length": 237.5, "epoch": 7.302521008403361, "grad_norm": 0.004283882211893797, "kl": 0.0014394832542166114, "learning_rate": 2.568540758141791e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 869 }, { "completion_length": 256.0, "epoch": 7.310924369747899, "grad_norm": 0.005182648077607155, "kl": 0.002508516889065504, "learning_rate": 2.53014884252083e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 870 }, { "completion_length": 256.0, "epoch": 7.319327731092437, "grad_norm": 0.9002636671066284, "kl": 0.0037400806322693825, "learning_rate": 2.492030714945162e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 871 }, { "completion_length": 142.5, "epoch": 7.3277310924369745, "grad_norm": 0.01334297563880682, "kl": 0.003932413179427385, "learning_rate": 2.454186839872158e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 872 }, { "completion_length": 256.0, "epoch": 7.336134453781512, "grad_norm": 0.006335263140499592, "kl": 0.0017655100673437119, "learning_rate": 2.4166176784174795e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 873 }, { "completion_length": 214.5, "epoch": 7.34453781512605, "grad_norm": 0.008558849804103374, "kl": 0.0036917163524776697, "learning_rate": 2.3793236883495164e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 874 }, { "completion_length": 124.0, "epoch": 7.352941176470588, "grad_norm": 0.01766756735742092, "kl": 0.0038475499022752047, "learning_rate": 2.3423053240837518e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 875 }, { "completion_length": 256.0, "epoch": 7.361344537815126, "grad_norm": 0.00576981995254755, "kl": 0.0014689944218844175, "learning_rate": 2.3055630366772857e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 876 }, { "completion_length": 256.0, "epoch": 7.369747899159664, "grad_norm": 0.0044573531486094, "kl": 0.0011580396676436067, "learning_rate": 2.269097273823287e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 877 }, { "completion_length": 94.0, "epoch": 7.378151260504202, "grad_norm": 0.02928989566862583, "kl": 0.0060923462733626366, "learning_rate": 2.2329084798455747e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 878 }, { "completion_length": 220.5, "epoch": 7.38655462184874, "grad_norm": 0.006734924390912056, "kl": 0.002124710939824581, "learning_rate": 2.1969970956931762e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 879 }, { "completion_length": 256.0, "epoch": 7.394957983193278, "grad_norm": 1.0677118301391602, "kl": 0.0033596553839743137, "learning_rate": 2.1613635589349756e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 880 }, { "completion_length": 256.0, "epoch": 7.4033613445378155, "grad_norm": 0.8797102570533752, "kl": 0.001824253355152905, "learning_rate": 2.1260083037543817e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 881 }, { "completion_length": 194.5, "epoch": 7.411764705882353, "grad_norm": 0.009378692135214806, "kl": 0.0040238699875772, "learning_rate": 2.0909317609440093e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 882 }, { "completion_length": 222.0, "epoch": 7.420168067226891, "grad_norm": 0.006435913033783436, "kl": 0.0017432831227779388, "learning_rate": 2.0561343579004716e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 883 }, { "completion_length": 256.0, "epoch": 7.428571428571429, "grad_norm": 0.006500550080090761, "kl": 0.002906560432165861, "learning_rate": 2.0216165186191406e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 884 }, { "completion_length": 240.5, "epoch": 7.436974789915967, "grad_norm": 0.00812520831823349, "kl": 0.004356591962277889, "learning_rate": 1.9873786636889908e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 885 }, { "completion_length": 256.0, "epoch": 7.445378151260504, "grad_norm": 0.006396442651748657, "kl": 0.0020432290621101856, "learning_rate": 1.95342121028749e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 886 }, { "completion_length": 256.0, "epoch": 7.453781512605042, "grad_norm": 0.00533335143700242, "kl": 0.0021820450201630592, "learning_rate": 1.9197445721754777e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 887 }, { "completion_length": 200.5, "epoch": 7.46218487394958, "grad_norm": 0.01010017842054367, "kl": 0.002626066328957677, "learning_rate": 1.8863491596921745e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 888 }, { "completion_length": 256.0, "epoch": 7.470588235294118, "grad_norm": 0.009015013463795185, "kl": 0.0035524903796613216, "learning_rate": 1.8532353797501318e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 889 }, { "completion_length": 256.0, "epoch": 7.4789915966386555, "grad_norm": 0.006481496151536703, "kl": 0.0015803215792402625, "learning_rate": 1.8204036358303173e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 890 }, { "completion_length": 256.0, "epoch": 7.487394957983193, "grad_norm": 0.008498352952301502, "kl": 0.0038802213966846466, "learning_rate": 1.787854327977162e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 891 }, { "completion_length": 215.5, "epoch": 7.495798319327731, "grad_norm": 0.008856778964400291, "kl": 0.003848181338980794, "learning_rate": 1.7555878527937164e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 892 }, { "completion_length": 256.0, "epoch": 7.504201680672269, "grad_norm": 0.006828887388110161, "kl": 0.0031058997847139835, "learning_rate": 1.7236046034367959e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 893 }, { "completion_length": 232.0, "epoch": 7.512605042016807, "grad_norm": 0.9078944325447083, "kl": 0.004007690120488405, "learning_rate": 1.6919049696121957e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 894 }, { "completion_length": 256.0, "epoch": 7.5210084033613445, "grad_norm": 0.7900701761245728, "kl": 0.0034677290823310614, "learning_rate": 1.6604893375699594e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 895 }, { "completion_length": 106.0, "epoch": 7.529411764705882, "grad_norm": 0.013893961906433105, "kl": 0.004573720507323742, "learning_rate": 1.629358090099639e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 896 }, { "completion_length": 256.0, "epoch": 7.53781512605042, "grad_norm": 0.8935194611549377, "kl": 0.002059739548712969, "learning_rate": 1.5985116065256683e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 897 }, { "completion_length": 215.5, "epoch": 7.546218487394958, "grad_norm": 1.0523637533187866, "kl": 0.0023355232551693916, "learning_rate": 1.567950262702714e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 898 }, { "completion_length": 256.0, "epoch": 7.554621848739496, "grad_norm": 0.005016495008021593, "kl": 0.0010381457395851612, "learning_rate": 1.5376744310111019e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 899 }, { "completion_length": 106.5, "epoch": 7.563025210084033, "grad_norm": 0.008760005235671997, "kl": 0.001980091445147991, "learning_rate": 1.507684480352292e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 900 }, { "completion_length": 256.0, "epoch": 7.571428571428571, "grad_norm": 0.005708491429686546, "kl": 0.0012594442814588547, "learning_rate": 1.4779807761443638e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 901 }, { "completion_length": 81.5, "epoch": 7.579831932773109, "grad_norm": 0.010364564135670662, "kl": 0.0034439684823155403, "learning_rate": 1.4485636803175828e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 902 }, { "completion_length": 221.5, "epoch": 7.588235294117647, "grad_norm": 0.8227521777153015, "kl": 0.003628258593380451, "learning_rate": 1.419433551309976e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 903 }, { "completion_length": 213.0, "epoch": 7.5966386554621845, "grad_norm": 1.155175805091858, "kl": 0.0038738653529435396, "learning_rate": 1.3905907440629752e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 904 }, { "completion_length": 210.5, "epoch": 7.605042016806722, "grad_norm": 0.8970864415168762, "kl": 0.00421702116727829, "learning_rate": 1.362035610017079e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 905 }, { "completion_length": 256.0, "epoch": 7.61344537815126, "grad_norm": 0.004933992400765419, "kl": 0.0014493797207251191, "learning_rate": 1.3337684971075932e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 906 }, { "completion_length": 219.5, "epoch": 7.621848739495798, "grad_norm": 0.9020504951477051, "kl": 0.0057899439707398415, "learning_rate": 1.305789749760361e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 907 }, { "completion_length": 256.0, "epoch": 7.630252100840336, "grad_norm": 0.9547077417373657, "kl": 0.005229324102401733, "learning_rate": 1.278099708887587e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 908 }, { "completion_length": 256.0, "epoch": 7.6386554621848735, "grad_norm": 0.007069614715874195, "kl": 0.0016959388740360737, "learning_rate": 1.2506987118836912e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 909 }, { "completion_length": 61.5, "epoch": 7.647058823529412, "grad_norm": 0.0207932461053133, "kl": 0.0025781523436307907, "learning_rate": 1.223587092621162e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 910 }, { "completion_length": 255.0, "epoch": 7.65546218487395, "grad_norm": 0.005624694284051657, "kl": 0.002034971257671714, "learning_rate": 1.1967651814465353e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 911 }, { "completion_length": 256.0, "epoch": 7.663865546218488, "grad_norm": 0.8655515909194946, "kl": 0.003953531384468079, "learning_rate": 1.1702333051763271e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 912 }, { "completion_length": 256.0, "epoch": 7.6722689075630255, "grad_norm": 0.008253159001469612, "kl": 0.0019437936134636402, "learning_rate": 1.1439917870930795e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 913 }, { "completion_length": 256.0, "epoch": 7.680672268907563, "grad_norm": 0.003957549575716257, "kl": 0.001197000965476036, "learning_rate": 1.1180409469414094e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 914 }, { "completion_length": 256.0, "epoch": 7.689075630252101, "grad_norm": 0.005414122249931097, "kl": 0.0012987841619178653, "learning_rate": 1.0923811009241142e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 915 }, { "completion_length": 75.5, "epoch": 7.697478991596639, "grad_norm": 0.013497910462319851, "kl": 0.005083886440843344, "learning_rate": 1.067012561698319e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 916 }, { "completion_length": 256.0, "epoch": 7.705882352941177, "grad_norm": 0.8506262302398682, "kl": 0.00306339911185205, "learning_rate": 1.041935638371669e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 917 }, { "completion_length": 223.0, "epoch": 7.714285714285714, "grad_norm": 0.8511884808540344, "kl": 0.003050882602110505, "learning_rate": 1.0171506364985622e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 918 }, { "completion_length": 256.0, "epoch": 7.722689075630252, "grad_norm": 0.8621631264686584, "kl": 0.0033832965418696404, "learning_rate": 9.926578580764234e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 919 }, { "completion_length": 231.0, "epoch": 7.73109243697479, "grad_norm": 0.014862588606774807, "kl": 0.006442396901547909, "learning_rate": 9.684576015420277e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 920 }, { "completion_length": 224.0, "epoch": 7.739495798319328, "grad_norm": 1.0450727939605713, "kl": 0.0034931160043925047, "learning_rate": 9.445501617678654e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 921 }, { "completion_length": 156.5, "epoch": 7.7478991596638656, "grad_norm": 0.01929403282701969, "kl": 0.0040522669441998005, "learning_rate": 9.209358300585474e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 922 }, { "completion_length": 256.0, "epoch": 7.756302521008403, "grad_norm": 0.007035123184323311, "kl": 0.0035215544048696756, "learning_rate": 8.9761489414725e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 923 }, { "completion_length": 191.0, "epoch": 7.764705882352941, "grad_norm": 0.015435402281582355, "kl": 0.005384809337556362, "learning_rate": 8.745876381922147e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 924 }, { "completion_length": 256.0, "epoch": 7.773109243697479, "grad_norm": 0.005274576134979725, "kl": 0.002153881127014756, "learning_rate": 8.518543427732951e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 925 }, { "completion_length": 204.5, "epoch": 7.781512605042017, "grad_norm": 0.8903023600578308, "kl": 0.0051206136122345924, "learning_rate": 8.294152848885156e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 926 }, { "completion_length": 256.0, "epoch": 7.7899159663865545, "grad_norm": 0.878285825252533, "kl": 0.0027358955703675747, "learning_rate": 8.072707379507217e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 927 }, { "completion_length": 256.0, "epoch": 7.798319327731092, "grad_norm": 0.015547667630016804, "kl": 0.003042032476514578, "learning_rate": 7.854209717842231e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 928 }, { "completion_length": 129.0, "epoch": 7.80672268907563, "grad_norm": 0.011668304912745953, "kl": 0.002862770576030016, "learning_rate": 7.638662526215284e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 929 }, { "completion_length": 256.0, "epoch": 7.815126050420168, "grad_norm": 0.008492356166243553, "kl": 0.0019664831925183535, "learning_rate": 7.426068431000883e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 930 }, { "completion_length": 219.5, "epoch": 7.823529411764706, "grad_norm": 0.007373413071036339, "kl": 0.0025103178340941668, "learning_rate": 7.216430022591009e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 931 }, { "completion_length": 256.0, "epoch": 7.831932773109243, "grad_norm": 0.010425623506307602, "kl": 0.0035429110284894705, "learning_rate": 7.009749855363457e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 932 }, { "completion_length": 256.0, "epoch": 7.840336134453781, "grad_norm": 0.005711731966584921, "kl": 0.0013695968082174659, "learning_rate": 6.806030447650879e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 933 }, { "completion_length": 176.5, "epoch": 7.848739495798319, "grad_norm": 0.01851927861571312, "kl": 0.006873239763081074, "learning_rate": 6.605274281709929e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 934 }, { "completion_length": 256.0, "epoch": 7.857142857142857, "grad_norm": 0.0054503027349710464, "kl": 0.0015665149549022317, "learning_rate": 6.407483803691216e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 935 }, { "completion_length": 161.5, "epoch": 7.865546218487395, "grad_norm": 0.012459240853786469, "kl": 0.005014646332710981, "learning_rate": 6.212661423609184e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 936 }, { "completion_length": 256.0, "epoch": 7.873949579831933, "grad_norm": 0.016420535743236542, "kl": 0.00556594505906105, "learning_rate": 6.020809515313141e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 937 }, { "completion_length": 256.0, "epoch": 7.882352941176471, "grad_norm": 0.9018909335136414, "kl": 0.0017713907873257995, "learning_rate": 5.83193041645802e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 938 }, { "completion_length": 256.0, "epoch": 7.890756302521009, "grad_norm": 0.006852818187326193, "kl": 0.001372675527818501, "learning_rate": 5.6460264284760316e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 939 }, { "completion_length": 256.0, "epoch": 7.899159663865547, "grad_norm": 0.8550215363502502, "kl": 0.0019658529199659824, "learning_rate": 5.463099816548578e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 940 }, { "completion_length": 256.0, "epoch": 7.907563025210084, "grad_norm": 0.6682001948356628, "kl": 0.0015068496577441692, "learning_rate": 5.283152809578751e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 941 }, { "completion_length": 240.5, "epoch": 7.915966386554622, "grad_norm": 0.8020376563072205, "kl": 0.0010811786632984877, "learning_rate": 5.106187600163987e-08, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 942 }, { "completion_length": 256.0, "epoch": 7.92436974789916, "grad_norm": 0.006142708472907543, "kl": 0.0014996298123151064, "learning_rate": 4.932206344569562e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 943 }, { "completion_length": 66.5, "epoch": 7.932773109243698, "grad_norm": 0.009661697782576084, "kl": 0.0012731223832815886, "learning_rate": 4.761211162702117e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 944 }, { "completion_length": 180.0, "epoch": 7.9411764705882355, "grad_norm": 0.007925852201879025, "kl": 0.0019000464817509055, "learning_rate": 4.593204138084006e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 945 }, { "completion_length": 256.0, "epoch": 7.949579831932773, "grad_norm": 0.010862650349736214, "kl": 0.005121957045048475, "learning_rate": 4.428187317827848e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 946 }, { "completion_length": 256.0, "epoch": 7.957983193277311, "grad_norm": 0.014693349599838257, "kl": 0.0025509949773550034, "learning_rate": 4.26616271261146e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 947 }, { "completion_length": 256.0, "epoch": 7.966386554621849, "grad_norm": 0.006174801383167505, "kl": 0.0018048429628834128, "learning_rate": 4.1071322966535487e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 948 }, { "completion_length": 256.0, "epoch": 7.974789915966387, "grad_norm": 0.7854158878326416, "kl": 0.0038359705358743668, "learning_rate": 3.95109800768953e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 949 }, { "completion_length": 256.0, "epoch": 7.983193277310924, "grad_norm": 0.0063298712484538555, "kl": 0.0014807393308728933, "learning_rate": 3.798061746947995e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 950 }, { "completion_length": 256.0, "epoch": 7.991596638655462, "grad_norm": 0.014239716343581676, "kl": 0.0040963757783174515, "learning_rate": 3.648025379127479e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 951 }, { "completion_length": 215.5, "epoch": 8.0, "grad_norm": 1.0584113597869873, "kl": 0.003608936909586191, "learning_rate": 3.5009907323737826e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 952 }, { "completion_length": 105.0, "epoch": 8.008403361344538, "grad_norm": 0.007620012387633324, "kl": 0.0024689948186278343, "learning_rate": 3.3569595982576584e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 953 }, { "completion_length": 256.0, "epoch": 8.016806722689076, "grad_norm": 0.010016584768891335, "kl": 0.0037446990609169006, "learning_rate": 3.2159337317530234e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 954 }, { "completion_length": 176.5, "epoch": 8.025210084033613, "grad_norm": 0.008861626498401165, "kl": 0.0036016590893268585, "learning_rate": 3.077914851215585e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 955 }, { "completion_length": 195.0, "epoch": 8.033613445378151, "grad_norm": 0.006272186059504747, "kl": 0.002680514007806778, "learning_rate": 2.9429046383618042e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 956 }, { "completion_length": 147.5, "epoch": 8.042016806722689, "grad_norm": 0.03004760853946209, "kl": 0.005793701857328415, "learning_rate": 2.810904738248549e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 957 }, { "completion_length": 256.0, "epoch": 8.050420168067227, "grad_norm": 0.0052776276133954525, "kl": 0.0011807953706011176, "learning_rate": 2.681916759252917e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 958 }, { "completion_length": 256.0, "epoch": 8.058823529411764, "grad_norm": 0.8772838711738586, "kl": 0.004750865511596203, "learning_rate": 2.555942273052753e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 959 }, { "completion_length": 256.0, "epoch": 8.067226890756302, "grad_norm": 0.9010158181190491, "kl": 0.002602271270006895, "learning_rate": 2.4329828146074096e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 960 }, { "completion_length": 256.0, "epoch": 8.07563025210084, "grad_norm": 0.011562367901206017, "kl": 0.004691132344305515, "learning_rate": 2.313039882139101e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 961 }, { "completion_length": 140.5, "epoch": 8.084033613445378, "grad_norm": 1.2488014698028564, "kl": 0.003145235124975443, "learning_rate": 2.1961149371145795e-08, "loss": 0.0001, "reward": -0.01000000536441803, "reward_std": 0.36769551038742065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01000000536441803, "step": 962 }, { "completion_length": 188.5, "epoch": 8.092436974789916, "grad_norm": 0.014288817532360554, "kl": 0.007699177134782076, "learning_rate": 2.082209404227403e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 963 }, { "completion_length": 256.0, "epoch": 8.100840336134453, "grad_norm": 0.008828825317323208, "kl": 0.004723397083580494, "learning_rate": 1.9713246713805588e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 964 }, { "completion_length": 256.0, "epoch": 8.109243697478991, "grad_norm": 0.005150586366653442, "kl": 0.0016746178735047579, "learning_rate": 1.8634620896695044e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 965 }, { "completion_length": 256.0, "epoch": 8.117647058823529, "grad_norm": 0.007571014575660229, "kl": 0.002188426675274968, "learning_rate": 1.7586229733657646e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 966 }, { "completion_length": 256.0, "epoch": 8.126050420168067, "grad_norm": 0.00575451273471117, "kl": 0.0012173590948805213, "learning_rate": 1.6568085999008886e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 967 }, { "completion_length": 256.0, "epoch": 8.134453781512605, "grad_norm": 0.007901258766651154, "kl": 0.0021511928644031286, "learning_rate": 1.5580202098509078e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 968 }, { "completion_length": 254.5, "epoch": 8.142857142857142, "grad_norm": 0.7763856053352356, "kl": 0.00271783908829093, "learning_rate": 1.4622590069211517e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 969 }, { "completion_length": 249.5, "epoch": 8.15126050420168, "grad_norm": 0.0095134312286973, "kl": 0.0035448165144771338, "learning_rate": 1.3695261579316776e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 970 }, { "completion_length": 136.5, "epoch": 8.159663865546218, "grad_norm": 0.009128165431320667, "kl": 0.0016892245039343834, "learning_rate": 1.2798227928029483e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 971 }, { "completion_length": 230.5, "epoch": 8.168067226890756, "grad_norm": 0.00985642708837986, "kl": 0.004231259226799011, "learning_rate": 1.193150004542204e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 972 }, { "completion_length": 256.0, "epoch": 8.176470588235293, "grad_norm": 0.006507782265543938, "kl": 0.0013068061089143157, "learning_rate": 1.109508849230001e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 973 }, { "completion_length": 199.0, "epoch": 8.184873949579831, "grad_norm": 0.009809632785618305, "kl": 0.0029625543393194675, "learning_rate": 1.0289003460074165e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 974 }, { "completion_length": 256.0, "epoch": 8.193277310924369, "grad_norm": 0.004974020179361105, "kl": 0.0011391467414796352, "learning_rate": 9.513254770636138e-09, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 975 }, { "completion_length": 159.0, "epoch": 8.201680672268907, "grad_norm": 1.9078142642974854, "kl": 0.005038903560489416, "learning_rate": 8.767851876239075e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 976 }, { "completion_length": 62.0, "epoch": 8.210084033613445, "grad_norm": 0.008387730456888676, "kl": 0.0018561738543212414, "learning_rate": 8.052803859382174e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 977 }, { "completion_length": 256.0, "epoch": 8.218487394957982, "grad_norm": 0.004827218595892191, "kl": 0.0019515759777277708, "learning_rate": 7.368119432699383e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 978 }, { "completion_length": 256.0, "epoch": 8.22689075630252, "grad_norm": 0.006704502273350954, "kl": 0.0017579683335497975, "learning_rate": 6.7138069388547614e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 979 }, { "completion_length": 254.0, "epoch": 8.235294117647058, "grad_norm": 0.8928896188735962, "kl": 0.0026945732533931732, "learning_rate": 6.089874350439507e-09, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 980 }, { "completion_length": 256.0, "epoch": 8.243697478991596, "grad_norm": 0.005098224151879549, "kl": 0.0011380916694179177, "learning_rate": 5.4963292698750896e-09, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 981 }, { "completion_length": 256.0, "epoch": 8.252100840336134, "grad_norm": 0.009711641818284988, "kl": 0.001982130343094468, "learning_rate": 4.933178929321103e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 982 }, { "completion_length": 211.5, "epoch": 8.260504201680673, "grad_norm": 0.9163153171539307, "kl": 0.0035074464976787567, "learning_rate": 4.400430190586724e-09, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 983 }, { "completion_length": 256.0, "epoch": 8.268907563025211, "grad_norm": 0.005259423516690731, "kl": 0.0021749367006123066, "learning_rate": 3.8980895450474455e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 984 }, { "completion_length": 256.0, "epoch": 8.277310924369749, "grad_norm": 0.0032086444552987814, "kl": 0.0008269399404525757, "learning_rate": 3.4261631135654174e-09, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 985 }, { "completion_length": 256.0, "epoch": 8.285714285714286, "grad_norm": 0.009038717485964298, "kl": 0.0017567822942510247, "learning_rate": 2.984656646415063e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 986 }, { "completion_length": 256.0, "epoch": 8.294117647058824, "grad_norm": 0.01052094530314207, "kl": 0.0016078234184533358, "learning_rate": 2.573575523213412e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 987 }, { "completion_length": 256.0, "epoch": 8.302521008403362, "grad_norm": 0.004637818783521652, "kl": 0.0014643018366768956, "learning_rate": 2.192924752854042e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 988 }, { "completion_length": 256.0, "epoch": 8.3109243697479, "grad_norm": 0.8975232243537903, "kl": 0.004763162694871426, "learning_rate": 1.842708973447127e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 989 }, { "completion_length": 116.0, "epoch": 8.319327731092438, "grad_norm": 0.009518878534436226, "kl": 0.00221485598012805, "learning_rate": 1.5229324522605949e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 990 }, { "completion_length": 206.5, "epoch": 8.327731092436975, "grad_norm": 1.0217604637145996, "kl": 0.0055153220891952515, "learning_rate": 1.2335990856710001e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 991 }, { "completion_length": 181.5, "epoch": 8.336134453781513, "grad_norm": 1.393789529800415, "kl": 0.0032854671590030193, "learning_rate": 9.747123991141193e-10, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 992 }, { "completion_length": 256.0, "epoch": 8.344537815126051, "grad_norm": 0.006702889688313007, "kl": 0.001793250092305243, "learning_rate": 7.462755470422078e-10, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 993 }, { "completion_length": 109.5, "epoch": 8.352941176470589, "grad_norm": 0.0069739194586873055, "kl": 0.001861305208876729, "learning_rate": 5.48291312886251e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 994 }, { "completion_length": 256.0, "epoch": 8.361344537815127, "grad_norm": 0.007762628607451916, "kl": 0.002744232304394245, "learning_rate": 3.8076210902182607e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 995 }, { "completion_length": 256.0, "epoch": 8.369747899159664, "grad_norm": 0.0054655857384204865, "kl": 0.001322555122897029, "learning_rate": 2.43689976739403e-10, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 996 }, { "completion_length": 166.5, "epoch": 8.378151260504202, "grad_norm": 0.012693590484559536, "kl": 0.0055658938363194466, "learning_rate": 1.3707658621964216e-10, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 997 }, { "completion_length": 256.0, "epoch": 8.38655462184874, "grad_norm": 0.006239285226911306, "kl": 0.002571606542915106, "learning_rate": 6.092323651313293e-11, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 998 }, { "completion_length": 256.0, "epoch": 8.394957983193278, "grad_norm": 0.005442825146019459, "kl": 0.0013467655517160892, "learning_rate": 1.5230855524017708e-11, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 999 }, { "completion_length": 256.0, "epoch": 8.403361344537815, "grad_norm": 0.006616292521357536, "kl": 0.0019641756080091, "learning_rate": 0.0, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }