diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12650 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 1000, + "global_step": 8316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012025012025012025, + "grad_norm": 801.1768400159908, + "learning_rate": 6.009615384615385e-10, + "logits/generated": -2.701044797897339, + "logits/real": -2.630336046218872, + "logps/generated": -268.9845275878906, + "logps/real": -454.865478515625, + "loss": 1.0444, + "rewards/accuracies": 0.0, + "rewards/generated": 0.0, + "rewards/margins": 0.0, + "rewards/real": 0.0, + "step": 1 + }, + { + "epoch": 0.0012025012025012026, + "grad_norm": 779.6618128301591, + "learning_rate": 6.009615384615385e-09, + "logits/generated": -2.5943427085876465, + "logits/real": -2.5661115646362305, + "logps/generated": -653.7264404296875, + "logps/real": -644.458740234375, + "loss": 1.1575, + "rewards/accuracies": 0.2777777910232544, + "rewards/generated": 0.0031201376114040613, + "rewards/margins": -0.036962077021598816, + "rewards/real": -0.03384193778038025, + "step": 10 + }, + { + "epoch": 0.002405002405002405, + "grad_norm": 801.905899552404, + "learning_rate": 1.201923076923077e-08, + "logits/generated": -2.601459503173828, + "logits/real": -2.5791308879852295, + "logps/generated": -425.9059143066406, + "logps/real": -415.3407287597656, + "loss": 1.1469, + "rewards/accuracies": 0.574999988079071, + "rewards/generated": -0.014987630769610405, + "rewards/margins": 0.01562613621354103, + "rewards/real": 0.0006385032320395112, + "step": 20 + }, + { + "epoch": 0.0036075036075036075, + "grad_norm": 915.0921462682539, + "learning_rate": 1.802884615384615e-08, + "logits/generated": -2.614966869354248, + "logits/real": -2.579796552658081, + "logps/generated": -541.577392578125, + "logps/real": -548.8224487304688, + "loss": 1.1734, + "rewards/accuracies": 0.42500001192092896, + "rewards/generated": 0.016047468408942223, + "rewards/margins": -0.0203808955848217, + "rewards/real": -0.004333429038524628, + "step": 30 + }, + { + "epoch": 0.00481000481000481, + "grad_norm": 730.4416358593099, + "learning_rate": 2.403846153846154e-08, + "logits/generated": -2.593712091445923, + "logits/real": -2.599540948867798, + "logps/generated": -575.514404296875, + "logps/real": -462.55242919921875, + "loss": 1.1219, + "rewards/accuracies": 0.5249999761581421, + "rewards/generated": 0.0017959155375137925, + "rewards/margins": 0.014108413830399513, + "rewards/real": 0.015904325991868973, + "step": 40 + }, + { + "epoch": 0.006012506012506013, + "grad_norm": 764.8278246157919, + "learning_rate": 3.004807692307692e-08, + "logits/generated": -2.609126329421997, + "logits/real": -2.536421537399292, + "logps/generated": -554.6502075195312, + "logps/real": -468.76885986328125, + "loss": 1.1639, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.04441952705383301, + "rewards/margins": 0.1175607442855835, + "rewards/real": 0.07314121723175049, + "step": 50 + }, + { + "epoch": 0.007215007215007215, + "grad_norm": 723.6041258034065, + "learning_rate": 3.60576923076923e-08, + "logits/generated": -2.613905668258667, + "logits/real": -2.5859856605529785, + "logps/generated": -523.7518310546875, + "logps/real": -452.20654296875, + "loss": 1.1171, + "rewards/accuracies": 0.800000011920929, + "rewards/generated": 0.0223550945520401, + "rewards/margins": 0.091279536485672, + "rewards/real": 0.11363464593887329, + "step": 60 + }, + { + "epoch": 0.008417508417508417, + "grad_norm": 807.1990949198348, + "learning_rate": 4.206730769230769e-08, + "logits/generated": -2.6600990295410156, + "logits/real": -2.5796685218811035, + "logps/generated": -488.0248107910156, + "logps/real": -502.6383361816406, + "loss": 1.0794, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.03872176259756088, + "rewards/margins": 0.2055022418498993, + "rewards/real": 0.1667805016040802, + "step": 70 + }, + { + "epoch": 0.00962000962000962, + "grad_norm": 557.9206390670668, + "learning_rate": 4.807692307692308e-08, + "logits/generated": -2.5978739261627197, + "logits/real": -2.5913002490997314, + "logps/generated": -405.31683349609375, + "logps/real": -380.29229736328125, + "loss": 1.0792, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.05246102809906006, + "rewards/margins": 0.22621431946754456, + "rewards/real": 0.1737532913684845, + "step": 80 + }, + { + "epoch": 0.010822510822510822, + "grad_norm": 542.1020748733775, + "learning_rate": 5.4086538461538464e-08, + "logits/generated": -2.5775256156921387, + "logits/real": -2.583211898803711, + "logps/generated": -609.107421875, + "logps/real": -473.18231201171875, + "loss": 0.9755, + "rewards/accuracies": 0.7749999761581421, + "rewards/generated": -0.1160820946097374, + "rewards/margins": 0.31075945496559143, + "rewards/real": 0.19467735290527344, + "step": 90 + }, + { + "epoch": 0.012025012025012025, + "grad_norm": 640.8082792099547, + "learning_rate": 6.009615384615384e-08, + "logits/generated": -2.6364057064056396, + "logits/real": -2.643916606903076, + "logps/generated": -570.5785522460938, + "logps/real": -506.00396728515625, + "loss": 0.9193, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.19135859608650208, + "rewards/margins": 0.3666282296180725, + "rewards/real": 0.17526963353157043, + "step": 100 + }, + { + "epoch": 0.013227513227513227, + "grad_norm": 1166.4897957447906, + "learning_rate": 6.610576923076924e-08, + "logits/generated": -2.634885311126709, + "logits/real": -2.648221492767334, + "logps/generated": -475.5386657714844, + "logps/real": -539.703369140625, + "loss": 1.0419, + "rewards/accuracies": 0.75, + "rewards/generated": 0.035467393696308136, + "rewards/margins": 0.3378516435623169, + "rewards/real": 0.37331902980804443, + "step": 110 + }, + { + "epoch": 0.01443001443001443, + "grad_norm": 812.577454904602, + "learning_rate": 7.21153846153846e-08, + "logits/generated": -2.6088433265686035, + "logits/real": -2.587759494781494, + "logps/generated": -385.3928527832031, + "logps/real": -301.99420166015625, + "loss": 0.9061, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.4538938105106354, + "rewards/margins": 0.6048692464828491, + "rewards/real": 0.15097543597221375, + "step": 120 + }, + { + "epoch": 0.015632515632515633, + "grad_norm": 820.3630614817264, + "learning_rate": 7.812499999999999e-08, + "logits/generated": -2.6701788902282715, + "logits/real": -2.5965487957000732, + "logps/generated": -540.5206909179688, + "logps/real": -473.69091796875, + "loss": 0.9399, + "rewards/accuracies": 0.800000011920929, + "rewards/generated": -0.04602137207984924, + "rewards/margins": 0.4468808174133301, + "rewards/real": 0.40085944533348083, + "step": 130 + }, + { + "epoch": 0.016835016835016835, + "grad_norm": 672.691049772504, + "learning_rate": 8.413461538461539e-08, + "logits/generated": -2.666149139404297, + "logits/real": -2.6245312690734863, + "logps/generated": -540.637451171875, + "logps/real": -487.11004638671875, + "loss": 0.9741, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -0.5353710055351257, + "rewards/margins": 0.8442476987838745, + "rewards/real": 0.30887678265571594, + "step": 140 + }, + { + "epoch": 0.018037518037518036, + "grad_norm": 382.5992415420118, + "learning_rate": 9.014423076923076e-08, + "logits/generated": -2.625582218170166, + "logits/real": -2.588461399078369, + "logps/generated": -458.9794006347656, + "logps/real": -340.5458679199219, + "loss": 0.7644, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.8172666430473328, + "rewards/margins": 1.0257928371429443, + "rewards/real": 0.2085261046886444, + "step": 150 + }, + { + "epoch": 0.01924001924001924, + "grad_norm": 514.8693438152337, + "learning_rate": 9.615384615384616e-08, + "logits/generated": -2.6541621685028076, + "logits/real": -2.6064515113830566, + "logps/generated": -494.205322265625, + "logps/real": -537.3380126953125, + "loss": 0.8356, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.7966269254684448, + "rewards/margins": 1.1517274379730225, + "rewards/real": 0.3551006019115448, + "step": 160 + }, + { + "epoch": 0.020442520442520443, + "grad_norm": 290.27033949114116, + "learning_rate": 1.0216346153846154e-07, + "logits/generated": -2.7106773853302, + "logits/real": -2.659435272216797, + "logps/generated": -480.480224609375, + "logps/real": -387.47772216796875, + "loss": 0.822, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.075594186782837, + "rewards/margins": 1.4880707263946533, + "rewards/real": 0.4124765992164612, + "step": 170 + }, + { + "epoch": 0.021645021645021644, + "grad_norm": 420.11050173483665, + "learning_rate": 1.0817307692307693e-07, + "logits/generated": -2.6927719116210938, + "logits/real": -2.6310219764709473, + "logps/generated": -454.25994873046875, + "logps/real": -530.0487060546875, + "loss": 0.8513, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.28558117151260376, + "rewards/margins": 0.7983309626579285, + "rewards/real": 0.5127498507499695, + "step": 180 + }, + { + "epoch": 0.02284752284752285, + "grad_norm": 568.4859686887961, + "learning_rate": 1.141826923076923e-07, + "logits/generated": -2.6487412452697754, + "logits/real": -2.6339938640594482, + "logps/generated": -395.45294189453125, + "logps/real": -395.30145263671875, + "loss": 0.7378, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.9616363644599915, + "rewards/margins": 1.4832828044891357, + "rewards/real": 0.5216464996337891, + "step": 190 + }, + { + "epoch": 0.02405002405002405, + "grad_norm": 770.9599691138654, + "learning_rate": 1.2019230769230769e-07, + "logits/generated": -2.6841750144958496, + "logits/real": -2.6595330238342285, + "logps/generated": -557.3283081054688, + "logps/real": -498.83392333984375, + "loss": 0.8671, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.3152936697006226, + "rewards/margins": 1.9122545719146729, + "rewards/real": 0.5969609022140503, + "step": 200 + }, + { + "epoch": 0.025252525252525252, + "grad_norm": 633.8366426708363, + "learning_rate": 1.2620192307692308e-07, + "logits/generated": -2.6651999950408936, + "logits/real": -2.6084225177764893, + "logps/generated": -550.3500366210938, + "logps/real": -575.7730712890625, + "loss": 0.9144, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.824698269367218, + "rewards/margins": 1.798667311668396, + "rewards/real": 0.9739691019058228, + "step": 210 + }, + { + "epoch": 0.026455026455026454, + "grad_norm": 301.80349725323805, + "learning_rate": 1.3221153846153847e-07, + "logits/generated": -2.6506190299987793, + "logits/real": -2.632826805114746, + "logps/generated": -338.2746276855469, + "logps/real": -296.9292297363281, + "loss": 0.6624, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.1101773977279663, + "rewards/margins": 1.6372003555297852, + "rewards/real": 0.5270229578018188, + "step": 220 + }, + { + "epoch": 0.02765752765752766, + "grad_norm": 345.67994999617287, + "learning_rate": 1.3822115384615384e-07, + "logits/generated": -2.6194703578948975, + "logits/real": -2.614107131958008, + "logps/generated": -432.5381774902344, + "logps/real": -374.21966552734375, + "loss": 0.7959, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.127386450767517, + "rewards/margins": 1.9437313079833984, + "rewards/real": 0.8163448572158813, + "step": 230 + }, + { + "epoch": 0.02886002886002886, + "grad_norm": 602.2763147294542, + "learning_rate": 1.442307692307692e-07, + "logits/generated": -2.604137897491455, + "logits/real": -2.5926718711853027, + "logps/generated": -501.0613708496094, + "logps/real": -463.6505432128906, + "loss": 0.7729, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -0.8467072248458862, + "rewards/margins": 1.6996408700942993, + "rewards/real": 0.8529335260391235, + "step": 240 + }, + { + "epoch": 0.03006253006253006, + "grad_norm": 141.56496682275844, + "learning_rate": 1.502403846153846e-07, + "logits/generated": -2.6360106468200684, + "logits/real": -2.5817055702209473, + "logps/generated": -529.6976318359375, + "logps/real": -435.2784729003906, + "loss": 0.7024, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.684234380722046, + "rewards/margins": 2.7245383262634277, + "rewards/real": 1.0403037071228027, + "step": 250 + }, + { + "epoch": 0.031265031265031266, + "grad_norm": 564.9266573022827, + "learning_rate": 1.5624999999999999e-07, + "logits/generated": -2.692927598953247, + "logits/real": -2.611467123031616, + "logps/generated": -524.0853271484375, + "logps/real": -434.35821533203125, + "loss": 0.7334, + "rewards/accuracies": 0.800000011920929, + "rewards/generated": -1.3338186740875244, + "rewards/margins": 2.573237419128418, + "rewards/real": 1.239418625831604, + "step": 260 + }, + { + "epoch": 0.032467532467532464, + "grad_norm": 524.1826937063853, + "learning_rate": 1.6225961538461538e-07, + "logits/generated": -2.6772425174713135, + "logits/real": -2.6025278568267822, + "logps/generated": -580.0106201171875, + "logps/real": -491.933837890625, + "loss": 0.7656, + "rewards/accuracies": 0.875, + "rewards/generated": -1.5975688695907593, + "rewards/margins": 2.1904983520507812, + "rewards/real": 0.5929292440414429, + "step": 270 + }, + { + "epoch": 0.03367003367003367, + "grad_norm": 428.7369151880417, + "learning_rate": 1.6826923076923077e-07, + "logits/generated": -2.6498706340789795, + "logits/real": -2.650085687637329, + "logps/generated": -392.8506774902344, + "logps/real": -369.8538513183594, + "loss": 0.6524, + "rewards/accuracies": 0.875, + "rewards/generated": -1.6748186349868774, + "rewards/margins": 2.5976762771606445, + "rewards/real": 0.9228577613830566, + "step": 280 + }, + { + "epoch": 0.034872534872534874, + "grad_norm": 243.60844008737482, + "learning_rate": 1.7427884615384614e-07, + "logits/generated": -2.666207790374756, + "logits/real": -2.6470437049865723, + "logps/generated": -502.02264404296875, + "logps/real": -403.56585693359375, + "loss": 0.7117, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -1.3320616483688354, + "rewards/margins": 2.2303693294525146, + "rewards/real": 0.8983078002929688, + "step": 290 + }, + { + "epoch": 0.03607503607503607, + "grad_norm": 593.7863933773639, + "learning_rate": 1.8028846153846153e-07, + "logits/generated": -2.6411917209625244, + "logits/real": -2.5625710487365723, + "logps/generated": -457.24298095703125, + "logps/real": -396.57537841796875, + "loss": 0.6727, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -1.5696680545806885, + "rewards/margins": 1.9555574655532837, + "rewards/real": 0.3858892619609833, + "step": 300 + }, + { + "epoch": 0.03727753727753728, + "grad_norm": 564.2932253829903, + "learning_rate": 1.8629807692307692e-07, + "logits/generated": -2.5881824493408203, + "logits/real": -2.561656951904297, + "logps/generated": -487.9922790527344, + "logps/real": -400.12200927734375, + "loss": 0.6698, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.118783712387085, + "rewards/margins": 2.787592887878418, + "rewards/real": 0.6688090562820435, + "step": 310 + }, + { + "epoch": 0.03848003848003848, + "grad_norm": 472.1429017589802, + "learning_rate": 1.9230769230769231e-07, + "logits/generated": -2.5694048404693604, + "logits/real": -2.504631519317627, + "logps/generated": -482.6383361816406, + "logps/real": -373.3436279296875, + "loss": 0.7241, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -1.9490076303482056, + "rewards/margins": 2.4001662731170654, + "rewards/real": 0.45115867257118225, + "step": 320 + }, + { + "epoch": 0.03968253968253968, + "grad_norm": 351.72945578036473, + "learning_rate": 1.9831730769230768e-07, + "logits/generated": -2.5863828659057617, + "logits/real": -2.556687831878662, + "logps/generated": -566.9625244140625, + "logps/real": -511.40673828125, + "loss": 0.6826, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.6171916723251343, + "rewards/margins": 2.7914156913757324, + "rewards/real": 1.1742244958877563, + "step": 330 + }, + { + "epoch": 0.040885040885040885, + "grad_norm": 230.944199336625, + "learning_rate": 2.0432692307692307e-07, + "logits/generated": -2.579220771789551, + "logits/real": -2.5368800163269043, + "logps/generated": -587.6741333007812, + "logps/real": -509.1455993652344, + "loss": 0.61, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.6644325256347656, + "rewards/margins": 4.057629585266113, + "rewards/real": 1.3931968212127686, + "step": 340 + }, + { + "epoch": 0.04208754208754209, + "grad_norm": 610.2796148482761, + "learning_rate": 2.1033653846153846e-07, + "logits/generated": -2.6246964931488037, + "logits/real": -2.607069492340088, + "logps/generated": -405.486083984375, + "logps/real": -408.4022521972656, + "loss": 0.6567, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -1.6070611476898193, + "rewards/margins": 3.0959818363189697, + "rewards/real": 1.4889209270477295, + "step": 350 + }, + { + "epoch": 0.04329004329004329, + "grad_norm": 311.0025856326943, + "learning_rate": 2.1634615384615386e-07, + "logits/generated": -2.555190324783325, + "logits/real": -2.564065456390381, + "logps/generated": -445.73095703125, + "logps/real": -367.8589172363281, + "loss": 0.5773, + "rewards/accuracies": 0.875, + "rewards/generated": -1.8315942287445068, + "rewards/margins": 3.250680923461914, + "rewards/real": 1.4190864562988281, + "step": 360 + }, + { + "epoch": 0.04449254449254449, + "grad_norm": 355.8050012317517, + "learning_rate": 2.223557692307692e-07, + "logits/generated": -2.5479896068573, + "logits/real": -2.5375823974609375, + "logps/generated": -451.8771057128906, + "logps/real": -421.00421142578125, + "loss": 0.6029, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.6865472793579102, + "rewards/margins": 3.6383838653564453, + "rewards/real": 1.9518362283706665, + "step": 370 + }, + { + "epoch": 0.0456950456950457, + "grad_norm": 213.75664258958034, + "learning_rate": 2.283653846153846e-07, + "logits/generated": -2.6107444763183594, + "logits/real": -2.5683038234710693, + "logps/generated": -472.42938232421875, + "logps/real": -404.27276611328125, + "loss": 0.5558, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.811893105506897, + "rewards/margins": 4.338118076324463, + "rewards/real": 2.5262250900268555, + "step": 380 + }, + { + "epoch": 0.046897546897546896, + "grad_norm": 803.8075458632958, + "learning_rate": 2.3437499999999998e-07, + "logits/generated": -2.624004364013672, + "logits/real": -2.607717514038086, + "logps/generated": -532.3065795898438, + "logps/real": -464.7298889160156, + "loss": 0.7197, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.13997170329093933, + "rewards/margins": 3.161569356918335, + "rewards/real": 3.0215978622436523, + "step": 390 + }, + { + "epoch": 0.0481000481000481, + "grad_norm": 77.72914826224212, + "learning_rate": 2.4038461538461537e-07, + "logits/generated": -2.6229655742645264, + "logits/real": -2.5912537574768066, + "logps/generated": -599.0198364257812, + "logps/real": -509.234619140625, + "loss": 0.6238, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.3059558868408203, + "rewards/margins": 5.463759899139404, + "rewards/real": 3.157804012298584, + "step": 400 + }, + { + "epoch": 0.0493025493025493, + "grad_norm": 215.20858836955986, + "learning_rate": 2.4639423076923076e-07, + "logits/generated": -2.5568790435791016, + "logits/real": -2.519082546234131, + "logps/generated": -547.3959350585938, + "logps/real": -472.25244140625, + "loss": 0.5749, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.3383524417877197, + "rewards/margins": 4.01686954498291, + "rewards/real": 2.678516387939453, + "step": 410 + }, + { + "epoch": 0.050505050505050504, + "grad_norm": 270.6802091677648, + "learning_rate": 2.5240384615384616e-07, + "logits/generated": -2.525097131729126, + "logits/real": -2.5024070739746094, + "logps/generated": -409.02276611328125, + "logps/real": -322.4518127441406, + "loss": 0.5, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.2078709602355957, + "rewards/margins": 3.8254752159118652, + "rewards/real": 1.6176042556762695, + "step": 420 + }, + { + "epoch": 0.05170755170755171, + "grad_norm": 438.88307962028466, + "learning_rate": 2.5841346153846155e-07, + "logits/generated": -2.5554592609405518, + "logits/real": -2.5345654487609863, + "logps/generated": -374.7137145996094, + "logps/real": -272.47564697265625, + "loss": 0.6292, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -2.2819714546203613, + "rewards/margins": 3.79643177986145, + "rewards/real": 1.5144603252410889, + "step": 430 + }, + { + "epoch": 0.05291005291005291, + "grad_norm": 159.03520082916455, + "learning_rate": 2.6442307692307694e-07, + "logits/generated": -2.523759365081787, + "logits/real": -2.5251123905181885, + "logps/generated": -440.36126708984375, + "logps/real": -395.1308288574219, + "loss": 0.6537, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.2008309364318848, + "rewards/margins": 4.090251922607422, + "rewards/real": 1.8894212245941162, + "step": 440 + }, + { + "epoch": 0.05411255411255411, + "grad_norm": 106.50435638461428, + "learning_rate": 2.704326923076923e-07, + "logits/generated": -2.577564239501953, + "logits/real": -2.5691657066345215, + "logps/generated": -575.0303344726562, + "logps/real": -491.9471130371094, + "loss": 0.6023, + "rewards/accuracies": 1.0, + "rewards/generated": -1.6865742206573486, + "rewards/margins": 4.896966457366943, + "rewards/real": 3.2103919982910156, + "step": 450 + }, + { + "epoch": 0.05531505531505532, + "grad_norm": 149.6158479648117, + "learning_rate": 2.7644230769230767e-07, + "logits/generated": -2.583193302154541, + "logits/real": -2.527129650115967, + "logps/generated": -511.2799377441406, + "logps/real": -359.5958251953125, + "loss": 0.7013, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.581364154815674, + "rewards/margins": 5.107150554656982, + "rewards/real": 2.5257863998413086, + "step": 460 + }, + { + "epoch": 0.056517556517556515, + "grad_norm": 286.1520809667947, + "learning_rate": 2.8245192307692306e-07, + "logits/generated": -2.5159034729003906, + "logits/real": -2.4691407680511475, + "logps/generated": -538.7835693359375, + "logps/real": -378.9813537597656, + "loss": 0.5576, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.5919742584228516, + "rewards/margins": 5.763204097747803, + "rewards/real": 2.171229600906372, + "step": 470 + }, + { + "epoch": 0.05772005772005772, + "grad_norm": 266.57122924881753, + "learning_rate": 2.884615384615384e-07, + "logits/generated": -2.57939076423645, + "logits/real": -2.518014430999756, + "logps/generated": -464.8843688964844, + "logps/real": -393.81854248046875, + "loss": 0.5013, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.7847225666046143, + "rewards/margins": 6.235506534576416, + "rewards/real": 2.4507832527160645, + "step": 480 + }, + { + "epoch": 0.058922558922558925, + "grad_norm": 55.87920931382874, + "learning_rate": 2.9447115384615385e-07, + "logits/generated": -2.5865557193756104, + "logits/real": -2.562652349472046, + "logps/generated": -597.1098022460938, + "logps/real": -537.5181884765625, + "loss": 0.5396, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.842224359512329, + "rewards/margins": 5.604229927062988, + "rewards/real": 3.76200532913208, + "step": 490 + }, + { + "epoch": 0.06012506012506012, + "grad_norm": 131.18473169676926, + "learning_rate": 3.004807692307692e-07, + "logits/generated": -2.506545066833496, + "logits/real": -2.4941606521606445, + "logps/generated": -447.23748779296875, + "logps/real": -305.6527404785156, + "loss": 0.506, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.6240792274475098, + "rewards/margins": 4.890207290649414, + "rewards/real": 2.2661280632019043, + "step": 500 + }, + { + "epoch": 0.06132756132756133, + "grad_norm": 62.90722345988625, + "learning_rate": 3.0649038461538463e-07, + "logits/generated": -2.5521976947784424, + "logits/real": -2.5559165477752686, + "logps/generated": -520.6541748046875, + "logps/real": -467.57781982421875, + "loss": 0.5945, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -1.9740931987762451, + "rewards/margins": 5.077263355255127, + "rewards/real": 3.1031699180603027, + "step": 510 + }, + { + "epoch": 0.06253006253006253, + "grad_norm": 545.3694751870157, + "learning_rate": 3.1249999999999997e-07, + "logits/generated": -2.5236270427703857, + "logits/real": -2.4969699382781982, + "logps/generated": -386.9814453125, + "logps/real": -265.28399658203125, + "loss": 0.6543, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.31213641166687, + "rewards/margins": 4.220711708068848, + "rewards/real": 1.9085750579833984, + "step": 520 + }, + { + "epoch": 0.06373256373256374, + "grad_norm": 160.94599584449614, + "learning_rate": 3.1850961538461536e-07, + "logits/generated": -2.492840051651001, + "logits/real": -2.504638910293579, + "logps/generated": -438.1183166503906, + "logps/real": -422.6055603027344, + "loss": 0.4684, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.6569240093231201, + "rewards/margins": 5.817118167877197, + "rewards/real": 4.160194396972656, + "step": 530 + }, + { + "epoch": 0.06493506493506493, + "grad_norm": 219.24291818102114, + "learning_rate": 3.2451923076923076e-07, + "logits/generated": -2.5277576446533203, + "logits/real": -2.534254550933838, + "logps/generated": -567.7418212890625, + "logps/real": -410.13189697265625, + "loss": 0.5648, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.521907091140747, + "rewards/margins": 5.512710094451904, + "rewards/real": 3.9908034801483154, + "step": 540 + }, + { + "epoch": 0.06613756613756613, + "grad_norm": 262.4963212470536, + "learning_rate": 3.3052884615384615e-07, + "logits/generated": -2.4447712898254395, + "logits/real": -2.4399654865264893, + "logps/generated": -563.7222290039062, + "logps/real": -487.09149169921875, + "loss": 0.5795, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.4821799993515015, + "rewards/margins": 5.634998321533203, + "rewards/real": 4.152817726135254, + "step": 550 + }, + { + "epoch": 0.06734006734006734, + "grad_norm": 14.40198793340137, + "learning_rate": 3.3653846153846154e-07, + "logits/generated": -2.446659803390503, + "logits/real": -2.4423022270202637, + "logps/generated": -510.00048828125, + "logps/real": -354.17633056640625, + "loss": 0.463, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.6586902141571045, + "rewards/margins": 5.03386926651001, + "rewards/real": 3.375178098678589, + "step": 560 + }, + { + "epoch": 0.06854256854256854, + "grad_norm": 322.40706415540495, + "learning_rate": 3.4254807692307693e-07, + "logits/generated": -2.436397075653076, + "logits/real": -2.407928943634033, + "logps/generated": -444.75335693359375, + "logps/real": -361.814697265625, + "loss": 0.5566, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.6426433324813843, + "rewards/margins": 5.619287490844727, + "rewards/real": 3.9766430854797363, + "step": 570 + }, + { + "epoch": 0.06974506974506975, + "grad_norm": 520.8071806827575, + "learning_rate": 3.4855769230769227e-07, + "logits/generated": -2.443695545196533, + "logits/real": -2.4213390350341797, + "logps/generated": -573.2529296875, + "logps/real": -466.0352478027344, + "loss": 0.5636, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.9522768259048462, + "rewards/margins": 7.255436897277832, + "rewards/real": 5.303159713745117, + "step": 580 + }, + { + "epoch": 0.07094757094757095, + "grad_norm": 136.86662774329525, + "learning_rate": 3.545673076923077e-07, + "logits/generated": -2.444119930267334, + "logits/real": -2.4461493492126465, + "logps/generated": -403.935546875, + "logps/real": -372.10955810546875, + "loss": 0.5627, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.049966335296631, + "rewards/margins": 5.5424723625183105, + "rewards/real": 3.492506504058838, + "step": 590 + }, + { + "epoch": 0.07215007215007214, + "grad_norm": 200.39657674377634, + "learning_rate": 3.6057692307692306e-07, + "logits/generated": -2.4266598224639893, + "logits/real": -2.3929033279418945, + "logps/generated": -471.447998046875, + "logps/real": -349.2568359375, + "loss": 0.4766, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.8870925903320312, + "rewards/margins": 6.241556167602539, + "rewards/real": 3.3544631004333496, + "step": 600 + }, + { + "epoch": 0.07335257335257335, + "grad_norm": 26.212093486989286, + "learning_rate": 3.665865384615384e-07, + "logits/generated": -2.537414312362671, + "logits/real": -2.522113800048828, + "logps/generated": -539.5340576171875, + "logps/real": -499.4637756347656, + "loss": 0.563, + "rewards/accuracies": 0.875, + "rewards/generated": 0.5028484463691711, + "rewards/margins": 5.832631587982178, + "rewards/real": 6.335480213165283, + "step": 610 + }, + { + "epoch": 0.07455507455507455, + "grad_norm": 288.1085577690171, + "learning_rate": 3.7259615384615384e-07, + "logits/generated": -2.44157075881958, + "logits/real": -2.4534690380096436, + "logps/generated": -562.2481689453125, + "logps/real": -413.3949279785156, + "loss": 0.7212, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.3799520432949066, + "rewards/margins": 6.192772388458252, + "rewards/real": 5.812820911407471, + "step": 620 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 234.3285774174516, + "learning_rate": 3.786057692307692e-07, + "logits/generated": -2.4359230995178223, + "logits/real": -2.4403796195983887, + "logps/generated": -473.8140563964844, + "logps/real": -376.0264587402344, + "loss": 0.5482, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.6209138631820679, + "rewards/margins": 6.4328203201293945, + "rewards/real": 5.811906337738037, + "step": 630 + }, + { + "epoch": 0.07696007696007696, + "grad_norm": 607.9712137079911, + "learning_rate": 3.8461538461538463e-07, + "logits/generated": -2.480332851409912, + "logits/real": -2.427781105041504, + "logps/generated": -534.2012939453125, + "logps/real": -399.5908203125, + "loss": 0.5918, + "rewards/accuracies": 0.875, + "rewards/generated": -0.24489817023277283, + "rewards/margins": 5.902339458465576, + "rewards/real": 5.657441139221191, + "step": 640 + }, + { + "epoch": 0.07816257816257816, + "grad_norm": 644.5895679430932, + "learning_rate": 3.9062499999999997e-07, + "logits/generated": -2.4072418212890625, + "logits/real": -2.4051966667175293, + "logps/generated": -488.00372314453125, + "logps/real": -419.2978515625, + "loss": 0.6482, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.9883257746696472, + "rewards/margins": 6.993611812591553, + "rewards/real": 6.005285739898682, + "step": 650 + }, + { + "epoch": 0.07936507936507936, + "grad_norm": 165.58063796257375, + "learning_rate": 3.9663461538461536e-07, + "logits/generated": -2.4074649810791016, + "logits/real": -2.4075779914855957, + "logps/generated": -429.4495544433594, + "logps/real": -338.69903564453125, + "loss": 0.4639, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.5352877378463745, + "rewards/margins": 5.956145763397217, + "rewards/real": 4.4208574295043945, + "step": 660 + }, + { + "epoch": 0.08056758056758057, + "grad_norm": 470.2298594259089, + "learning_rate": 4.0264423076923075e-07, + "logits/generated": -2.438861846923828, + "logits/real": -2.43005108833313, + "logps/generated": -609.919921875, + "logps/real": -400.38555908203125, + "loss": 0.5084, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.441681146621704, + "rewards/margins": 8.445650100708008, + "rewards/real": 7.003968715667725, + "step": 670 + }, + { + "epoch": 0.08177008177008177, + "grad_norm": 231.19690718837484, + "learning_rate": 4.0865384615384614e-07, + "logits/generated": -2.4505388736724854, + "logits/real": -2.4851975440979004, + "logps/generated": -523.8932495117188, + "logps/real": -337.04669189453125, + "loss": 0.5029, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.192518949508667, + "rewards/margins": 7.243227958679199, + "rewards/real": 5.050708770751953, + "step": 680 + }, + { + "epoch": 0.08297258297258298, + "grad_norm": 36.955288870395485, + "learning_rate": 4.1466346153846153e-07, + "logits/generated": -2.4332611560821533, + "logits/real": -2.445805072784424, + "logps/generated": -447.68951416015625, + "logps/real": -366.5887756347656, + "loss": 0.5458, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.1135241985321045, + "rewards/margins": 5.728610992431641, + "rewards/real": 6.842134952545166, + "step": 690 + }, + { + "epoch": 0.08417508417508418, + "grad_norm": 104.13258883882806, + "learning_rate": 4.2067307692307693e-07, + "logits/generated": -2.400561809539795, + "logits/real": -2.390897512435913, + "logps/generated": -420.000732421875, + "logps/real": -329.4757385253906, + "loss": 0.6707, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -0.8484959602355957, + "rewards/margins": 5.642020225524902, + "rewards/real": 4.793523788452148, + "step": 700 + }, + { + "epoch": 0.08537758537758537, + "grad_norm": 463.4846438228281, + "learning_rate": 4.2668269230769227e-07, + "logits/generated": -2.410674571990967, + "logits/real": -2.3668198585510254, + "logps/generated": -516.2708740234375, + "logps/real": -344.43231201171875, + "loss": 0.5679, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.931654453277588, + "rewards/margins": 8.226637840270996, + "rewards/real": 5.294983863830566, + "step": 710 + }, + { + "epoch": 0.08658008658008658, + "grad_norm": 194.3378900515586, + "learning_rate": 4.326923076923077e-07, + "logits/generated": -2.4032340049743652, + "logits/real": -2.4055473804473877, + "logps/generated": -672.94873046875, + "logps/real": -361.0772399902344, + "loss": 0.5047, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.726757049560547, + "rewards/margins": 10.705833435058594, + "rewards/real": 5.979075908660889, + "step": 720 + }, + { + "epoch": 0.08778258778258778, + "grad_norm": 473.11628415499354, + "learning_rate": 4.3870192307692305e-07, + "logits/generated": -2.441622257232666, + "logits/real": -2.4414029121398926, + "logps/generated": -554.408447265625, + "logps/real": -416.569091796875, + "loss": 0.5421, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.7528645992279053, + "rewards/margins": 10.26198673248291, + "rewards/real": 7.509122371673584, + "step": 730 + }, + { + "epoch": 0.08898508898508899, + "grad_norm": 593.8514206150782, + "learning_rate": 4.447115384615384e-07, + "logits/generated": -2.306051731109619, + "logits/real": -2.3629348278045654, + "logps/generated": -431.8253479003906, + "logps/real": -410.6041564941406, + "loss": 0.7682, + "rewards/accuracies": 0.875, + "rewards/generated": -0.5903338193893433, + "rewards/margins": 6.22799825668335, + "rewards/real": 5.637664794921875, + "step": 740 + }, + { + "epoch": 0.09018759018759019, + "grad_norm": 559.1956981297295, + "learning_rate": 4.5072115384615384e-07, + "logits/generated": -2.4053454399108887, + "logits/real": -2.429900646209717, + "logps/generated": -411.954833984375, + "logps/real": -375.6228332519531, + "loss": 0.5099, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.7702974677085876, + "rewards/margins": 6.885931968688965, + "rewards/real": 6.115634918212891, + "step": 750 + }, + { + "epoch": 0.0913900913900914, + "grad_norm": 208.42536166252188, + "learning_rate": 4.567307692307692e-07, + "logits/generated": -2.435375928878784, + "logits/real": -2.4195473194122314, + "logps/generated": -404.2030944824219, + "logps/real": -304.415283203125, + "loss": 0.5346, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.7665412425994873, + "rewards/margins": 7.186232566833496, + "rewards/real": 5.419691562652588, + "step": 760 + }, + { + "epoch": 0.09259259259259259, + "grad_norm": 277.6783880737456, + "learning_rate": 4.627403846153846e-07, + "logits/generated": -2.446922540664673, + "logits/real": -2.4227371215820312, + "logps/generated": -464.3440856933594, + "logps/real": -328.65765380859375, + "loss": 0.4893, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.3294190168380737, + "rewards/margins": 8.597673416137695, + "rewards/real": 7.26825475692749, + "step": 770 + }, + { + "epoch": 0.09379509379509379, + "grad_norm": 177.72925006175979, + "learning_rate": 4.6874999999999996e-07, + "logits/generated": -2.445438861846924, + "logits/real": -2.4165053367614746, + "logps/generated": -412.7168884277344, + "logps/real": -357.92266845703125, + "loss": 0.5939, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 0.5890105962753296, + "rewards/margins": 6.901103973388672, + "rewards/real": 7.490114688873291, + "step": 780 + }, + { + "epoch": 0.094997594997595, + "grad_norm": 498.9814928270047, + "learning_rate": 4.7475961538461535e-07, + "logits/generated": -2.508052110671997, + "logits/real": -2.463560104370117, + "logps/generated": -544.4752197265625, + "logps/real": -421.33880615234375, + "loss": 0.598, + "rewards/accuracies": 0.875, + "rewards/generated": -0.3339812755584717, + "rewards/margins": 8.22835922241211, + "rewards/real": 7.894377708435059, + "step": 790 + }, + { + "epoch": 0.0962000962000962, + "grad_norm": 177.58519858258708, + "learning_rate": 4.807692307692307e-07, + "logits/generated": -2.529059886932373, + "logits/real": -2.5021815299987793, + "logps/generated": -638.894287109375, + "logps/real": -456.08636474609375, + "loss": 0.6459, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.0881636142730713, + "rewards/margins": 9.593210220336914, + "rewards/real": 8.505046844482422, + "step": 800 + }, + { + "epoch": 0.09740259740259741, + "grad_norm": 289.1705831497986, + "learning_rate": 4.867788461538461e-07, + "logits/generated": -2.4944424629211426, + "logits/real": -2.4910571575164795, + "logps/generated": -389.064208984375, + "logps/real": -338.09130859375, + "loss": 0.6605, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.8759121894836426, + "rewards/margins": 7.720027923583984, + "rewards/real": 4.8441162109375, + "step": 810 + }, + { + "epoch": 0.0986050986050986, + "grad_norm": 89.71763545171552, + "learning_rate": 4.927884615384615e-07, + "logits/generated": -2.4910922050476074, + "logits/real": -2.512207508087158, + "logps/generated": -429.01885986328125, + "logps/real": -306.97186279296875, + "loss": 0.4738, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.9805129766464233, + "rewards/margins": 7.416163444519043, + "rewards/real": 5.435650825500488, + "step": 820 + }, + { + "epoch": 0.0998075998075998, + "grad_norm": 43.180380119194005, + "learning_rate": 4.987980769230769e-07, + "logits/generated": -2.541921854019165, + "logits/real": -2.5123507976531982, + "logps/generated": -642.5328979492188, + "logps/real": -390.1651611328125, + "loss": 0.6895, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.0395376682281494, + "rewards/margins": 8.597407341003418, + "rewards/real": 7.557869911193848, + "step": 830 + }, + { + "epoch": 0.10101010101010101, + "grad_norm": 283.019728239531, + "learning_rate": 4.994655264564404e-07, + "logits/generated": -2.54461407661438, + "logits/real": -2.5405166149139404, + "logps/generated": -526.7052001953125, + "logps/real": -341.8548278808594, + "loss": 0.4662, + "rewards/accuracies": 1.0, + "rewards/generated": -1.519399642944336, + "rewards/margins": 8.346105575561523, + "rewards/real": 6.826704978942871, + "step": 840 + }, + { + "epoch": 0.10221260221260221, + "grad_norm": 59.705205007363304, + "learning_rate": 4.987974345269909e-07, + "logits/generated": -2.5890445709228516, + "logits/real": -2.5490384101867676, + "logps/generated": -526.5341186523438, + "logps/real": -378.7647399902344, + "loss": 0.4837, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.410961866378784, + "rewards/margins": 10.236207008361816, + "rewards/real": 7.825244903564453, + "step": 850 + }, + { + "epoch": 0.10341510341510342, + "grad_norm": 36.879840980165895, + "learning_rate": 4.981293425975414e-07, + "logits/generated": -2.510080337524414, + "logits/real": -2.5209672451019287, + "logps/generated": -557.8668212890625, + "logps/real": -390.33087158203125, + "loss": 0.7064, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -1.326934814453125, + "rewards/margins": 9.699923515319824, + "rewards/real": 8.372990608215332, + "step": 860 + }, + { + "epoch": 0.10461760461760462, + "grad_norm": 44.31221900322654, + "learning_rate": 4.974612506680919e-07, + "logits/generated": -2.4854016304016113, + "logits/real": -2.5063211917877197, + "logps/generated": -390.99456787109375, + "logps/real": -264.6806640625, + "loss": 0.4735, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.6687812805175781, + "rewards/margins": 7.6750898361206055, + "rewards/real": 6.006308555603027, + "step": 870 + }, + { + "epoch": 0.10582010582010581, + "grad_norm": 595.4513964717609, + "learning_rate": 4.967931587386424e-07, + "logits/generated": -2.4633870124816895, + "logits/real": -2.4368672370910645, + "logps/generated": -516.665771484375, + "logps/real": -359.8690490722656, + "loss": 0.8311, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 2.3331992626190186, + "rewards/margins": 7.489238739013672, + "rewards/real": 9.822439193725586, + "step": 880 + }, + { + "epoch": 0.10702260702260702, + "grad_norm": 494.76580483961595, + "learning_rate": 4.961250668091929e-07, + "logits/generated": -2.3668341636657715, + "logits/real": -2.4266374111175537, + "logps/generated": -477.02947998046875, + "logps/real": -399.01104736328125, + "loss": 0.5306, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.31238502264022827, + "rewards/margins": 8.317577362060547, + "rewards/real": 8.005192756652832, + "step": 890 + }, + { + "epoch": 0.10822510822510822, + "grad_norm": 227.26978506638594, + "learning_rate": 4.954569748797434e-07, + "logits/generated": -2.452258586883545, + "logits/real": -2.409104108810425, + "logps/generated": -344.12030029296875, + "logps/real": -279.15472412109375, + "loss": 0.4819, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.35809463262557983, + "rewards/margins": 8.180088996887207, + "rewards/real": 7.821993827819824, + "step": 900 + }, + { + "epoch": 0.10942760942760943, + "grad_norm": 408.53064920970644, + "learning_rate": 4.947888829502939e-07, + "logits/generated": -2.5109660625457764, + "logits/real": -2.5140318870544434, + "logps/generated": -387.4598693847656, + "logps/real": -274.79962158203125, + "loss": 0.637, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.21372707188129425, + "rewards/margins": 9.280416488647461, + "rewards/real": 9.494144439697266, + "step": 910 + }, + { + "epoch": 0.11063011063011063, + "grad_norm": 627.8064977379107, + "learning_rate": 4.941207910208444e-07, + "logits/generated": -2.43747615814209, + "logits/real": -2.4163014888763428, + "logps/generated": -411.9353942871094, + "logps/real": -286.6072082519531, + "loss": 0.6341, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.19150570034980774, + "rewards/margins": 7.435213565826416, + "rewards/real": 7.6267194747924805, + "step": 920 + }, + { + "epoch": 0.11183261183261184, + "grad_norm": 129.83205422064785, + "learning_rate": 4.93452699091395e-07, + "logits/generated": -2.452383279800415, + "logits/real": -2.4608356952667236, + "logps/generated": -479.47430419921875, + "logps/real": -348.01751708984375, + "loss": 0.9279, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 3.176835536956787, + "rewards/margins": 6.142772674560547, + "rewards/real": 9.319608688354492, + "step": 930 + }, + { + "epoch": 0.11303511303511303, + "grad_norm": 60.642646580294546, + "learning_rate": 4.927846071619454e-07, + "logits/generated": -2.526930332183838, + "logits/real": -2.535792112350464, + "logps/generated": -442.809814453125, + "logps/real": -283.3125915527344, + "loss": 0.4962, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.3951802253723145, + "rewards/margins": 11.033866882324219, + "rewards/real": 8.638687133789062, + "step": 940 + }, + { + "epoch": 0.11423761423761424, + "grad_norm": 82.19446737393146, + "learning_rate": 4.921165152324959e-07, + "logits/generated": -2.5252230167388916, + "logits/real": -2.4797539710998535, + "logps/generated": -443.1474609375, + "logps/real": -261.7357177734375, + "loss": 0.7189, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.12446584552526474, + "rewards/margins": 7.716545104980469, + "rewards/real": 7.841011047363281, + "step": 950 + }, + { + "epoch": 0.11544011544011544, + "grad_norm": 375.0730409211993, + "learning_rate": 4.914484233030464e-07, + "logits/generated": -2.524353504180908, + "logits/real": -2.4667043685913086, + "logps/generated": -524.8092041015625, + "logps/real": -352.5014343261719, + "loss": 0.8018, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.1281962394714355, + "rewards/margins": 7.834773063659668, + "rewards/real": 5.706576347351074, + "step": 960 + }, + { + "epoch": 0.11664261664261664, + "grad_norm": 52.860984681162265, + "learning_rate": 4.907803313735969e-07, + "logits/generated": -2.5741899013519287, + "logits/real": -2.5507147312164307, + "logps/generated": -561.8126831054688, + "logps/real": -427.07537841796875, + "loss": 0.5758, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.574278473854065, + "rewards/margins": 10.812956809997559, + "rewards/real": 12.387235641479492, + "step": 970 + }, + { + "epoch": 0.11784511784511785, + "grad_norm": 350.4347742596602, + "learning_rate": 4.901122394441475e-07, + "logits/generated": -2.5642387866973877, + "logits/real": -2.538931131362915, + "logps/generated": -459.7955017089844, + "logps/real": -340.40509033203125, + "loss": 0.5898, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.2329527586698532, + "rewards/margins": 11.073533058166504, + "rewards/real": 10.840580940246582, + "step": 980 + }, + { + "epoch": 0.11904761904761904, + "grad_norm": 132.98066032528084, + "learning_rate": 4.89444147514698e-07, + "logits/generated": -2.4642674922943115, + "logits/real": -2.47807240486145, + "logps/generated": -505.66522216796875, + "logps/real": -333.0113220214844, + "loss": 0.6519, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.3832284212112427, + "rewards/margins": 9.659668922424316, + "rewards/real": 11.042898178100586, + "step": 990 + }, + { + "epoch": 0.12025012025012025, + "grad_norm": 242.28515976706066, + "learning_rate": 4.887760555852485e-07, + "logits/generated": -2.436150074005127, + "logits/real": -2.366624355316162, + "logps/generated": -610.8570556640625, + "logps/real": -486.4978942871094, + "loss": 0.595, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.764488697052002, + "rewards/margins": 11.187214851379395, + "rewards/real": 13.951703071594238, + "step": 1000 + }, + { + "epoch": 0.12025012025012025, + "eval_logits/generated": -2.478092908859253, + "eval_logits/real": -2.4647560119628906, + "eval_logps/generated": -473.17388916015625, + "eval_logps/real": -358.1131896972656, + "eval_loss": 0.5124213099479675, + "eval_rewards/accuracies": 0.9464285969734192, + "eval_rewards/generated": 1.6607673168182373, + "eval_rewards/margins": 9.65501880645752, + "eval_rewards/real": 11.31578540802002, + "eval_runtime": 161.422, + "eval_samples_per_second": 6.195, + "eval_steps_per_second": 0.52, + "step": 1000 + }, + { + "epoch": 0.12145262145262145, + "grad_norm": 33.429061743895765, + "learning_rate": 4.881079636557991e-07, + "logits/generated": -2.500575542449951, + "logits/real": -2.4536807537078857, + "logps/generated": -388.9678955078125, + "logps/real": -366.090576171875, + "loss": 0.5454, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.6237608194351196, + "rewards/margins": 10.379239082336426, + "rewards/real": 9.755477905273438, + "step": 1010 + }, + { + "epoch": 0.12265512265512266, + "grad_norm": 50.69127825558814, + "learning_rate": 4.874398717263496e-07, + "logits/generated": -2.435478687286377, + "logits/real": -2.4292519092559814, + "logps/generated": -407.613037109375, + "logps/real": -299.08050537109375, + "loss": 0.6124, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.8360468745231628, + "rewards/margins": 9.333776473999023, + "rewards/real": 8.49772834777832, + "step": 1020 + }, + { + "epoch": 0.12385762385762386, + "grad_norm": 101.56606026574721, + "learning_rate": 4.867717797969001e-07, + "logits/generated": -2.3880202770233154, + "logits/real": -2.4442012310028076, + "logps/generated": -592.423095703125, + "logps/real": -419.3992614746094, + "loss": 0.5554, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.1970345973968506, + "rewards/margins": 14.139823913574219, + "rewards/real": 12.942790031433105, + "step": 1030 + }, + { + "epoch": 0.12506012506012507, + "grad_norm": 356.7800204621778, + "learning_rate": 4.861036878674505e-07, + "logits/generated": -2.432394504547119, + "logits/real": -2.4268717765808105, + "logps/generated": -515.2378540039062, + "logps/real": -355.1988830566406, + "loss": 0.5997, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.030477285385132, + "rewards/margins": 11.90191650390625, + "rewards/real": 9.871438980102539, + "step": 1040 + }, + { + "epoch": 0.12626262626262627, + "grad_norm": 81.28241781969085, + "learning_rate": 4.85435595938001e-07, + "logits/generated": -2.480248212814331, + "logits/real": -2.490931987762451, + "logps/generated": -531.4738159179688, + "logps/real": -348.71099853515625, + "loss": 0.5742, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 3.440899610519409, + "rewards/margins": 9.678221702575684, + "rewards/real": 13.119122505187988, + "step": 1050 + }, + { + "epoch": 0.12746512746512748, + "grad_norm": 209.72715801690197, + "learning_rate": 4.847675040085515e-07, + "logits/generated": -2.4914391040802, + "logits/real": -2.524989366531372, + "logps/generated": -524.429931640625, + "logps/real": -325.39263916015625, + "loss": 0.6769, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.5767424702644348, + "rewards/margins": 11.284751892089844, + "rewards/real": 10.708008766174316, + "step": 1060 + }, + { + "epoch": 0.12866762866762868, + "grad_norm": 119.62875271914363, + "learning_rate": 4.84099412079102e-07, + "logits/generated": -2.477612018585205, + "logits/real": -2.476527452468872, + "logps/generated": -435.0867614746094, + "logps/real": -314.85333251953125, + "loss": 0.6891, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.22685666382312775, + "rewards/margins": 10.923724174499512, + "rewards/real": 11.150581359863281, + "step": 1070 + }, + { + "epoch": 0.12987012987012986, + "grad_norm": 22.269697003690823, + "learning_rate": 4.834313201496525e-07, + "logits/generated": -2.3847670555114746, + "logits/real": -2.4150912761688232, + "logps/generated": -539.1434326171875, + "logps/real": -399.27154541015625, + "loss": 0.5388, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 2.7289445400238037, + "rewards/margins": 11.612767219543457, + "rewards/real": 14.341712951660156, + "step": 1080 + }, + { + "epoch": 0.13107263107263106, + "grad_norm": 130.4099114252631, + "learning_rate": 4.827632282202031e-07, + "logits/generated": -2.412956953048706, + "logits/real": -2.4364120960235596, + "logps/generated": -473.836181640625, + "logps/real": -327.67926025390625, + "loss": 0.7132, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.9893986582756042, + "rewards/margins": 13.284649848937988, + "rewards/real": 14.2740478515625, + "step": 1090 + }, + { + "epoch": 0.13227513227513227, + "grad_norm": 214.17196740614938, + "learning_rate": 4.820951362907536e-07, + "logits/generated": -2.3645195960998535, + "logits/real": -2.411306619644165, + "logps/generated": -611.4638061523438, + "logps/real": -375.39239501953125, + "loss": 0.5831, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.057896614074707, + "rewards/margins": 13.181696891784668, + "rewards/real": 11.123800277709961, + "step": 1100 + }, + { + "epoch": 0.13347763347763347, + "grad_norm": 585.7892384302444, + "learning_rate": 4.814270443613041e-07, + "logits/generated": -2.468086004257202, + "logits/real": -2.4742960929870605, + "logps/generated": -520.3275756835938, + "logps/real": -408.871337890625, + "loss": 0.6295, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": 3.207319736480713, + "rewards/margins": 12.730597496032715, + "rewards/real": 15.937917709350586, + "step": 1110 + }, + { + "epoch": 0.13468013468013468, + "grad_norm": 39.50209435074947, + "learning_rate": 4.807589524318546e-07, + "logits/generated": -2.515010356903076, + "logits/real": -2.5143516063690186, + "logps/generated": -592.912841796875, + "logps/real": -406.6743469238281, + "loss": 0.6573, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.7588653564453125, + "rewards/margins": 15.239355087280273, + "rewards/real": 13.480488777160645, + "step": 1120 + }, + { + "epoch": 0.13588263588263588, + "grad_norm": 20.336657571766736, + "learning_rate": 4.800908605024052e-07, + "logits/generated": -2.453594923019409, + "logits/real": -2.508441925048828, + "logps/generated": -562.1450805664062, + "logps/real": -344.488525390625, + "loss": 0.6144, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.14070944488048553, + "rewards/margins": 15.271081924438477, + "rewards/real": 15.130373001098633, + "step": 1130 + }, + { + "epoch": 0.1370851370851371, + "grad_norm": 78.19434756439111, + "learning_rate": 4.794227685729556e-07, + "logits/generated": -2.454686403274536, + "logits/real": -2.470715284347534, + "logps/generated": -460.20465087890625, + "logps/real": -363.54034423828125, + "loss": 0.7764, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 5.291678428649902, + "rewards/margins": 9.904863357543945, + "rewards/real": 15.196542739868164, + "step": 1140 + }, + { + "epoch": 0.1382876382876383, + "grad_norm": 247.0342336683836, + "learning_rate": 4.787546766435061e-07, + "logits/generated": -2.302990198135376, + "logits/real": -2.364873170852661, + "logps/generated": -505.3990173339844, + "logps/real": -331.7662048339844, + "loss": 0.5708, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.2684504985809326, + "rewards/margins": 13.13066291809082, + "rewards/real": 9.862212181091309, + "step": 1150 + }, + { + "epoch": 0.1394901394901395, + "grad_norm": 33.81672212166084, + "learning_rate": 4.780865847140566e-07, + "logits/generated": -2.3383965492248535, + "logits/real": -2.3328897953033447, + "logps/generated": -513.9251708984375, + "logps/real": -321.45458984375, + "loss": 0.6679, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.1338083744049072, + "rewards/margins": 12.928006172180176, + "rewards/real": 10.794198036193848, + "step": 1160 + }, + { + "epoch": 0.1406926406926407, + "grad_norm": 140.00271640988865, + "learning_rate": 4.774184927846072e-07, + "logits/generated": -2.2809154987335205, + "logits/real": -2.2834174633026123, + "logps/generated": -473.13873291015625, + "logps/real": -264.2456359863281, + "loss": 0.7149, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.4780304431915283, + "rewards/margins": 12.65080738067627, + "rewards/real": 10.17277717590332, + "step": 1170 + }, + { + "epoch": 0.1418951418951419, + "grad_norm": 144.07305793972205, + "learning_rate": 4.767504008551577e-07, + "logits/generated": -2.2845511436462402, + "logits/real": -2.341881513595581, + "logps/generated": -494.26715087890625, + "logps/real": -308.30810546875, + "loss": 0.4847, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.6808719635009766, + "rewards/margins": 14.3323335647583, + "rewards/real": 10.651460647583008, + "step": 1180 + }, + { + "epoch": 0.14309764309764308, + "grad_norm": 39.01401304181334, + "learning_rate": 4.7608230892570814e-07, + "logits/generated": -2.3511962890625, + "logits/real": -2.347599983215332, + "logps/generated": -477.8648986816406, + "logps/real": -268.33685302734375, + "loss": 0.6017, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.041712760925293, + "rewards/margins": 13.663518905639648, + "rewards/real": 8.621807098388672, + "step": 1190 + }, + { + "epoch": 0.1443001443001443, + "grad_norm": 105.7004920268303, + "learning_rate": 4.7541421699625865e-07, + "logits/generated": -2.4286952018737793, + "logits/real": -2.4655444622039795, + "logps/generated": -515.7615966796875, + "logps/real": -323.18011474609375, + "loss": 0.5312, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.503037691116333, + "rewards/margins": 17.060775756835938, + "rewards/real": 13.557737350463867, + "step": 1200 + }, + { + "epoch": 0.1455026455026455, + "grad_norm": 240.55346316701133, + "learning_rate": 4.7474612506680915e-07, + "logits/generated": -2.3684866428375244, + "logits/real": -2.4068591594696045, + "logps/generated": -432.70452880859375, + "logps/real": -303.25897216796875, + "loss": 0.5141, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.26573410630226135, + "rewards/margins": 12.277440071105957, + "rewards/real": 12.011706352233887, + "step": 1210 + }, + { + "epoch": 0.1467051467051467, + "grad_norm": 119.35160235135942, + "learning_rate": 4.7407803313735966e-07, + "logits/generated": -2.324099063873291, + "logits/real": -2.3842215538024902, + "logps/generated": -676.953857421875, + "logps/real": -430.06683349609375, + "loss": 0.5652, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.1948742866516113, + "rewards/margins": 18.48761749267578, + "rewards/real": 15.292742729187012, + "step": 1220 + }, + { + "epoch": 0.1479076479076479, + "grad_norm": 47.52295374620309, + "learning_rate": 4.7340994120791017e-07, + "logits/generated": -2.368788242340088, + "logits/real": -2.3172709941864014, + "logps/generated": -395.877197265625, + "logps/real": -179.94818115234375, + "loss": 0.5762, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.3225250244140625, + "rewards/margins": 10.550348281860352, + "rewards/real": 7.2278242111206055, + "step": 1230 + }, + { + "epoch": 0.1491101491101491, + "grad_norm": 9.593658350756252, + "learning_rate": 4.7274184927846067e-07, + "logits/generated": -2.19208025932312, + "logits/real": -2.206171989440918, + "logps/generated": -471.58355712890625, + "logps/real": -359.2139892578125, + "loss": 0.5611, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 0.07689180225133896, + "rewards/margins": 11.707413673400879, + "rewards/real": 11.78430461883545, + "step": 1240 + }, + { + "epoch": 0.15031265031265031, + "grad_norm": 635.9445131020764, + "learning_rate": 4.7207375734901123e-07, + "logits/generated": -2.27813982963562, + "logits/real": -2.3700928688049316, + "logps/generated": -531.0651245117188, + "logps/real": -286.7881164550781, + "loss": 0.957, + "rewards/accuracies": 0.875, + "rewards/generated": -1.789415955543518, + "rewards/margins": 13.39814567565918, + "rewards/real": 11.608728408813477, + "step": 1250 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 418.0975051018659, + "learning_rate": 4.7140566541956174e-07, + "logits/generated": -2.273059368133545, + "logits/real": -2.3243207931518555, + "logps/generated": -449.553466796875, + "logps/real": -281.80035400390625, + "loss": 0.5858, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.2391071766614914, + "rewards/margins": 13.100481986999512, + "rewards/real": 13.339590072631836, + "step": 1260 + }, + { + "epoch": 0.15271765271765272, + "grad_norm": 89.7497286321816, + "learning_rate": 4.7073757349011224e-07, + "logits/generated": -2.2853188514709473, + "logits/real": -2.298370361328125, + "logps/generated": -496.208251953125, + "logps/real": -326.5314025878906, + "loss": 0.5482, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.892930269241333, + "rewards/margins": 12.869303703308105, + "rewards/real": 10.976374626159668, + "step": 1270 + }, + { + "epoch": 0.15392015392015393, + "grad_norm": 142.91378486261908, + "learning_rate": 4.7006948156066275e-07, + "logits/generated": -2.383915901184082, + "logits/real": -2.3850746154785156, + "logps/generated": -463.39715576171875, + "logps/real": -257.097900390625, + "loss": 0.4108, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.065344333648682, + "rewards/margins": 15.007519721984863, + "rewards/real": 8.942174911499023, + "step": 1280 + }, + { + "epoch": 0.15512265512265513, + "grad_norm": 9.179187293755389, + "learning_rate": 4.694013896312132e-07, + "logits/generated": -2.3176121711730957, + "logits/real": -2.351137638092041, + "logps/generated": -428.81890869140625, + "logps/real": -280.2019348144531, + "loss": 0.6041, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 0.25497040152549744, + "rewards/margins": 9.969922065734863, + "rewards/real": 10.224891662597656, + "step": 1290 + }, + { + "epoch": 0.1563251563251563, + "grad_norm": 58.845435926340926, + "learning_rate": 4.687332977017637e-07, + "logits/generated": -2.3036844730377197, + "logits/real": -2.3555920124053955, + "logps/generated": -525.302978515625, + "logps/real": -343.0516052246094, + "loss": 0.5772, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.0582842826843262, + "rewards/margins": 13.692483901977539, + "rewards/real": 12.634200096130371, + "step": 1300 + }, + { + "epoch": 0.15752765752765752, + "grad_norm": 116.29820907535108, + "learning_rate": 4.680652057723142e-07, + "logits/generated": -2.2145986557006836, + "logits/real": -2.2836480140686035, + "logps/generated": -561.1383056640625, + "logps/real": -343.1160583496094, + "loss": 0.6855, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.1828441619873047, + "rewards/margins": 14.4840726852417, + "rewards/real": 11.301229476928711, + "step": 1310 + }, + { + "epoch": 0.15873015873015872, + "grad_norm": 91.38327282645358, + "learning_rate": 4.673971138428647e-07, + "logits/generated": -2.2995400428771973, + "logits/real": -2.321096181869507, + "logps/generated": -475.58349609375, + "logps/real": -314.55731201171875, + "loss": 0.4606, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.775874376296997, + "rewards/margins": 16.249086380004883, + "rewards/real": 12.473210334777832, + "step": 1320 + }, + { + "epoch": 0.15993265993265993, + "grad_norm": 17.17921537999581, + "learning_rate": 4.667290219134153e-07, + "logits/generated": -2.3252789974212646, + "logits/real": -2.320345401763916, + "logps/generated": -482.9417419433594, + "logps/real": -281.4524841308594, + "loss": 0.3806, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.5373896360397339, + "rewards/margins": 14.353311538696289, + "rewards/real": 12.81592082977295, + "step": 1330 + }, + { + "epoch": 0.16113516113516113, + "grad_norm": 11.28736641110847, + "learning_rate": 4.660609299839658e-07, + "logits/generated": -2.3551905155181885, + "logits/real": -2.399311065673828, + "logps/generated": -479.5247497558594, + "logps/real": -274.640380859375, + "loss": 0.5912, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 2.5349957942962646, + "rewards/margins": 11.847904205322266, + "rewards/real": 14.382901191711426, + "step": 1340 + }, + { + "epoch": 0.16233766233766234, + "grad_norm": 60.62685743997119, + "learning_rate": 4.653928380545163e-07, + "logits/generated": -2.457043170928955, + "logits/real": -2.4052481651306152, + "logps/generated": -495.21026611328125, + "logps/real": -284.0331115722656, + "loss": 0.7891, + "rewards/accuracies": 0.875, + "rewards/generated": 1.9937740564346313, + "rewards/margins": 11.762030601501465, + "rewards/real": 13.755805969238281, + "step": 1350 + }, + { + "epoch": 0.16354016354016354, + "grad_norm": 371.39808284910134, + "learning_rate": 4.647247461250668e-07, + "logits/generated": -2.3548593521118164, + "logits/real": -2.326577663421631, + "logps/generated": -516.9511108398438, + "logps/real": -317.8005065917969, + "loss": 0.8024, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.0962762832641602, + "rewards/margins": 13.388837814331055, + "rewards/real": 12.292562484741211, + "step": 1360 + }, + { + "epoch": 0.16474266474266475, + "grad_norm": 373.7942551502674, + "learning_rate": 4.640566541956173e-07, + "logits/generated": -2.395211696624756, + "logits/real": -2.429844856262207, + "logps/generated": -471.0773010253906, + "logps/real": -293.4932861328125, + "loss": 0.4601, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.73394775390625, + "rewards/margins": 14.701550483703613, + "rewards/real": 11.967602729797363, + "step": 1370 + }, + { + "epoch": 0.16594516594516595, + "grad_norm": 74.08532767749598, + "learning_rate": 4.633885622661678e-07, + "logits/generated": -2.399756669998169, + "logits/real": -2.3803422451019287, + "logps/generated": -574.4180297851562, + "logps/real": -323.24102783203125, + "loss": 0.6522, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.6571858525276184, + "rewards/margins": 13.190020561218262, + "rewards/real": 12.532835006713867, + "step": 1380 + }, + { + "epoch": 0.16714766714766716, + "grad_norm": 267.6385400497943, + "learning_rate": 4.6272047033671827e-07, + "logits/generated": -2.4607558250427246, + "logits/real": -2.4937150478363037, + "logps/generated": -533.2379150390625, + "logps/real": -404.29168701171875, + "loss": 0.8347, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 6.728078365325928, + "rewards/margins": 14.371792793273926, + "rewards/real": 21.099870681762695, + "step": 1390 + }, + { + "epoch": 0.16835016835016836, + "grad_norm": 308.7151767120102, + "learning_rate": 4.620523784072688e-07, + "logits/generated": -2.536503314971924, + "logits/real": -2.4976582527160645, + "logps/generated": -545.3577880859375, + "logps/real": -431.7715759277344, + "loss": 0.6409, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 3.8251757621765137, + "rewards/margins": 13.433639526367188, + "rewards/real": 17.25881576538086, + "step": 1400 + }, + { + "epoch": 0.16955266955266957, + "grad_norm": 306.179494954948, + "learning_rate": 4.6138428647781933e-07, + "logits/generated": -2.4538426399230957, + "logits/real": -2.4141111373901367, + "logps/generated": -478.7513122558594, + "logps/real": -396.04547119140625, + "loss": 0.5104, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.485698699951172, + "rewards/margins": 13.679600715637207, + "rewards/real": 16.165298461914062, + "step": 1410 + }, + { + "epoch": 0.17075517075517074, + "grad_norm": 496.13565710543406, + "learning_rate": 4.6071619454836984e-07, + "logits/generated": -2.386244773864746, + "logits/real": -2.398043632507324, + "logps/generated": -482.45526123046875, + "logps/real": -345.71978759765625, + "loss": 0.5504, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 3.6164448261260986, + "rewards/margins": 8.972063064575195, + "rewards/real": 12.588507652282715, + "step": 1420 + }, + { + "epoch": 0.17195767195767195, + "grad_norm": 7.012010018130877, + "learning_rate": 4.6004810261892035e-07, + "logits/generated": -2.3383326530456543, + "logits/real": -2.3502814769744873, + "logps/generated": -401.1824951171875, + "logps/real": -345.9866943359375, + "loss": 0.6135, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.4917720854282379, + "rewards/margins": 10.542943954467773, + "rewards/real": 11.03471565246582, + "step": 1430 + }, + { + "epoch": 0.17316017316017315, + "grad_norm": 893.0870988980764, + "learning_rate": 4.5938001068947085e-07, + "logits/generated": -2.4274826049804688, + "logits/real": -2.432913303375244, + "logps/generated": -517.9597778320312, + "logps/real": -382.17681884765625, + "loss": 0.779, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": 4.715818881988525, + "rewards/margins": 8.692477226257324, + "rewards/real": 13.408296585083008, + "step": 1440 + }, + { + "epoch": 0.17436267436267436, + "grad_norm": 428.45989043129157, + "learning_rate": 4.5871191876002136e-07, + "logits/generated": -2.4645206928253174, + "logits/real": -2.4560036659240723, + "logps/generated": -493.51397705078125, + "logps/real": -310.2508239746094, + "loss": 0.5581, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.16840839385986328, + "rewards/margins": 15.603052139282227, + "rewards/real": 15.434643745422363, + "step": 1450 + }, + { + "epoch": 0.17556517556517556, + "grad_norm": 50.85431689497447, + "learning_rate": 4.5804382683057187e-07, + "logits/generated": -2.435072660446167, + "logits/real": -2.456373929977417, + "logps/generated": -469.07269287109375, + "logps/real": -414.0575256347656, + "loss": 0.7517, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 4.06419563293457, + "rewards/margins": 12.056317329406738, + "rewards/real": 16.120512008666992, + "step": 1460 + }, + { + "epoch": 0.17676767676767677, + "grad_norm": 14.883022568834077, + "learning_rate": 4.573757349011224e-07, + "logits/generated": -2.333134889602661, + "logits/real": -2.330747604370117, + "logps/generated": -464.7652282714844, + "logps/real": -231.93667602539062, + "loss": 0.5558, + "rewards/accuracies": 0.875, + "rewards/generated": 0.14628782868385315, + "rewards/margins": 11.229342460632324, + "rewards/real": 11.375631332397461, + "step": 1470 + }, + { + "epoch": 0.17797017797017797, + "grad_norm": 196.79652566238005, + "learning_rate": 4.567076429716729e-07, + "logits/generated": -2.3200600147247314, + "logits/real": -2.396704912185669, + "logps/generated": -432.6995544433594, + "logps/real": -338.466064453125, + "loss": 0.6584, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 1.0237276554107666, + "rewards/margins": 13.672329902648926, + "rewards/real": 14.69605827331543, + "step": 1480 + }, + { + "epoch": 0.17917267917267918, + "grad_norm": 589.2611389956969, + "learning_rate": 4.5603955104222344e-07, + "logits/generated": -2.3034911155700684, + "logits/real": -2.2840018272399902, + "logps/generated": -410.0265197753906, + "logps/real": -288.2999267578125, + "loss": 0.84, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.6751577854156494, + "rewards/margins": 11.684983253479004, + "rewards/real": 11.009824752807617, + "step": 1490 + }, + { + "epoch": 0.18037518037518038, + "grad_norm": 25.104466466994783, + "learning_rate": 4.553714591127739e-07, + "logits/generated": -2.250640869140625, + "logits/real": -2.3452303409576416, + "logps/generated": -366.4850158691406, + "logps/real": -303.73175048828125, + "loss": 0.6159, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.013589382171631, + "rewards/margins": 16.16395378112793, + "rewards/real": 12.15036392211914, + "step": 1500 + }, + { + "epoch": 0.1815776815776816, + "grad_norm": 55.12333155338489, + "learning_rate": 4.547033671833244e-07, + "logits/generated": -2.3714919090270996, + "logits/real": -2.378178119659424, + "logps/generated": -513.4163818359375, + "logps/real": -332.26312255859375, + "loss": 0.6123, + "rewards/accuracies": 0.875, + "rewards/generated": 3.348322629928589, + "rewards/margins": 12.414596557617188, + "rewards/real": 15.762918472290039, + "step": 1510 + }, + { + "epoch": 0.1827801827801828, + "grad_norm": 768.6271488170355, + "learning_rate": 4.540352752538749e-07, + "logits/generated": -2.3710010051727295, + "logits/real": -2.3883090019226074, + "logps/generated": -571.8932495117188, + "logps/real": -419.20172119140625, + "loss": 0.4591, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.354491710662842, + "rewards/margins": 16.200727462768555, + "rewards/real": 18.555217742919922, + "step": 1520 + }, + { + "epoch": 0.18398268398268397, + "grad_norm": 314.7097682368379, + "learning_rate": 4.533671833244254e-07, + "logits/generated": -2.379059076309204, + "logits/real": -2.423083782196045, + "logps/generated": -562.8613891601562, + "logps/real": -310.7023620605469, + "loss": 0.5682, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 3.716447353363037, + "rewards/margins": 12.8453369140625, + "rewards/real": 16.561786651611328, + "step": 1530 + }, + { + "epoch": 0.18518518518518517, + "grad_norm": 41.10200461760403, + "learning_rate": 4.526990913949759e-07, + "logits/generated": -2.3485469818115234, + "logits/real": -2.346954822540283, + "logps/generated": -541.9116821289062, + "logps/real": -284.74383544921875, + "loss": 0.6131, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 0.33158206939697266, + "rewards/margins": 12.98466682434082, + "rewards/real": 13.316247940063477, + "step": 1540 + }, + { + "epoch": 0.18638768638768638, + "grad_norm": 497.39061404702255, + "learning_rate": 4.520309994655264e-07, + "logits/generated": -2.3814096450805664, + "logits/real": -2.3579282760620117, + "logps/generated": -413.19970703125, + "logps/real": -272.5608215332031, + "loss": 0.5324, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.7178661823272705, + "rewards/margins": 11.779719352722168, + "rewards/real": 14.497584342956543, + "step": 1550 + }, + { + "epoch": 0.18759018759018758, + "grad_norm": 429.25026650701125, + "learning_rate": 4.5136290753607693e-07, + "logits/generated": -2.4310972690582275, + "logits/real": -2.4227535724639893, + "logps/generated": -419.1544494628906, + "logps/real": -263.32415771484375, + "loss": 0.671, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": 3.222504138946533, + "rewards/margins": 11.070513725280762, + "rewards/real": 14.29301643371582, + "step": 1560 + }, + { + "epoch": 0.1887926887926888, + "grad_norm": 49.50677819796209, + "learning_rate": 4.506948156066275e-07, + "logits/generated": -2.4903347492218018, + "logits/real": -2.4775195121765137, + "logps/generated": -466.87957763671875, + "logps/real": -298.753173828125, + "loss": 0.8774, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.608281135559082, + "rewards/margins": 12.107540130615234, + "rewards/real": 14.715822219848633, + "step": 1570 + }, + { + "epoch": 0.18999518999519, + "grad_norm": 519.2203790711026, + "learning_rate": 4.50026723677178e-07, + "logits/generated": -2.4701426029205322, + "logits/real": -2.4812369346618652, + "logps/generated": -461.80181884765625, + "logps/real": -288.8653564453125, + "loss": 0.5857, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.4484314918518066, + "rewards/margins": 12.156876564025879, + "rewards/real": 14.605308532714844, + "step": 1580 + }, + { + "epoch": 0.1911976911976912, + "grad_norm": 25.36251599233081, + "learning_rate": 4.493586317477285e-07, + "logits/generated": -2.4882984161376953, + "logits/real": -2.5147907733917236, + "logps/generated": -571.2619018554688, + "logps/real": -326.4120178222656, + "loss": 0.5151, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.17659726738929749, + "rewards/margins": 16.33140754699707, + "rewards/real": 16.50800323486328, + "step": 1590 + }, + { + "epoch": 0.1924001924001924, + "grad_norm": 24.19702687293738, + "learning_rate": 4.4869053981827896e-07, + "logits/generated": -2.4557952880859375, + "logits/real": -2.5277934074401855, + "logps/generated": -498.01849365234375, + "logps/real": -229.16592407226562, + "loss": 0.4103, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.9251322746276855, + "rewards/margins": 15.357412338256836, + "rewards/real": 11.432281494140625, + "step": 1600 + }, + { + "epoch": 0.1936026936026936, + "grad_norm": 763.9680269490647, + "learning_rate": 4.4802244788882946e-07, + "logits/generated": -2.4854238033294678, + "logits/real": -2.4586923122406006, + "logps/generated": -394.8902282714844, + "logps/real": -201.8122100830078, + "loss": 0.6123, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": 2.92989444732666, + "rewards/margins": 9.340131759643555, + "rewards/real": 12.270027160644531, + "step": 1610 + }, + { + "epoch": 0.19480519480519481, + "grad_norm": 96.0981867685106, + "learning_rate": 4.4735435595937997e-07, + "logits/generated": -2.4385573863983154, + "logits/real": -2.4143004417419434, + "logps/generated": -377.39617919921875, + "logps/real": -227.68789672851562, + "loss": 0.5976, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": -0.8079854846000671, + "rewards/margins": 11.13033676147461, + "rewards/real": 10.322351455688477, + "step": 1620 + }, + { + "epoch": 0.19600769600769602, + "grad_norm": 113.30707926581165, + "learning_rate": 4.466862640299305e-07, + "logits/generated": -2.4770843982696533, + "logits/real": -2.4692885875701904, + "logps/generated": -481.06878662109375, + "logps/real": -274.21343994140625, + "loss": 0.4434, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.1712679862976074, + "rewards/margins": 11.480100631713867, + "rewards/real": 13.651369094848633, + "step": 1630 + }, + { + "epoch": 0.1972101972101972, + "grad_norm": 87.07106570030153, + "learning_rate": 4.46018172100481e-07, + "logits/generated": -2.4080023765563965, + "logits/real": -2.4215290546417236, + "logps/generated": -470.46026611328125, + "logps/real": -295.76507568359375, + "loss": 0.648, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 4.108889102935791, + "rewards/margins": 12.166716575622559, + "rewards/real": 16.275604248046875, + "step": 1640 + }, + { + "epoch": 0.1984126984126984, + "grad_norm": 41.39049774073744, + "learning_rate": 4.4535008017103154e-07, + "logits/generated": -2.455228328704834, + "logits/real": -2.4480817317962646, + "logps/generated": -411.90057373046875, + "logps/real": -256.37957763671875, + "loss": 0.6725, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.38176360726356506, + "rewards/margins": 12.624078750610352, + "rewards/real": 13.005842208862305, + "step": 1650 + }, + { + "epoch": 0.1996151996151996, + "grad_norm": 74.85093751786184, + "learning_rate": 4.4468198824158205e-07, + "logits/generated": -2.4727237224578857, + "logits/real": -2.4917914867401123, + "logps/generated": -484.3727111816406, + "logps/real": -301.02545166015625, + "loss": 0.611, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.7268470525741577, + "rewards/margins": 12.012619018554688, + "rewards/real": 13.739466667175293, + "step": 1660 + }, + { + "epoch": 0.2008177008177008, + "grad_norm": 73.04979535106028, + "learning_rate": 4.4401389631213256e-07, + "logits/generated": -2.4775261878967285, + "logits/real": -2.4395952224731445, + "logps/generated": -456.0423278808594, + "logps/real": -281.8722839355469, + "loss": 0.6198, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.6114019155502319, + "rewards/margins": 16.019010543823242, + "rewards/real": 15.407610893249512, + "step": 1670 + }, + { + "epoch": 0.20202020202020202, + "grad_norm": 74.86241887073881, + "learning_rate": 4.4334580438268306e-07, + "logits/generated": -2.4175937175750732, + "logits/real": -2.3863775730133057, + "logps/generated": -425.3651428222656, + "logps/real": -240.4586944580078, + "loss": 0.4477, + "rewards/accuracies": 1.0, + "rewards/generated": -0.2837601602077484, + "rewards/margins": 14.158716201782227, + "rewards/real": 13.874957084655762, + "step": 1680 + }, + { + "epoch": 0.20322270322270322, + "grad_norm": 572.3239272700722, + "learning_rate": 4.4267771245323357e-07, + "logits/generated": -2.423804759979248, + "logits/real": -2.4141461849212646, + "logps/generated": -392.3048095703125, + "logps/real": -300.6315002441406, + "loss": 0.583, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.03982771560549736, + "rewards/margins": 13.115669250488281, + "rewards/real": 13.155497550964355, + "step": 1690 + }, + { + "epoch": 0.20442520442520443, + "grad_norm": 228.8164207304763, + "learning_rate": 4.42009620523784e-07, + "logits/generated": -2.3679518699645996, + "logits/real": -2.4284584522247314, + "logps/generated": -534.503173828125, + "logps/real": -310.5823974609375, + "loss": 0.6521, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 2.6827309131622314, + "rewards/margins": 12.820253372192383, + "rewards/real": 15.502985000610352, + "step": 1700 + }, + { + "epoch": 0.20562770562770563, + "grad_norm": 217.21778042822632, + "learning_rate": 4.4134152859433453e-07, + "logits/generated": -2.241626739501953, + "logits/real": -2.2955174446105957, + "logps/generated": -435.6639709472656, + "logps/real": -250.5611572265625, + "loss": 0.6091, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.5427447557449341, + "rewards/margins": 12.22032642364502, + "rewards/real": 11.677579879760742, + "step": 1710 + }, + { + "epoch": 0.20683020683020684, + "grad_norm": 71.02987427794781, + "learning_rate": 4.4067343666488503e-07, + "logits/generated": -2.2914438247680664, + "logits/real": -2.3038229942321777, + "logps/generated": -504.0199279785156, + "logps/real": -340.43194580078125, + "loss": 0.4417, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.6536057591438293, + "rewards/margins": 16.69736099243164, + "rewards/real": 17.35096549987793, + "step": 1720 + }, + { + "epoch": 0.20803270803270804, + "grad_norm": 245.7445291713791, + "learning_rate": 4.400053447354356e-07, + "logits/generated": -2.354158401489258, + "logits/real": -2.3235185146331787, + "logps/generated": -542.0653076171875, + "logps/real": -333.35601806640625, + "loss": 0.4537, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.0518426895141602, + "rewards/margins": 16.70272445678711, + "rewards/real": 15.65088176727295, + "step": 1730 + }, + { + "epoch": 0.20923520923520925, + "grad_norm": 331.9096392613902, + "learning_rate": 4.393372528059861e-07, + "logits/generated": -2.260397434234619, + "logits/real": -2.321648359298706, + "logps/generated": -601.6236572265625, + "logps/real": -353.7864990234375, + "loss": 0.5179, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.30995121598243713, + "rewards/margins": 15.769620895385742, + "rewards/real": 15.459671020507812, + "step": 1740 + }, + { + "epoch": 0.21043771043771045, + "grad_norm": 21.281642905190886, + "learning_rate": 4.386691608765366e-07, + "logits/generated": -2.2410879135131836, + "logits/real": -2.2713370323181152, + "logps/generated": -468.92987060546875, + "logps/real": -259.9978332519531, + "loss": 0.4654, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.6135820150375366, + "rewards/margins": 13.631518363952637, + "rewards/real": 12.017935752868652, + "step": 1750 + }, + { + "epoch": 0.21164021164021163, + "grad_norm": 320.62046619878, + "learning_rate": 4.380010689470871e-07, + "logits/generated": -2.295548677444458, + "logits/real": -2.3581936359405518, + "logps/generated": -517.8612060546875, + "logps/real": -268.38616943359375, + "loss": 0.4489, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.6024210453033447, + "rewards/margins": 16.50004005432129, + "rewards/real": 13.897619247436523, + "step": 1760 + }, + { + "epoch": 0.21284271284271283, + "grad_norm": 233.75139428508683, + "learning_rate": 4.373329770176376e-07, + "logits/generated": -2.3198766708374023, + "logits/real": -2.3995108604431152, + "logps/generated": -511.88720703125, + "logps/real": -391.0545349121094, + "loss": 0.5041, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 5.2740559577941895, + "rewards/margins": 16.216516494750977, + "rewards/real": 21.49057388305664, + "step": 1770 + }, + { + "epoch": 0.21404521404521404, + "grad_norm": 18.52303496346386, + "learning_rate": 4.3666488508818813e-07, + "logits/generated": -2.1872031688690186, + "logits/real": -2.221298933029175, + "logps/generated": -482.6136779785156, + "logps/real": -300.1002197265625, + "loss": 1.0096, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.4327448010444641, + "rewards/margins": 12.359970092773438, + "rewards/real": 11.927223205566406, + "step": 1780 + }, + { + "epoch": 0.21524771524771524, + "grad_norm": 157.91598537412438, + "learning_rate": 4.3599679315873863e-07, + "logits/generated": -2.2624454498291016, + "logits/real": -2.293370246887207, + "logps/generated": -544.6771850585938, + "logps/real": -346.1015625, + "loss": 0.4924, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.7577619552612305, + "rewards/margins": 14.797775268554688, + "rewards/real": 15.555536270141602, + "step": 1790 + }, + { + "epoch": 0.21645021645021645, + "grad_norm": 25.268422973955733, + "learning_rate": 4.353287012292891e-07, + "logits/generated": -2.3029236793518066, + "logits/real": -2.311941146850586, + "logps/generated": -491.8525390625, + "logps/real": -256.67156982421875, + "loss": 0.4484, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.2190592288970947, + "rewards/margins": 15.085932731628418, + "rewards/real": 12.86687183380127, + "step": 1800 + }, + { + "epoch": 0.21765271765271765, + "grad_norm": 380.27991069500246, + "learning_rate": 4.3466060929983965e-07, + "logits/generated": -2.2758796215057373, + "logits/real": -2.3383190631866455, + "logps/generated": -425.18389892578125, + "logps/real": -309.69171142578125, + "loss": 0.6675, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.514988660812378, + "rewards/margins": 14.734872817993164, + "rewards/real": 17.249860763549805, + "step": 1810 + }, + { + "epoch": 0.21885521885521886, + "grad_norm": 12.522879331918343, + "learning_rate": 4.3399251737039015e-07, + "logits/generated": -2.2408108711242676, + "logits/real": -2.2868175506591797, + "logps/generated": -391.3513488769531, + "logps/real": -247.9014892578125, + "loss": 0.5494, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.6192407608032227, + "rewards/margins": 14.945060729980469, + "rewards/real": 12.325821876525879, + "step": 1820 + }, + { + "epoch": 0.22005772005772006, + "grad_norm": 430.19118319914594, + "learning_rate": 4.3332442544094066e-07, + "logits/generated": -2.343614101409912, + "logits/real": -2.356734275817871, + "logps/generated": -483.753173828125, + "logps/real": -279.4593200683594, + "loss": 0.7199, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.6338332891464233, + "rewards/margins": 14.713136672973633, + "rewards/real": 16.346969604492188, + "step": 1830 + }, + { + "epoch": 0.22126022126022127, + "grad_norm": 17.347325299062064, + "learning_rate": 4.3265633351149117e-07, + "logits/generated": -2.308915853500366, + "logits/real": -2.350360631942749, + "logps/generated": -478.40423583984375, + "logps/real": -358.20611572265625, + "loss": 0.649, + "rewards/accuracies": 1.0, + "rewards/generated": 4.020327091217041, + "rewards/margins": 16.09906578063965, + "rewards/real": 20.1193904876709, + "step": 1840 + }, + { + "epoch": 0.22246272246272247, + "grad_norm": 407.4563009923528, + "learning_rate": 4.3198824158204167e-07, + "logits/generated": -2.3142600059509277, + "logits/real": -2.300481081008911, + "logps/generated": -461.16650390625, + "logps/real": -245.1242218017578, + "loss": 0.5186, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.1521506309509277, + "rewards/margins": 14.651391983032227, + "rewards/real": 11.499241828918457, + "step": 1850 + }, + { + "epoch": 0.22366522366522368, + "grad_norm": 696.4123601790546, + "learning_rate": 4.313201496525922e-07, + "logits/generated": -2.2123773097991943, + "logits/real": -2.3065268993377686, + "logps/generated": -435.8077087402344, + "logps/real": -245.79104614257812, + "loss": 0.8668, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.2253907918930054, + "rewards/margins": 12.305874824523926, + "rewards/real": 13.53126335144043, + "step": 1860 + }, + { + "epoch": 0.22486772486772486, + "grad_norm": 79.30742150180409, + "learning_rate": 4.306520577231427e-07, + "logits/generated": -2.1671016216278076, + "logits/real": -2.1830508708953857, + "logps/generated": -525.7474365234375, + "logps/real": -286.37158203125, + "loss": 0.508, + "rewards/accuracies": 0.875, + "rewards/generated": -2.8406291007995605, + "rewards/margins": 16.074737548828125, + "rewards/real": 13.234106063842773, + "step": 1870 + }, + { + "epoch": 0.22607022607022606, + "grad_norm": 85.42810821033221, + "learning_rate": 4.299839657936932e-07, + "logits/generated": -2.221487045288086, + "logits/real": -2.212695837020874, + "logps/generated": -532.984130859375, + "logps/real": -257.0509033203125, + "loss": 0.5246, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.5567172169685364, + "rewards/margins": 14.513051986694336, + "rewards/real": 13.956336975097656, + "step": 1880 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 64.49347505838848, + "learning_rate": 4.2931587386424375e-07, + "logits/generated": -2.329141855239868, + "logits/real": -2.3000168800354004, + "logps/generated": -464.63287353515625, + "logps/real": -294.87652587890625, + "loss": 0.5232, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 4.820662021636963, + "rewards/margins": 13.737935066223145, + "rewards/real": 18.558597564697266, + "step": 1890 + }, + { + "epoch": 0.22847522847522847, + "grad_norm": 657.9420294202405, + "learning_rate": 4.286477819347942e-07, + "logits/generated": -2.2628135681152344, + "logits/real": -2.280303955078125, + "logps/generated": -361.05511474609375, + "logps/real": -195.9595947265625, + "loss": 0.9089, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.6073031425476074, + "rewards/margins": 12.64094352722168, + "rewards/real": 12.03364086151123, + "step": 1900 + }, + { + "epoch": 0.22967772967772968, + "grad_norm": 329.61710458929105, + "learning_rate": 4.279796900053447e-07, + "logits/generated": -2.2633259296417236, + "logits/real": -2.3366036415100098, + "logps/generated": -435.8863830566406, + "logps/real": -282.08587646484375, + "loss": 0.7455, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.6685137748718262, + "rewards/margins": 15.345560073852539, + "rewards/real": 16.01407241821289, + "step": 1910 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 10.871694392692595, + "learning_rate": 4.273115980758952e-07, + "logits/generated": -2.227038860321045, + "logits/real": -2.2521004676818848, + "logps/generated": -513.7752685546875, + "logps/real": -307.07830810546875, + "loss": 0.6906, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.7155650854110718, + "rewards/margins": 15.317960739135742, + "rewards/real": 17.033527374267578, + "step": 1920 + }, + { + "epoch": 0.23208273208273208, + "grad_norm": 34.03969667999832, + "learning_rate": 4.266435061464457e-07, + "logits/generated": -2.1800999641418457, + "logits/real": -2.2404844760894775, + "logps/generated": -559.51123046875, + "logps/real": -282.93316650390625, + "loss": 0.3882, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.813613176345825, + "rewards/margins": 18.474063873291016, + "rewards/real": 14.660449028015137, + "step": 1930 + }, + { + "epoch": 0.2332852332852333, + "grad_norm": 580.2052114089759, + "learning_rate": 4.2597541421699623e-07, + "logits/generated": -2.2422163486480713, + "logits/real": -2.31141996383667, + "logps/generated": -539.1412353515625, + "logps/real": -268.7283630371094, + "loss": 0.5868, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.737396240234375, + "rewards/margins": 13.86515998840332, + "rewards/real": 15.602557182312012, + "step": 1940 + }, + { + "epoch": 0.2344877344877345, + "grad_norm": 15.896789915736347, + "learning_rate": 4.2530732228754674e-07, + "logits/generated": -2.2258827686309814, + "logits/real": -2.1926958560943604, + "logps/generated": -467.52166748046875, + "logps/real": -212.1752471923828, + "loss": 0.7112, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.5906529426574707, + "rewards/margins": 13.30827808380127, + "rewards/real": 10.717625617980957, + "step": 1950 + }, + { + "epoch": 0.2356902356902357, + "grad_norm": 106.83104345911345, + "learning_rate": 4.2463923035809724e-07, + "logits/generated": -2.2099413871765137, + "logits/real": -2.250890016555786, + "logps/generated": -452.8672790527344, + "logps/real": -339.02288818359375, + "loss": 0.4297, + "rewards/accuracies": 0.875, + "rewards/generated": 5.544894695281982, + "rewards/margins": 11.915275573730469, + "rewards/real": 17.46017074584961, + "step": 1960 + }, + { + "epoch": 0.2368927368927369, + "grad_norm": 498.3981378201728, + "learning_rate": 4.239711384286478e-07, + "logits/generated": -2.246563196182251, + "logits/real": -2.2958579063415527, + "logps/generated": -578.3214111328125, + "logps/real": -343.76971435546875, + "loss": 0.5417, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 3.9790565967559814, + "rewards/margins": 16.92190170288086, + "rewards/real": 20.900959014892578, + "step": 1970 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 30.15345699954383, + "learning_rate": 4.233030464991983e-07, + "logits/generated": -2.2259361743927, + "logits/real": -2.294032573699951, + "logps/generated": -525.3324584960938, + "logps/real": -358.31097412109375, + "loss": 0.6253, + "rewards/accuracies": 1.0, + "rewards/generated": 1.466997742652893, + "rewards/margins": 17.65773582458496, + "rewards/real": 19.12473487854004, + "step": 1980 + }, + { + "epoch": 0.2392977392977393, + "grad_norm": 53.102967891588364, + "learning_rate": 4.226349545697488e-07, + "logits/generated": -2.211698055267334, + "logits/real": -2.2531678676605225, + "logps/generated": -521.4146118164062, + "logps/real": -284.21527099609375, + "loss": 0.5967, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.092949628829956, + "rewards/margins": 16.75921630859375, + "rewards/real": 14.666265487670898, + "step": 1990 + }, + { + "epoch": 0.2405002405002405, + "grad_norm": 22.240501548749336, + "learning_rate": 4.2196686264029927e-07, + "logits/generated": -2.1727306842803955, + "logits/real": -2.192847490310669, + "logps/generated": -502.5591735839844, + "logps/real": -300.94757080078125, + "loss": 0.6451, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.1510093212127686, + "rewards/margins": 16.0479679107666, + "rewards/real": 14.89695930480957, + "step": 2000 + }, + { + "epoch": 0.2405002405002405, + "eval_logits/generated": -2.2453532218933105, + "eval_logits/real": -2.289402961730957, + "eval_logps/generated": -467.9886169433594, + "eval_logps/real": -300.9576110839844, + "eval_loss": 0.4696206748485565, + "eval_rewards/accuracies": 0.961309552192688, + "eval_rewards/generated": 2.1792919635772705, + "eval_rewards/margins": 14.852046012878418, + "eval_rewards/real": 17.031339645385742, + "eval_runtime": 159.2924, + "eval_samples_per_second": 6.278, + "eval_steps_per_second": 0.527, + "step": 2000 + }, + { + "epoch": 0.2417027417027417, + "grad_norm": 498.25699198056566, + "learning_rate": 4.212987707108498e-07, + "logits/generated": -2.235767364501953, + "logits/real": -2.296876907348633, + "logps/generated": -465.62615966796875, + "logps/real": -289.1161804199219, + "loss": 0.6497, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 4.580079555511475, + "rewards/margins": 13.837247848510742, + "rewards/real": 18.417327880859375, + "step": 2010 + }, + { + "epoch": 0.2429052429052429, + "grad_norm": 108.4784891447063, + "learning_rate": 4.206306787814003e-07, + "logits/generated": -2.1960015296936035, + "logits/real": -2.2037336826324463, + "logps/generated": -411.76739501953125, + "logps/real": -199.02542114257812, + "loss": 0.4661, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.699652910232544, + "rewards/margins": 13.909741401672363, + "rewards/real": 12.210088729858398, + "step": 2020 + }, + { + "epoch": 0.2441077441077441, + "grad_norm": 20.078751268669887, + "learning_rate": 4.199625868519508e-07, + "logits/generated": -2.1672139167785645, + "logits/real": -2.26566481590271, + "logps/generated": -466.12799072265625, + "logps/real": -307.26123046875, + "loss": 0.6044, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.63902747631073, + "rewards/margins": 16.171306610107422, + "rewards/real": 16.810333251953125, + "step": 2030 + }, + { + "epoch": 0.2453102453102453, + "grad_norm": 397.60746174814227, + "learning_rate": 4.192944949225013e-07, + "logits/generated": -2.186650276184082, + "logits/real": -2.2666990756988525, + "logps/generated": -478.31884765625, + "logps/real": -299.16778564453125, + "loss": 0.6022, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.120175361633301, + "rewards/margins": 17.342470169067383, + "rewards/real": 19.462646484375, + "step": 2040 + }, + { + "epoch": 0.24651274651274652, + "grad_norm": 7.458043613749171, + "learning_rate": 4.1862640299305185e-07, + "logits/generated": -2.1063156127929688, + "logits/real": -2.2582225799560547, + "logps/generated": -453.861328125, + "logps/real": -266.7242126464844, + "loss": 0.4791, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.4714561700820923, + "rewards/margins": 16.04201889038086, + "rewards/real": 14.570562362670898, + "step": 2050 + }, + { + "epoch": 0.24771524771524772, + "grad_norm": 22.85137487751545, + "learning_rate": 4.1795831106360236e-07, + "logits/generated": -2.164400815963745, + "logits/real": -2.239060163497925, + "logps/generated": -549.2992553710938, + "logps/real": -300.09600830078125, + "loss": 0.4671, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.7945098876953125, + "rewards/margins": 19.4163761138916, + "rewards/real": 15.621866226196289, + "step": 2060 + }, + { + "epoch": 0.24891774891774893, + "grad_norm": 158.87412496144168, + "learning_rate": 4.1729021913415287e-07, + "logits/generated": -2.260401964187622, + "logits/real": -2.3259756565093994, + "logps/generated": -485.48614501953125, + "logps/real": -280.7210693359375, + "loss": 0.8281, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 4.852581977844238, + "rewards/margins": 12.515386581420898, + "rewards/real": 17.367969512939453, + "step": 2070 + }, + { + "epoch": 0.25012025012025013, + "grad_norm": 488.5382133493573, + "learning_rate": 4.1662212720470337e-07, + "logits/generated": -2.1770377159118652, + "logits/real": -2.2206311225891113, + "logps/generated": -531.7283935546875, + "logps/real": -331.6108703613281, + "loss": 0.4937, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.05897097662091255, + "rewards/margins": 16.59484100341797, + "rewards/real": 16.653812408447266, + "step": 2080 + }, + { + "epoch": 0.25132275132275134, + "grad_norm": 78.167027168952, + "learning_rate": 4.159540352752539e-07, + "logits/generated": -2.1663780212402344, + "logits/real": -2.2352824211120605, + "logps/generated": -455.36163330078125, + "logps/real": -214.59616088867188, + "loss": 0.3855, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.8598149418830872, + "rewards/margins": 14.810043334960938, + "rewards/real": 13.950228691101074, + "step": 2090 + }, + { + "epoch": 0.25252525252525254, + "grad_norm": 982.4700543733867, + "learning_rate": 4.1528594334580433e-07, + "logits/generated": -2.289685010910034, + "logits/real": -2.3377137184143066, + "logps/generated": -469.47088623046875, + "logps/real": -281.41229248046875, + "loss": 0.6261, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.8475968241691589, + "rewards/margins": 16.942211151123047, + "rewards/real": 17.78980827331543, + "step": 2100 + }, + { + "epoch": 0.25372775372775375, + "grad_norm": 306.0092872930662, + "learning_rate": 4.1461785141635484e-07, + "logits/generated": -2.2969069480895996, + "logits/real": -2.3424816131591797, + "logps/generated": -445.5846252441406, + "logps/real": -279.9046325683594, + "loss": 0.5521, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.848059892654419, + "rewards/margins": 14.768579483032227, + "rewards/real": 16.61663818359375, + "step": 2110 + }, + { + "epoch": 0.25493025493025495, + "grad_norm": 167.629245540347, + "learning_rate": 4.1394975948690535e-07, + "logits/generated": -2.26259183883667, + "logits/real": -2.282552719116211, + "logps/generated": -396.5188903808594, + "logps/real": -249.4000701904297, + "loss": 0.6949, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.0532415397465229, + "rewards/margins": 14.1654634475708, + "rewards/real": 14.112218856811523, + "step": 2120 + }, + { + "epoch": 0.25613275613275616, + "grad_norm": 27.881230266471583, + "learning_rate": 4.1328166755745585e-07, + "logits/generated": -2.260697841644287, + "logits/real": -2.3000593185424805, + "logps/generated": -442.90057373046875, + "logps/real": -238.9254913330078, + "loss": 0.4759, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.20674467086792, + "rewards/margins": 16.25245475769043, + "rewards/real": 15.045707702636719, + "step": 2130 + }, + { + "epoch": 0.25733525733525736, + "grad_norm": 404.60303723706244, + "learning_rate": 4.126135756280064e-07, + "logits/generated": -2.3268682956695557, + "logits/real": -2.3540945053100586, + "logps/generated": -486.217041015625, + "logps/real": -275.4425354003906, + "loss": 0.5313, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 1.3409477472305298, + "rewards/margins": 15.182576179504395, + "rewards/real": 16.523523330688477, + "step": 2140 + }, + { + "epoch": 0.2585377585377585, + "grad_norm": 1205.735848922581, + "learning_rate": 4.119454836985569e-07, + "logits/generated": -2.343628406524658, + "logits/real": -2.4291324615478516, + "logps/generated": -587.6064453125, + "logps/real": -310.84063720703125, + "loss": 0.9305, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.1078176498413086, + "rewards/margins": 20.106380462646484, + "rewards/real": 18.998563766479492, + "step": 2150 + }, + { + "epoch": 0.2597402597402597, + "grad_norm": 764.8068203898779, + "learning_rate": 4.112773917691074e-07, + "logits/generated": -2.2855494022369385, + "logits/real": -2.4112563133239746, + "logps/generated": -488.3350524902344, + "logps/real": -305.74322509765625, + "loss": 0.8736, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.6400399208068848, + "rewards/margins": 18.86920738220215, + "rewards/real": 21.509246826171875, + "step": 2160 + }, + { + "epoch": 0.2609427609427609, + "grad_norm": 17.96023916811693, + "learning_rate": 4.1060929983965793e-07, + "logits/generated": -2.3181021213531494, + "logits/real": -2.3859965801239014, + "logps/generated": -427.2066345214844, + "logps/real": -247.71176147460938, + "loss": 0.3734, + "rewards/accuracies": 1.0, + "rewards/generated": -2.1131749153137207, + "rewards/margins": 17.140527725219727, + "rewards/real": 15.02735424041748, + "step": 2170 + }, + { + "epoch": 0.2621452621452621, + "grad_norm": 21.958852114576167, + "learning_rate": 4.0994120791020844e-07, + "logits/generated": -2.329206943511963, + "logits/real": -2.402454376220703, + "logps/generated": -445.8583068847656, + "logps/real": -263.9056701660156, + "loss": 0.591, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.5780588388442993, + "rewards/margins": 17.21750259399414, + "rewards/real": 15.639444351196289, + "step": 2180 + }, + { + "epoch": 0.26334776334776333, + "grad_norm": 67.24513864053671, + "learning_rate": 4.0927311598075894e-07, + "logits/generated": -2.4399991035461426, + "logits/real": -2.4571475982666016, + "logps/generated": -386.81072998046875, + "logps/real": -234.23135375976562, + "loss": 0.5481, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.7285892963409424, + "rewards/margins": 12.364453315734863, + "rewards/real": 13.093042373657227, + "step": 2190 + }, + { + "epoch": 0.26455026455026454, + "grad_norm": 10.961551304341183, + "learning_rate": 4.086050240513094e-07, + "logits/generated": -2.319725513458252, + "logits/real": -2.3283464908599854, + "logps/generated": -494.9991149902344, + "logps/real": -259.45501708984375, + "loss": 0.7363, + "rewards/accuracies": 0.824999988079071, + "rewards/generated": 0.5694778561592102, + "rewards/margins": 11.519054412841797, + "rewards/real": 12.088532447814941, + "step": 2200 + }, + { + "epoch": 0.26575276575276574, + "grad_norm": 493.7000639760628, + "learning_rate": 4.079369321218599e-07, + "logits/generated": -2.3618216514587402, + "logits/real": -2.383780002593994, + "logps/generated": -494.12664794921875, + "logps/real": -261.7317810058594, + "loss": 0.7859, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.7946144342422485, + "rewards/margins": 16.871675491333008, + "rewards/real": 15.077061653137207, + "step": 2210 + }, + { + "epoch": 0.26695526695526695, + "grad_norm": 9.674714438495831, + "learning_rate": 4.0726884019241046e-07, + "logits/generated": -2.4149184226989746, + "logits/real": -2.4204447269439697, + "logps/generated": -478.0731506347656, + "logps/real": -283.4941711425781, + "loss": 0.61, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 3.0854523181915283, + "rewards/margins": 15.088220596313477, + "rewards/real": 18.173673629760742, + "step": 2220 + }, + { + "epoch": 0.26815776815776815, + "grad_norm": 164.81520791608986, + "learning_rate": 4.0660074826296097e-07, + "logits/generated": -2.35487699508667, + "logits/real": -2.3857338428497314, + "logps/generated": -493.53387451171875, + "logps/real": -264.8500671386719, + "loss": 0.6175, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.235954999923706, + "rewards/margins": 17.718080520629883, + "rewards/real": 16.48212242126465, + "step": 2230 + }, + { + "epoch": 0.26936026936026936, + "grad_norm": 29.11290451077836, + "learning_rate": 4.059326563335115e-07, + "logits/generated": -2.385585308074951, + "logits/real": -2.386427879333496, + "logps/generated": -455.605224609375, + "logps/real": -305.58050537109375, + "loss": 0.5898, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.353062391281128, + "rewards/margins": 17.456607818603516, + "rewards/real": 18.80967140197754, + "step": 2240 + }, + { + "epoch": 0.27056277056277056, + "grad_norm": 219.00150654471975, + "learning_rate": 4.05264564404062e-07, + "logits/generated": -2.370087146759033, + "logits/real": -2.320514440536499, + "logps/generated": -458.58367919921875, + "logps/real": -232.2949676513672, + "loss": 0.7588, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.4099538326263428, + "rewards/margins": 13.649721145629883, + "rewards/real": 15.059674263000488, + "step": 2250 + }, + { + "epoch": 0.27176527176527177, + "grad_norm": 10.529133930047273, + "learning_rate": 4.045964724746125e-07, + "logits/generated": -2.3615708351135254, + "logits/real": -2.385770559310913, + "logps/generated": -643.0159912109375, + "logps/real": -347.21728515625, + "loss": 0.5589, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 2.890958786010742, + "rewards/margins": 20.825693130493164, + "rewards/real": 23.716650009155273, + "step": 2260 + }, + { + "epoch": 0.27296777296777297, + "grad_norm": 150.54104577841935, + "learning_rate": 4.03928380545163e-07, + "logits/generated": -2.192412853240967, + "logits/real": -2.2333662509918213, + "logps/generated": -463.63201904296875, + "logps/real": -251.89990234375, + "loss": 0.6744, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.015620231628418, + "rewards/margins": 16.744342803955078, + "rewards/real": 14.728724479675293, + "step": 2270 + }, + { + "epoch": 0.2741702741702742, + "grad_norm": 566.8994714944832, + "learning_rate": 4.032602886157135e-07, + "logits/generated": -2.1026272773742676, + "logits/real": -2.1895911693573, + "logps/generated": -478.6298828125, + "logps/real": -265.9862365722656, + "loss": 0.5446, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.028159642592072487, + "rewards/margins": 14.760887145996094, + "rewards/real": 14.73272705078125, + "step": 2280 + }, + { + "epoch": 0.2753727753727754, + "grad_norm": 10.64695627907612, + "learning_rate": 4.02592196686264e-07, + "logits/generated": -2.061657428741455, + "logits/real": -2.1556432247161865, + "logps/generated": -545.6156616210938, + "logps/real": -286.1421813964844, + "loss": 0.6071, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.486213684082031, + "rewards/margins": 22.017555236816406, + "rewards/real": 16.531341552734375, + "step": 2290 + }, + { + "epoch": 0.2765752765752766, + "grad_norm": 612.2821400497011, + "learning_rate": 4.0192410475681457e-07, + "logits/generated": -2.1123647689819336, + "logits/real": -2.19512677192688, + "logps/generated": -528.1289672851562, + "logps/real": -288.802001953125, + "loss": 0.9276, + "rewards/accuracies": 0.875, + "rewards/generated": 1.6563570499420166, + "rewards/margins": 15.107965469360352, + "rewards/real": 16.76432228088379, + "step": 2300 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 7.313056170151692, + "learning_rate": 4.01256012827365e-07, + "logits/generated": -2.117569923400879, + "logits/real": -2.1547975540161133, + "logps/generated": -495.6553649902344, + "logps/real": -248.65133666992188, + "loss": 0.3803, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.843641757965088, + "rewards/margins": 19.134342193603516, + "rewards/real": 15.290698051452637, + "step": 2310 + }, + { + "epoch": 0.278980278980279, + "grad_norm": 7.7175479001943526, + "learning_rate": 4.0058792089791553e-07, + "logits/generated": -2.1462583541870117, + "logits/real": -2.2426555156707764, + "logps/generated": -585.8822631835938, + "logps/real": -360.5639953613281, + "loss": 0.7028, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 5.030735969543457, + "rewards/margins": 17.73818588256836, + "rewards/real": 22.768924713134766, + "step": 2320 + }, + { + "epoch": 0.2801827801827802, + "grad_norm": 227.33141818933976, + "learning_rate": 3.9991982896846603e-07, + "logits/generated": -2.140292167663574, + "logits/real": -2.1454365253448486, + "logps/generated": -486.5482482910156, + "logps/real": -244.6797332763672, + "loss": 0.5421, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.1532176733016968, + "rewards/margins": 17.947900772094727, + "rewards/real": 16.794681549072266, + "step": 2330 + }, + { + "epoch": 0.2813852813852814, + "grad_norm": 228.19243583306417, + "learning_rate": 3.9925173703901654e-07, + "logits/generated": -2.126603603363037, + "logits/real": -2.230055332183838, + "logps/generated": -532.1044921875, + "logps/real": -248.2303009033203, + "loss": 0.456, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.700369834899902, + "rewards/margins": 19.924604415893555, + "rewards/real": 13.224233627319336, + "step": 2340 + }, + { + "epoch": 0.2825877825877826, + "grad_norm": 102.87004216018481, + "learning_rate": 3.9858364510956705e-07, + "logits/generated": -2.1835708618164062, + "logits/real": -2.2770071029663086, + "logps/generated": -556.3807373046875, + "logps/real": -299.39337158203125, + "loss": 0.5245, + "rewards/accuracies": 0.875, + "rewards/generated": 1.4072721004486084, + "rewards/margins": 18.639461517333984, + "rewards/real": 20.046733856201172, + "step": 2350 + }, + { + "epoch": 0.2837902837902838, + "grad_norm": 52.076633582830944, + "learning_rate": 3.9791555318011755e-07, + "logits/generated": -2.0553860664367676, + "logits/real": -2.3055174350738525, + "logps/generated": -595.501220703125, + "logps/real": -365.21728515625, + "loss": 0.4418, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.978679895401001, + "rewards/margins": 21.45098876953125, + "rewards/real": 19.472309112548828, + "step": 2360 + }, + { + "epoch": 0.284992784992785, + "grad_norm": 43.265739821635215, + "learning_rate": 3.9724746125066806e-07, + "logits/generated": -2.211268901824951, + "logits/real": -2.2506141662597656, + "logps/generated": -569.6729736328125, + "logps/real": -267.82177734375, + "loss": 0.6833, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -3.011526584625244, + "rewards/margins": 20.3045711517334, + "rewards/real": 17.293045043945312, + "step": 2370 + }, + { + "epoch": 0.28619528619528617, + "grad_norm": 5.432566620710611, + "learning_rate": 3.965793693212186e-07, + "logits/generated": -2.2058141231536865, + "logits/real": -2.2846221923828125, + "logps/generated": -429.84002685546875, + "logps/real": -262.2298278808594, + "loss": 0.5481, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.326233386993408, + "rewards/margins": 13.523542404174805, + "rewards/real": 15.849777221679688, + "step": 2380 + }, + { + "epoch": 0.2873977873977874, + "grad_norm": 4.096190479648606, + "learning_rate": 3.959112773917691e-07, + "logits/generated": -2.231595039367676, + "logits/real": -2.326998233795166, + "logps/generated": -464.07147216796875, + "logps/real": -293.7169189453125, + "loss": 0.5556, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.1314868927001953, + "rewards/margins": 20.272600173950195, + "rewards/real": 20.141111373901367, + "step": 2390 + }, + { + "epoch": 0.2886002886002886, + "grad_norm": 671.8046709052743, + "learning_rate": 3.9524318546231963e-07, + "logits/generated": -2.17610764503479, + "logits/real": -2.2699453830718994, + "logps/generated": -587.4531860351562, + "logps/real": -281.1423645019531, + "loss": 0.8125, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.4140739440917969, + "rewards/margins": 19.855026245117188, + "rewards/real": 21.269100189208984, + "step": 2400 + }, + { + "epoch": 0.2898027898027898, + "grad_norm": 45.617477855374005, + "learning_rate": 3.945750935328701e-07, + "logits/generated": -2.201483726501465, + "logits/real": -2.280503988265991, + "logps/generated": -544.9918823242188, + "logps/real": -300.0970764160156, + "loss": 0.3515, + "rewards/accuracies": 1.0, + "rewards/generated": 2.2972793579101562, + "rewards/margins": 18.853200912475586, + "rewards/real": 21.150482177734375, + "step": 2410 + }, + { + "epoch": 0.291005291005291, + "grad_norm": 201.9185668294254, + "learning_rate": 3.939070016034206e-07, + "logits/generated": -2.1289939880371094, + "logits/real": -2.170576572418213, + "logps/generated": -533.7762451171875, + "logps/real": -260.8277587890625, + "loss": 0.5091, + "rewards/accuracies": 0.875, + "rewards/generated": -0.015506362542510033, + "rewards/margins": 14.969365119934082, + "rewards/real": 14.953857421875, + "step": 2420 + }, + { + "epoch": 0.2922077922077922, + "grad_norm": 12.401245180803466, + "learning_rate": 3.932389096739711e-07, + "logits/generated": -2.1283884048461914, + "logits/real": -2.213829755783081, + "logps/generated": -542.0123291015625, + "logps/real": -326.3026428222656, + "loss": 0.3686, + "rewards/accuracies": 1.0, + "rewards/generated": -1.4629404544830322, + "rewards/margins": 21.855056762695312, + "rewards/real": 20.39211654663086, + "step": 2430 + }, + { + "epoch": 0.2934102934102934, + "grad_norm": 469.35001799084023, + "learning_rate": 3.925708177445216e-07, + "logits/generated": -2.1530420780181885, + "logits/real": -2.2408835887908936, + "logps/generated": -553.5878295898438, + "logps/real": -266.2428283691406, + "loss": 0.6252, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.5921292304992676, + "rewards/margins": 17.0020751953125, + "rewards/real": 18.594202041625977, + "step": 2440 + }, + { + "epoch": 0.2946127946127946, + "grad_norm": 377.2950116445638, + "learning_rate": 3.919027258150721e-07, + "logits/generated": -2.202223300933838, + "logits/real": -2.2388205528259277, + "logps/generated": -468.335693359375, + "logps/real": -286.7129211425781, + "loss": 1.0598, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 6.400108337402344, + "rewards/margins": 11.34289836883545, + "rewards/real": 17.74300765991211, + "step": 2450 + }, + { + "epoch": 0.2958152958152958, + "grad_norm": 7.359700932395771, + "learning_rate": 3.9123463388562267e-07, + "logits/generated": -2.1311264038085938, + "logits/real": -2.2204508781433105, + "logps/generated": -551.81640625, + "logps/real": -360.0440368652344, + "loss": 0.7343, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.7086261510849, + "rewards/margins": 20.443758010864258, + "rewards/real": 22.15238380432129, + "step": 2460 + }, + { + "epoch": 0.297017797017797, + "grad_norm": 10.964991581489983, + "learning_rate": 3.905665419561732e-07, + "logits/generated": -1.9595201015472412, + "logits/real": -2.1721019744873047, + "logps/generated": -380.84161376953125, + "logps/real": -230.2252960205078, + "loss": 0.3808, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.458247184753418, + "rewards/margins": 17.87284278869629, + "rewards/real": 13.414594650268555, + "step": 2470 + }, + { + "epoch": 0.2982202982202982, + "grad_norm": 57.25792791041731, + "learning_rate": 3.898984500267237e-07, + "logits/generated": -2.2138562202453613, + "logits/real": -2.1700243949890137, + "logps/generated": -398.2755126953125, + "logps/real": -151.51303100585938, + "loss": 0.5028, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.947381496429443, + "rewards/margins": 14.118861198425293, + "rewards/real": 9.171480178833008, + "step": 2480 + }, + { + "epoch": 0.2994227994227994, + "grad_norm": 5.578683663434324, + "learning_rate": 3.892303580972742e-07, + "logits/generated": -2.1187691688537598, + "logits/real": -2.168670654296875, + "logps/generated": -448.775146484375, + "logps/real": -240.85302734375, + "loss": 0.4702, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.1800984889268875, + "rewards/margins": 16.530004501342773, + "rewards/real": 16.34990882873535, + "step": 2490 + }, + { + "epoch": 0.30062530062530063, + "grad_norm": 184.652793423236, + "learning_rate": 3.885622661678247e-07, + "logits/generated": -2.1732075214385986, + "logits/real": -2.1834158897399902, + "logps/generated": -470.95208740234375, + "logps/real": -230.28964233398438, + "loss": 0.5405, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.23640525341033936, + "rewards/margins": 14.332746505737305, + "rewards/real": 14.096341133117676, + "step": 2500 + }, + { + "epoch": 0.30182780182780183, + "grad_norm": 271.53536021611694, + "learning_rate": 3.8789417423837515e-07, + "logits/generated": -2.0926122665405273, + "logits/real": -2.2409567832946777, + "logps/generated": -471.1053161621094, + "logps/real": -273.2291564941406, + "loss": 0.4105, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.34327155351638794, + "rewards/margins": 17.26302719116211, + "rewards/real": 17.606298446655273, + "step": 2510 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 20.766863064045708, + "learning_rate": 3.8722608230892566e-07, + "logits/generated": -2.234311580657959, + "logits/real": -2.247994899749756, + "logps/generated": -408.84271240234375, + "logps/real": -258.6989440917969, + "loss": 0.7622, + "rewards/accuracies": 0.875, + "rewards/generated": 4.932229518890381, + "rewards/margins": 13.865893363952637, + "rewards/real": 18.79812240600586, + "step": 2520 + }, + { + "epoch": 0.30423280423280424, + "grad_norm": 99.42766781589214, + "learning_rate": 3.8655799037947616e-07, + "logits/generated": -2.18027925491333, + "logits/real": -2.226388931274414, + "logps/generated": -498.34967041015625, + "logps/real": -246.7676544189453, + "loss": 0.5463, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.9954167604446411, + "rewards/margins": 16.997676849365234, + "rewards/real": 16.00226402282715, + "step": 2530 + }, + { + "epoch": 0.30543530543530545, + "grad_norm": 350.754875419416, + "learning_rate": 3.858898984500267e-07, + "logits/generated": -2.1765637397766113, + "logits/real": -2.2366700172424316, + "logps/generated": -606.4658813476562, + "logps/real": -316.66510009765625, + "loss": 0.5861, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.058906078338623, + "rewards/margins": 23.236309051513672, + "rewards/real": 21.17740249633789, + "step": 2540 + }, + { + "epoch": 0.30663780663780665, + "grad_norm": 1191.7895875915897, + "learning_rate": 3.8522180652057723e-07, + "logits/generated": -2.1629891395568848, + "logits/real": -2.2447993755340576, + "logps/generated": -498.3642578125, + "logps/real": -253.2069091796875, + "loss": 0.6364, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.32391619682312, + "rewards/margins": 17.841588973999023, + "rewards/real": 15.517674446105957, + "step": 2550 + }, + { + "epoch": 0.30784030784030786, + "grad_norm": 490.4584889577419, + "learning_rate": 3.8455371459112774e-07, + "logits/generated": -2.1314988136291504, + "logits/real": -2.1876564025878906, + "logps/generated": -520.8911743164062, + "logps/real": -293.00384521484375, + "loss": 0.6943, + "rewards/accuracies": 0.875, + "rewards/generated": -1.5374441146850586, + "rewards/margins": 14.27092456817627, + "rewards/real": 12.733478546142578, + "step": 2560 + }, + { + "epoch": 0.30904280904280906, + "grad_norm": 403.6397986457478, + "learning_rate": 3.8388562266167824e-07, + "logits/generated": -2.19177508354187, + "logits/real": -2.3153634071350098, + "logps/generated": -577.9305419921875, + "logps/real": -342.81182861328125, + "loss": 0.6109, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.02606072463095188, + "rewards/margins": 22.08085060119629, + "rewards/real": 22.054790496826172, + "step": 2570 + }, + { + "epoch": 0.31024531024531027, + "grad_norm": 14.546934589605003, + "learning_rate": 3.8321753073222875e-07, + "logits/generated": -2.313136577606201, + "logits/real": -2.2953858375549316, + "logps/generated": -461.09619140625, + "logps/real": -271.5863342285156, + "loss": 0.4425, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.7134145498275757, + "rewards/margins": 15.884051322937012, + "rewards/real": 16.597463607788086, + "step": 2580 + }, + { + "epoch": 0.3114478114478115, + "grad_norm": 195.04118296937162, + "learning_rate": 3.8254943880277925e-07, + "logits/generated": -2.257913827896118, + "logits/real": -2.2643375396728516, + "logps/generated": -447.4485778808594, + "logps/real": -231.215576171875, + "loss": 0.5602, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.1901184618473053, + "rewards/margins": 15.335597038269043, + "rewards/real": 15.145477294921875, + "step": 2590 + }, + { + "epoch": 0.3126503126503126, + "grad_norm": 122.41843973159715, + "learning_rate": 3.8188134687332976e-07, + "logits/generated": -2.2918972969055176, + "logits/real": -2.2709531784057617, + "logps/generated": -514.5028076171875, + "logps/real": -259.34429931640625, + "loss": 0.7289, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.576934337615967, + "rewards/margins": 13.527132987976074, + "rewards/real": 16.10406494140625, + "step": 2600 + }, + { + "epoch": 0.31385281385281383, + "grad_norm": 42.42941538698551, + "learning_rate": 3.812132549438802e-07, + "logits/generated": -2.2534289360046387, + "logits/real": -2.339850902557373, + "logps/generated": -518.5089721679688, + "logps/real": -317.2063903808594, + "loss": 0.7398, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 5.504884243011475, + "rewards/margins": 13.643224716186523, + "rewards/real": 19.148107528686523, + "step": 2610 + }, + { + "epoch": 0.31505531505531503, + "grad_norm": 66.50875144317763, + "learning_rate": 3.805451630144308e-07, + "logits/generated": -2.2641537189483643, + "logits/real": -2.3424103260040283, + "logps/generated": -462.82977294921875, + "logps/real": -228.4934539794922, + "loss": 0.5007, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.8506364822387695, + "rewards/margins": 18.461023330688477, + "rewards/real": 15.610386848449707, + "step": 2620 + }, + { + "epoch": 0.31625781625781624, + "grad_norm": 121.3224524860916, + "learning_rate": 3.798770710849813e-07, + "logits/generated": -2.261126756668091, + "logits/real": -2.3151824474334717, + "logps/generated": -564.8231811523438, + "logps/real": -308.75213623046875, + "loss": 0.5121, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.0477707386016846, + "rewards/margins": 19.658065795898438, + "rewards/real": 17.610294342041016, + "step": 2630 + }, + { + "epoch": 0.31746031746031744, + "grad_norm": 286.4977922209483, + "learning_rate": 3.792089791555318e-07, + "logits/generated": -2.219849109649658, + "logits/real": -2.258054494857788, + "logps/generated": -412.97625732421875, + "logps/real": -201.11192321777344, + "loss": 0.5705, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -3.6303048133850098, + "rewards/margins": 14.927179336547852, + "rewards/real": 11.296873092651367, + "step": 2640 + }, + { + "epoch": 0.31866281866281865, + "grad_norm": 27.11392858172599, + "learning_rate": 3.785408872260823e-07, + "logits/generated": -2.2176125049591064, + "logits/real": -2.3149025440216064, + "logps/generated": -447.37939453125, + "logps/real": -258.38006591796875, + "loss": 0.4621, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.2688522338867188, + "rewards/margins": 17.308168411254883, + "rewards/real": 15.039314270019531, + "step": 2650 + }, + { + "epoch": 0.31986531986531985, + "grad_norm": 28.25488178532129, + "learning_rate": 3.778727952966328e-07, + "logits/generated": -2.250487804412842, + "logits/real": -2.281437397003174, + "logps/generated": -509.69677734375, + "logps/real": -250.135009765625, + "loss": 0.544, + "rewards/accuracies": 1.0, + "rewards/generated": -1.2124731540679932, + "rewards/margins": 18.824953079223633, + "rewards/real": 17.61248016357422, + "step": 2660 + }, + { + "epoch": 0.32106782106782106, + "grad_norm": 8.3833092801713, + "learning_rate": 3.772047033671833e-07, + "logits/generated": -2.2615928649902344, + "logits/real": -2.317018508911133, + "logps/generated": -514.9195556640625, + "logps/real": -277.6170959472656, + "loss": 0.6451, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.9076513051986694, + "rewards/margins": 19.119131088256836, + "rewards/real": 18.211477279663086, + "step": 2670 + }, + { + "epoch": 0.32227032227032226, + "grad_norm": 399.2622493885436, + "learning_rate": 3.765366114377338e-07, + "logits/generated": -2.310234785079956, + "logits/real": -2.391126871109009, + "logps/generated": -455.75616455078125, + "logps/real": -275.1095275878906, + "loss": 0.6719, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.8847066164016724, + "rewards/margins": 18.197498321533203, + "rewards/real": 17.312789916992188, + "step": 2680 + }, + { + "epoch": 0.32347282347282347, + "grad_norm": 134.0020971242362, + "learning_rate": 3.758685195082843e-07, + "logits/generated": -2.3624823093414307, + "logits/real": -2.3429665565490723, + "logps/generated": -418.9403381347656, + "logps/real": -284.3204650878906, + "loss": 0.8075, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.12245519459247589, + "rewards/margins": 15.674652099609375, + "rewards/real": 15.552197456359863, + "step": 2690 + }, + { + "epoch": 0.3246753246753247, + "grad_norm": 213.3160271893409, + "learning_rate": 3.752004275788349e-07, + "logits/generated": -2.3580338954925537, + "logits/real": -2.403287172317505, + "logps/generated": -536.4251098632812, + "logps/real": -341.7716064453125, + "loss": 0.4892, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.7240122556686401, + "rewards/margins": 21.984424591064453, + "rewards/real": 22.708433151245117, + "step": 2700 + }, + { + "epoch": 0.3258778258778259, + "grad_norm": 21.235970633452414, + "learning_rate": 3.7453233564938533e-07, + "logits/generated": -2.3828284740448, + "logits/real": -2.3824453353881836, + "logps/generated": -540.6604614257812, + "logps/real": -266.5406188964844, + "loss": 0.4202, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.8464826345443726, + "rewards/margins": 16.177722930908203, + "rewards/real": 18.024206161499023, + "step": 2710 + }, + { + "epoch": 0.3270803270803271, + "grad_norm": 5.13194445661882, + "learning_rate": 3.7386424371993584e-07, + "logits/generated": -2.217475414276123, + "logits/real": -2.157270908355713, + "logps/generated": -496.35400390625, + "logps/real": -193.39320373535156, + "loss": 0.4943, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.373010635375977, + "rewards/margins": 19.458637237548828, + "rewards/real": 11.085625648498535, + "step": 2720 + }, + { + "epoch": 0.3282828282828283, + "grad_norm": 190.2068058722336, + "learning_rate": 3.7319615179048635e-07, + "logits/generated": -2.3180150985717773, + "logits/real": -2.3551723957061768, + "logps/generated": -534.9052124023438, + "logps/real": -277.12054443359375, + "loss": 0.5609, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.518371343612671, + "rewards/margins": 19.505908966064453, + "rewards/real": 21.024280548095703, + "step": 2730 + }, + { + "epoch": 0.3294853294853295, + "grad_norm": 74.78807710626396, + "learning_rate": 3.7252805986103685e-07, + "logits/generated": -2.2545883655548096, + "logits/real": -2.2242777347564697, + "logps/generated": -422.09429931640625, + "logps/real": -195.44094848632812, + "loss": 0.3884, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.2318847179412842, + "rewards/margins": 16.44162940979004, + "rewards/real": 15.209744453430176, + "step": 2740 + }, + { + "epoch": 0.3306878306878307, + "grad_norm": 8.74676748820064, + "learning_rate": 3.7185996793158736e-07, + "logits/generated": -2.246854066848755, + "logits/real": -2.326922655105591, + "logps/generated": -534.0845947265625, + "logps/real": -248.7612762451172, + "loss": 0.3652, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.486065626144409, + "rewards/margins": 18.387165069580078, + "rewards/real": 14.901100158691406, + "step": 2750 + }, + { + "epoch": 0.3318903318903319, + "grad_norm": 138.41521218871162, + "learning_rate": 3.7119187600213786e-07, + "logits/generated": -2.213275671005249, + "logits/real": -2.2868189811706543, + "logps/generated": -475.4173278808594, + "logps/real": -214.8863067626953, + "loss": 0.4147, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.996474504470825, + "rewards/margins": 18.990652084350586, + "rewards/real": 15.994176864624023, + "step": 2760 + }, + { + "epoch": 0.3330928330928331, + "grad_norm": 112.81032486998156, + "learning_rate": 3.7052378407268837e-07, + "logits/generated": -2.3418517112731934, + "logits/real": -2.3174288272857666, + "logps/generated": -459.78277587890625, + "logps/real": -346.34503173828125, + "loss": 0.5185, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 3.563931703567505, + "rewards/margins": 16.707632064819336, + "rewards/real": 20.271564483642578, + "step": 2770 + }, + { + "epoch": 0.3342953342953343, + "grad_norm": 6.766426332696746, + "learning_rate": 3.6985569214323893e-07, + "logits/generated": -2.3402276039123535, + "logits/real": -2.3462252616882324, + "logps/generated": -412.19647216796875, + "logps/real": -202.32833862304688, + "loss": 0.401, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.25340956449508667, + "rewards/margins": 14.352015495300293, + "rewards/real": 14.098605155944824, + "step": 2780 + }, + { + "epoch": 0.3354978354978355, + "grad_norm": 511.5833964031441, + "learning_rate": 3.6918760021378944e-07, + "logits/generated": -2.279498815536499, + "logits/real": -2.3270673751831055, + "logps/generated": -442.42388916015625, + "logps/real": -239.3588409423828, + "loss": 0.4917, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 3.2044434547424316, + "rewards/margins": 14.670681953430176, + "rewards/real": 17.875125885009766, + "step": 2790 + }, + { + "epoch": 0.3367003367003367, + "grad_norm": 103.30030241970397, + "learning_rate": 3.6851950828433994e-07, + "logits/generated": -2.2594730854034424, + "logits/real": -2.360800266265869, + "logps/generated": -434.021728515625, + "logps/real": -238.7564239501953, + "loss": 0.9014, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 0.7155309915542603, + "rewards/margins": 13.3612642288208, + "rewards/real": 14.07679557800293, + "step": 2800 + }, + { + "epoch": 0.3379028379028379, + "grad_norm": 426.4962354296236, + "learning_rate": 3.678514163548904e-07, + "logits/generated": -2.308851718902588, + "logits/real": -2.311568260192871, + "logps/generated": -421.3448181152344, + "logps/real": -218.58200073242188, + "loss": 0.7486, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.2538965344429016, + "rewards/margins": 15.328129768371582, + "rewards/real": 15.074234008789062, + "step": 2810 + }, + { + "epoch": 0.33910533910533913, + "grad_norm": 5.335246883158649, + "learning_rate": 3.671833244254409e-07, + "logits/generated": -2.271623134613037, + "logits/real": -2.2808804512023926, + "logps/generated": -475.78912353515625, + "logps/real": -257.80328369140625, + "loss": 0.4559, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.2881863117218018, + "rewards/margins": 17.663087844848633, + "rewards/real": 15.374898910522461, + "step": 2820 + }, + { + "epoch": 0.3403078403078403, + "grad_norm": 373.64227720589076, + "learning_rate": 3.665152324959914e-07, + "logits/generated": -2.30668044090271, + "logits/real": -2.340928316116333, + "logps/generated": -518.8795166015625, + "logps/real": -283.543701171875, + "loss": 0.6693, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.5370599627494812, + "rewards/margins": 19.455291748046875, + "rewards/real": 18.918231964111328, + "step": 2830 + }, + { + "epoch": 0.3415103415103415, + "grad_norm": 521.2894272426375, + "learning_rate": 3.658471405665419e-07, + "logits/generated": -2.273470401763916, + "logits/real": -2.3501219749450684, + "logps/generated": -533.2444458007812, + "logps/real": -338.16253662109375, + "loss": 0.6353, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.6802558898925781, + "rewards/margins": 21.118061065673828, + "rewards/real": 20.43780517578125, + "step": 2840 + }, + { + "epoch": 0.3427128427128427, + "grad_norm": 90.07582210277268, + "learning_rate": 3.651790486370924e-07, + "logits/generated": -2.1684927940368652, + "logits/real": -2.295745372772217, + "logps/generated": -419.84173583984375, + "logps/real": -246.21926879882812, + "loss": 0.4569, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.7380359172821045, + "rewards/margins": 16.973033905029297, + "rewards/real": 14.235000610351562, + "step": 2850 + }, + { + "epoch": 0.3439153439153439, + "grad_norm": 302.42665547984643, + "learning_rate": 3.64510956707643e-07, + "logits/generated": -2.2570419311523438, + "logits/real": -2.329324245452881, + "logps/generated": -430.212890625, + "logps/real": -216.98428344726562, + "loss": 0.6666, + "rewards/accuracies": 1.0, + "rewards/generated": -1.2797664403915405, + "rewards/margins": 16.35479736328125, + "rewards/real": 15.075032234191895, + "step": 2860 + }, + { + "epoch": 0.3451178451178451, + "grad_norm": 16.668042765363705, + "learning_rate": 3.638428647781935e-07, + "logits/generated": -2.326631784439087, + "logits/real": -2.373183012008667, + "logps/generated": -486.47064208984375, + "logps/real": -265.8896484375, + "loss": 0.4662, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 1.3753122091293335, + "rewards/margins": 18.226367950439453, + "rewards/real": 19.601680755615234, + "step": 2870 + }, + { + "epoch": 0.3463203463203463, + "grad_norm": 7.364150510742759, + "learning_rate": 3.63174772848744e-07, + "logits/generated": -2.1694772243499756, + "logits/real": -2.2114944458007812, + "logps/generated": -531.44677734375, + "logps/real": -232.7603759765625, + "loss": 0.4505, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.588564395904541, + "rewards/margins": 22.947429656982422, + "rewards/real": 15.358866691589355, + "step": 2880 + }, + { + "epoch": 0.3475228475228475, + "grad_norm": 11.610271194094071, + "learning_rate": 3.625066809192945e-07, + "logits/generated": -2.179028034210205, + "logits/real": -2.2378478050231934, + "logps/generated": -528.3477172851562, + "logps/real": -231.15243530273438, + "loss": 0.3434, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.327719688415527, + "rewards/margins": 22.12078857421875, + "rewards/real": 15.793066024780273, + "step": 2890 + }, + { + "epoch": 0.3487253487253487, + "grad_norm": 37.74433267090721, + "learning_rate": 3.61838588989845e-07, + "logits/generated": -2.1644222736358643, + "logits/real": -2.2317419052124023, + "logps/generated": -469.208251953125, + "logps/real": -276.16094970703125, + "loss": 0.8129, + "rewards/accuracies": 1.0, + "rewards/generated": -3.7847771644592285, + "rewards/margins": 21.81374740600586, + "rewards/real": 18.028968811035156, + "step": 2900 + }, + { + "epoch": 0.3499278499278499, + "grad_norm": 19.454792292634934, + "learning_rate": 3.6117049706039546e-07, + "logits/generated": -2.2520663738250732, + "logits/real": -2.3081467151641846, + "logps/generated": -530.2596435546875, + "logps/real": -294.5622863769531, + "loss": 0.4308, + "rewards/accuracies": 1.0, + "rewards/generated": -0.6475614309310913, + "rewards/margins": 22.570125579833984, + "rewards/real": 21.922565460205078, + "step": 2910 + }, + { + "epoch": 0.3511303511303511, + "grad_norm": 20.555559351118088, + "learning_rate": 3.6050240513094597e-07, + "logits/generated": -2.2065482139587402, + "logits/real": -2.285953998565674, + "logps/generated": -590.97265625, + "logps/real": -328.4667053222656, + "loss": 0.56, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 2.960893154144287, + "rewards/margins": 20.918859481811523, + "rewards/real": 23.8797550201416, + "step": 2920 + }, + { + "epoch": 0.35233285233285233, + "grad_norm": 407.4218369458025, + "learning_rate": 3.598343132014965e-07, + "logits/generated": -2.1608686447143555, + "logits/real": -2.2716238498687744, + "logps/generated": -632.4462890625, + "logps/real": -391.6826477050781, + "loss": 1.1456, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.0389041900634766, + "rewards/margins": 20.525333404541016, + "rewards/real": 18.486431121826172, + "step": 2930 + }, + { + "epoch": 0.35353535353535354, + "grad_norm": 16.446471253966664, + "learning_rate": 3.5916622127204703e-07, + "logits/generated": -2.1633334159851074, + "logits/real": -2.236440420150757, + "logps/generated": -626.8197631835938, + "logps/real": -301.2580871582031, + "loss": 0.3489, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.856711387634277, + "rewards/margins": 25.98929214477539, + "rewards/real": 21.132579803466797, + "step": 2940 + }, + { + "epoch": 0.35473785473785474, + "grad_norm": 21.781590947073084, + "learning_rate": 3.5849812934259754e-07, + "logits/generated": -2.2256908416748047, + "logits/real": -2.3398449420928955, + "logps/generated": -580.8104858398438, + "logps/real": -310.58636474609375, + "loss": 0.4497, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.1089565753936768, + "rewards/margins": 21.500940322875977, + "rewards/real": 19.391984939575195, + "step": 2950 + }, + { + "epoch": 0.35594035594035595, + "grad_norm": 163.62698125544455, + "learning_rate": 3.5783003741314805e-07, + "logits/generated": -2.313955545425415, + "logits/real": -2.364743947982788, + "logps/generated": -484.18524169921875, + "logps/real": -272.0729675292969, + "loss": 0.3679, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.4989825189113617, + "rewards/margins": 18.99362564086914, + "rewards/real": 18.494644165039062, + "step": 2960 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 499.66094153881966, + "learning_rate": 3.5716194548369855e-07, + "logits/generated": -2.255885601043701, + "logits/real": -2.2675743103027344, + "logps/generated": -501.8041076660156, + "logps/real": -234.0949249267578, + "loss": 0.4604, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.058472633361816, + "rewards/margins": 20.395231246948242, + "rewards/real": 15.336758613586426, + "step": 2970 + }, + { + "epoch": 0.35834535834535836, + "grad_norm": 13.313462414204784, + "learning_rate": 3.5649385355424906e-07, + "logits/generated": -2.2967865467071533, + "logits/real": -2.273264169692993, + "logps/generated": -485.79437255859375, + "logps/real": -256.3107604980469, + "loss": 0.5895, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.5996122360229492, + "rewards/margins": 16.557743072509766, + "rewards/real": 14.9581298828125, + "step": 2980 + }, + { + "epoch": 0.35954785954785956, + "grad_norm": 377.56450844050454, + "learning_rate": 3.5582576162479957e-07, + "logits/generated": -2.265778064727783, + "logits/real": -2.3358829021453857, + "logps/generated": -606.2882080078125, + "logps/real": -299.10772705078125, + "loss": 0.6658, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.9930524826049805, + "rewards/margins": 22.922954559326172, + "rewards/real": 19.929899215698242, + "step": 2990 + }, + { + "epoch": 0.36075036075036077, + "grad_norm": 13.260550088610207, + "learning_rate": 3.5515766969535007e-07, + "logits/generated": -2.252068281173706, + "logits/real": -2.2762959003448486, + "logps/generated": -563.1546630859375, + "logps/real": -273.4827880859375, + "loss": 0.6942, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.128468036651611, + "rewards/margins": 21.493574142456055, + "rewards/real": 15.365107536315918, + "step": 3000 + }, + { + "epoch": 0.36075036075036077, + "eval_logits/generated": -2.2503185272216797, + "eval_logits/real": -2.3049116134643555, + "eval_logps/generated": -529.7844848632812, + "eval_logps/real": -290.2620544433594, + "eval_loss": 0.40316224098205566, + "eval_rewards/accuracies": 0.9732142686843872, + "eval_rewards/generated": -4.000301837921143, + "eval_rewards/margins": 22.101200103759766, + "eval_rewards/real": 18.10089683532715, + "eval_runtime": 158.3944, + "eval_samples_per_second": 6.313, + "eval_steps_per_second": 0.53, + "step": 3000 + }, + { + "epoch": 0.36195286195286197, + "grad_norm": 122.75740828308409, + "learning_rate": 3.544895777659005e-07, + "logits/generated": -2.2664377689361572, + "logits/real": -2.3279268741607666, + "logps/generated": -516.6077880859375, + "logps/real": -307.4842224121094, + "loss": 0.625, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.284638404846191, + "rewards/margins": 25.697412490844727, + "rewards/real": 21.412769317626953, + "step": 3010 + }, + { + "epoch": 0.3631553631553632, + "grad_norm": 5.912793075206404, + "learning_rate": 3.538214858364511e-07, + "logits/generated": -2.2338626384735107, + "logits/real": -2.282067060470581, + "logps/generated": -522.6149291992188, + "logps/real": -242.1405792236328, + "loss": 0.5807, + "rewards/accuracies": 1.0, + "rewards/generated": -5.8531646728515625, + "rewards/margins": 20.41141128540039, + "rewards/real": 14.558245658874512, + "step": 3020 + }, + { + "epoch": 0.3643578643578644, + "grad_norm": 5.770911163059995, + "learning_rate": 3.531533939070016e-07, + "logits/generated": -2.222487688064575, + "logits/real": -2.2802271842956543, + "logps/generated": -552.416015625, + "logps/real": -246.77639770507812, + "loss": 0.3684, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.396537780761719, + "rewards/margins": 21.888214111328125, + "rewards/real": 15.491678237915039, + "step": 3030 + }, + { + "epoch": 0.3655603655603656, + "grad_norm": 14.534338940850242, + "learning_rate": 3.524853019775521e-07, + "logits/generated": -2.1972241401672363, + "logits/real": -2.318814754486084, + "logps/generated": -456.2215270996094, + "logps/real": -282.9758605957031, + "loss": 0.3602, + "rewards/accuracies": 1.0, + "rewards/generated": -2.8818116188049316, + "rewards/margins": 21.660917282104492, + "rewards/real": 18.77910614013672, + "step": 3040 + }, + { + "epoch": 0.3667628667628668, + "grad_norm": 141.71431274136296, + "learning_rate": 3.518172100481026e-07, + "logits/generated": -2.285266876220703, + "logits/real": -2.3525002002716064, + "logps/generated": -436.36767578125, + "logps/real": -301.61981201171875, + "loss": 0.7586, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.640789270401001, + "rewards/margins": 19.90084457397461, + "rewards/real": 19.260055541992188, + "step": 3050 + }, + { + "epoch": 0.36796536796536794, + "grad_norm": 285.17578435724823, + "learning_rate": 3.511491181186531e-07, + "logits/generated": -2.2331862449645996, + "logits/real": -2.3540260791778564, + "logps/generated": -692.9502563476562, + "logps/real": -376.41363525390625, + "loss": 0.4003, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.579779624938965, + "rewards/margins": 33.797027587890625, + "rewards/real": 29.21724510192871, + "step": 3060 + }, + { + "epoch": 0.36916786916786914, + "grad_norm": 262.9206989868409, + "learning_rate": 3.504810261892036e-07, + "logits/generated": -2.192718267440796, + "logits/real": -2.2729651927948, + "logps/generated": -509.4129943847656, + "logps/real": -258.4654846191406, + "loss": 0.4083, + "rewards/accuracies": 1.0, + "rewards/generated": -4.632937431335449, + "rewards/margins": 20.950496673583984, + "rewards/real": 16.317562103271484, + "step": 3070 + }, + { + "epoch": 0.37037037037037035, + "grad_norm": 131.60497647826585, + "learning_rate": 3.498129342597541e-07, + "logits/generated": -2.1637587547302246, + "logits/real": -2.2644407749176025, + "logps/generated": -459.63836669921875, + "logps/real": -248.57723999023438, + "loss": 0.5513, + "rewards/accuracies": 1.0, + "rewards/generated": -3.667370319366455, + "rewards/margins": 19.44435691833496, + "rewards/real": 15.776989936828613, + "step": 3080 + }, + { + "epoch": 0.37157287157287155, + "grad_norm": 10.670283284540384, + "learning_rate": 3.4914484233030463e-07, + "logits/generated": -2.1456921100616455, + "logits/real": -2.2160375118255615, + "logps/generated": -447.6417541503906, + "logps/real": -242.09207153320312, + "loss": 0.5443, + "rewards/accuracies": 1.0, + "rewards/generated": -3.274768829345703, + "rewards/margins": 21.649639129638672, + "rewards/real": 18.374868392944336, + "step": 3090 + }, + { + "epoch": 0.37277537277537276, + "grad_norm": 284.86059767065285, + "learning_rate": 3.484767504008552e-07, + "logits/generated": -2.2293851375579834, + "logits/real": -2.2408015727996826, + "logps/generated": -485.41876220703125, + "logps/real": -248.82473754882812, + "loss": 0.4494, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.447257041931152, + "rewards/margins": 22.267175674438477, + "rewards/real": 16.81991958618164, + "step": 3100 + }, + { + "epoch": 0.37397787397787396, + "grad_norm": 673.3401086177072, + "learning_rate": 3.4780865847140564e-07, + "logits/generated": -2.3030428886413574, + "logits/real": -2.338392734527588, + "logps/generated": -577.5540771484375, + "logps/real": -294.89654541015625, + "loss": 0.5323, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 2.794123888015747, + "rewards/margins": 20.030696868896484, + "rewards/real": 22.824819564819336, + "step": 3110 + }, + { + "epoch": 0.37518037518037517, + "grad_norm": 270.9194607638238, + "learning_rate": 3.4714056654195615e-07, + "logits/generated": -2.2071359157562256, + "logits/real": -2.2805514335632324, + "logps/generated": -456.8641052246094, + "logps/real": -221.67684936523438, + "loss": 0.6461, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.32338285446167, + "rewards/margins": 16.610441207885742, + "rewards/real": 14.28705883026123, + "step": 3120 + }, + { + "epoch": 0.3763828763828764, + "grad_norm": 22.331175258402148, + "learning_rate": 3.4647247461250666e-07, + "logits/generated": -2.2179336547851562, + "logits/real": -2.3464598655700684, + "logps/generated": -469.17724609375, + "logps/real": -246.83615112304688, + "loss": 0.672, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.541749477386475, + "rewards/margins": 21.040027618408203, + "rewards/real": 16.498275756835938, + "step": 3130 + }, + { + "epoch": 0.3775853775853776, + "grad_norm": 12.04281250897961, + "learning_rate": 3.4580438268305716e-07, + "logits/generated": -2.1902594566345215, + "logits/real": -2.2416536808013916, + "logps/generated": -529.442138671875, + "logps/real": -232.1566619873047, + "loss": 0.4831, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.105222225189209, + "rewards/margins": 22.607030868530273, + "rewards/real": 16.50181007385254, + "step": 3140 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 486.14449671043985, + "learning_rate": 3.4513629075360767e-07, + "logits/generated": -2.187072515487671, + "logits/real": -2.2189419269561768, + "logps/generated": -664.7039184570312, + "logps/real": -279.95855712890625, + "loss": 0.8295, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -8.598974227905273, + "rewards/margins": 24.43923568725586, + "rewards/real": 15.840261459350586, + "step": 3150 + }, + { + "epoch": 0.37999037999038, + "grad_norm": 429.9586352979677, + "learning_rate": 3.444681988241582e-07, + "logits/generated": -2.2088828086853027, + "logits/real": -2.2571425437927246, + "logps/generated": -555.0758666992188, + "logps/real": -294.0585021972656, + "loss": 0.399, + "rewards/accuracies": 1.0, + "rewards/generated": -5.015622138977051, + "rewards/margins": 24.913909912109375, + "rewards/real": 19.89828872680664, + "step": 3160 + }, + { + "epoch": 0.3811928811928812, + "grad_norm": 14.595761956469604, + "learning_rate": 3.438001068947087e-07, + "logits/generated": -2.2068917751312256, + "logits/real": -2.214226007461548, + "logps/generated": -457.39678955078125, + "logps/real": -240.8272247314453, + "loss": 0.3143, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.3468408584594727, + "rewards/margins": 18.06717300415039, + "rewards/real": 16.720333099365234, + "step": 3170 + }, + { + "epoch": 0.3823953823953824, + "grad_norm": 286.3107478758328, + "learning_rate": 3.4313201496525924e-07, + "logits/generated": -2.2519948482513428, + "logits/real": -2.3029470443725586, + "logps/generated": -553.2166748046875, + "logps/real": -263.83782958984375, + "loss": 0.5949, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.3181514739990234, + "rewards/margins": 23.930526733398438, + "rewards/real": 20.612375259399414, + "step": 3180 + }, + { + "epoch": 0.3835978835978836, + "grad_norm": 491.5969995393527, + "learning_rate": 3.4246392303580975e-07, + "logits/generated": -2.2349514961242676, + "logits/real": -2.319042682647705, + "logps/generated": -727.5499877929688, + "logps/real": -342.4107360839844, + "loss": 0.5115, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.6486027240753174, + "rewards/margins": 24.541975021362305, + "rewards/real": 22.893375396728516, + "step": 3190 + }, + { + "epoch": 0.3848003848003848, + "grad_norm": 6.759721578774117, + "learning_rate": 3.4179583110636025e-07, + "logits/generated": -2.2635293006896973, + "logits/real": -2.302671194076538, + "logps/generated": -543.232421875, + "logps/real": -226.0625457763672, + "loss": 0.3877, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.465426445007324, + "rewards/margins": 21.946393966674805, + "rewards/real": 16.480966567993164, + "step": 3200 + }, + { + "epoch": 0.386002886002886, + "grad_norm": 36.139008726199734, + "learning_rate": 3.411277391769107e-07, + "logits/generated": -2.2435240745544434, + "logits/real": -2.2899813652038574, + "logps/generated": -417.7156677246094, + "logps/real": -204.80642700195312, + "loss": 0.4363, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.3878477215766907, + "rewards/margins": 15.205143928527832, + "rewards/real": 14.817296028137207, + "step": 3210 + }, + { + "epoch": 0.3872053872053872, + "grad_norm": 11.071793312837038, + "learning_rate": 3.404596472474612e-07, + "logits/generated": -2.2331137657165527, + "logits/real": -2.268756151199341, + "logps/generated": -513.5505981445312, + "logps/real": -249.2364501953125, + "loss": 0.4172, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.7772376537322998, + "rewards/margins": 17.442907333374023, + "rewards/real": 19.22014617919922, + "step": 3220 + }, + { + "epoch": 0.3884078884078884, + "grad_norm": 598.1872828227879, + "learning_rate": 3.397915553180117e-07, + "logits/generated": -2.2873480319976807, + "logits/real": -2.2719578742980957, + "logps/generated": -500.8384704589844, + "logps/real": -266.7796325683594, + "loss": 0.4875, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.352444589138031, + "rewards/margins": 19.39059066772461, + "rewards/real": 19.743032455444336, + "step": 3230 + }, + { + "epoch": 0.38961038961038963, + "grad_norm": 54.846973770309745, + "learning_rate": 3.3912346338856223e-07, + "logits/generated": -2.2452523708343506, + "logits/real": -2.267148494720459, + "logps/generated": -624.7496948242188, + "logps/real": -285.53485107421875, + "loss": 0.6026, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.275235891342163, + "rewards/margins": 20.758686065673828, + "rewards/real": 18.48345184326172, + "step": 3240 + }, + { + "epoch": 0.39081289081289083, + "grad_norm": 23.725353307695027, + "learning_rate": 3.3845537145911273e-07, + "logits/generated": -2.2604598999023438, + "logits/real": -2.3512320518493652, + "logps/generated": -546.6399536132812, + "logps/real": -243.18753051757812, + "loss": 0.4084, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.921527862548828, + "rewards/margins": 23.10455322265625, + "rewards/real": 18.183025360107422, + "step": 3250 + }, + { + "epoch": 0.39201539201539204, + "grad_norm": 13.713306471172793, + "learning_rate": 3.377872795296633e-07, + "logits/generated": -2.2504663467407227, + "logits/real": -2.2905116081237793, + "logps/generated": -529.0084838867188, + "logps/real": -229.9738006591797, + "loss": 0.6297, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.74764883518219, + "rewards/margins": 17.978008270263672, + "rewards/real": 16.23036003112793, + "step": 3260 + }, + { + "epoch": 0.39321789321789324, + "grad_norm": 12.980384198553198, + "learning_rate": 3.371191876002138e-07, + "logits/generated": -2.337036609649658, + "logits/real": -2.3831160068511963, + "logps/generated": -556.7229614257812, + "logps/real": -341.72174072265625, + "loss": 0.6133, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.2307429015636444, + "rewards/margins": 21.993549346923828, + "rewards/real": 21.762807846069336, + "step": 3270 + }, + { + "epoch": 0.3944203944203944, + "grad_norm": 7.486687417030773, + "learning_rate": 3.364510956707643e-07, + "logits/generated": -2.2695746421813965, + "logits/real": -2.278357982635498, + "logps/generated": -452.6968688964844, + "logps/real": -254.2688446044922, + "loss": 0.4876, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.17638777196407318, + "rewards/margins": 16.836870193481445, + "rewards/real": 17.013256072998047, + "step": 3280 + }, + { + "epoch": 0.3956228956228956, + "grad_norm": 941.8749408006992, + "learning_rate": 3.357830037413148e-07, + "logits/generated": -2.335817337036133, + "logits/real": -2.361940860748291, + "logps/generated": -453.5274353027344, + "logps/real": -245.0332489013672, + "loss": 0.5833, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.624830722808838, + "rewards/margins": 21.15799331665039, + "rewards/real": 18.53316307067871, + "step": 3290 + }, + { + "epoch": 0.3968253968253968, + "grad_norm": 50.215082443599826, + "learning_rate": 3.351149118118653e-07, + "logits/generated": -2.192233085632324, + "logits/real": -2.311274528503418, + "logps/generated": -544.9821166992188, + "logps/real": -276.02166748046875, + "loss": 0.4282, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 1.2815845012664795, + "rewards/margins": 18.4959716796875, + "rewards/real": 19.77755355834961, + "step": 3300 + }, + { + "epoch": 0.398027898027898, + "grad_norm": 224.4088605821121, + "learning_rate": 3.3444681988241577e-07, + "logits/generated": -2.214669704437256, + "logits/real": -2.250783681869507, + "logps/generated": -562.6416015625, + "logps/real": -280.8820495605469, + "loss": 0.7211, + "rewards/accuracies": 0.875, + "rewards/generated": -2.61222505569458, + "rewards/margins": 20.133581161499023, + "rewards/real": 17.521358489990234, + "step": 3310 + }, + { + "epoch": 0.3992303992303992, + "grad_norm": 5.858248953022507, + "learning_rate": 3.337787279529663e-07, + "logits/generated": -2.1052157878875732, + "logits/real": -2.248359203338623, + "logps/generated": -491.052001953125, + "logps/real": -229.3201904296875, + "loss": 0.2902, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.617383003234863, + "rewards/margins": 19.45993995666504, + "rewards/real": 14.842556953430176, + "step": 3320 + }, + { + "epoch": 0.4004329004329004, + "grad_norm": 22.85722751121272, + "learning_rate": 3.331106360235168e-07, + "logits/generated": -2.177408456802368, + "logits/real": -2.2191379070281982, + "logps/generated": -416.21435546875, + "logps/real": -212.9326934814453, + "loss": 0.4874, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.6096454858779907, + "rewards/margins": 16.685667037963867, + "rewards/real": 15.076022148132324, + "step": 3330 + }, + { + "epoch": 0.4016354016354016, + "grad_norm": 703.2679079252424, + "learning_rate": 3.324425440940673e-07, + "logits/generated": -2.2595977783203125, + "logits/real": -2.320112943649292, + "logps/generated": -574.4844970703125, + "logps/real": -316.2807312011719, + "loss": 0.3953, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.1208622455596924, + "rewards/margins": 22.697834014892578, + "rewards/real": 20.57697105407715, + "step": 3340 + }, + { + "epoch": 0.40283790283790283, + "grad_norm": 6.410638733708917, + "learning_rate": 3.3177445216461785e-07, + "logits/generated": -2.208864688873291, + "logits/real": -2.218881130218506, + "logps/generated": -558.22900390625, + "logps/real": -275.82513427734375, + "loss": 0.3995, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.4745880365371704, + "rewards/margins": 19.607568740844727, + "rewards/real": 18.132980346679688, + "step": 3350 + }, + { + "epoch": 0.40404040404040403, + "grad_norm": 172.69703943025226, + "learning_rate": 3.3110636023516836e-07, + "logits/generated": -2.2709243297576904, + "logits/real": -2.2835915088653564, + "logps/generated": -462.6661071777344, + "logps/real": -222.7592315673828, + "loss": 0.4266, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.892196536064148, + "rewards/margins": 17.225933074951172, + "rewards/real": 18.11812973022461, + "step": 3360 + }, + { + "epoch": 0.40524290524290524, + "grad_norm": 7.743710313480041, + "learning_rate": 3.3043826830571886e-07, + "logits/generated": -2.2780072689056396, + "logits/real": -2.2864248752593994, + "logps/generated": -475.69305419921875, + "logps/real": -212.77346801757812, + "loss": 0.58, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.44416481256484985, + "rewards/margins": 17.00520896911621, + "rewards/real": 16.561044692993164, + "step": 3370 + }, + { + "epoch": 0.40644540644540644, + "grad_norm": 241.93884894636207, + "learning_rate": 3.2977017637626937e-07, + "logits/generated": -2.261087656021118, + "logits/real": -2.26119327545166, + "logps/generated": -523.0970458984375, + "logps/real": -363.1900329589844, + "loss": 0.493, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.402726650238037, + "rewards/margins": 23.13311767578125, + "rewards/real": 20.73039436340332, + "step": 3380 + }, + { + "epoch": 0.40764790764790765, + "grad_norm": 18.122692990817526, + "learning_rate": 3.291020844468199e-07, + "logits/generated": -2.1860594749450684, + "logits/real": -2.2090258598327637, + "logps/generated": -588.9990844726562, + "logps/real": -290.1547546386719, + "loss": 0.4412, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.9298200607299805, + "rewards/margins": 23.909421920776367, + "rewards/real": 18.979602813720703, + "step": 3390 + }, + { + "epoch": 0.40885040885040885, + "grad_norm": 8.537903796099052, + "learning_rate": 3.284339925173704e-07, + "logits/generated": -2.2259185314178467, + "logits/real": -2.2582287788391113, + "logps/generated": -458.043701171875, + "logps/real": -262.12109375, + "loss": 0.4163, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.679396629333496, + "rewards/margins": 22.194400787353516, + "rewards/real": 15.515007019042969, + "step": 3400 + }, + { + "epoch": 0.41005291005291006, + "grad_norm": 5.8980934306255035, + "learning_rate": 3.2776590058792084e-07, + "logits/generated": -2.29022479057312, + "logits/real": -2.3379244804382324, + "logps/generated": -522.6433715820312, + "logps/real": -284.0138244628906, + "loss": 0.4203, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.9177383184432983, + "rewards/margins": 22.41234016418457, + "rewards/real": 20.49460220336914, + "step": 3410 + }, + { + "epoch": 0.41125541125541126, + "grad_norm": 232.41753097775643, + "learning_rate": 3.2709780865847134e-07, + "logits/generated": -2.226773738861084, + "logits/real": -2.224693536758423, + "logps/generated": -480.4476623535156, + "logps/real": -225.9335479736328, + "loss": 0.6141, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.496685028076172, + "rewards/margins": 20.988861083984375, + "rewards/real": 16.492176055908203, + "step": 3420 + }, + { + "epoch": 0.41245791245791247, + "grad_norm": 21.747758384241585, + "learning_rate": 3.264297167290219e-07, + "logits/generated": -2.2422385215759277, + "logits/real": -2.2999186515808105, + "logps/generated": -509.33416748046875, + "logps/real": -268.3351135253906, + "loss": 0.4256, + "rewards/accuracies": 1.0, + "rewards/generated": 0.6797200441360474, + "rewards/margins": 20.25993537902832, + "rewards/real": 20.939655303955078, + "step": 3430 + }, + { + "epoch": 0.4136604136604137, + "grad_norm": 9.188563537103336, + "learning_rate": 3.257616247995724e-07, + "logits/generated": -2.200892448425293, + "logits/real": -2.267993450164795, + "logps/generated": -523.612548828125, + "logps/real": -216.20706176757812, + "loss": 0.585, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.4908108711242676, + "rewards/margins": 17.959659576416016, + "rewards/real": 15.468846321105957, + "step": 3440 + }, + { + "epoch": 0.4148629148629149, + "grad_norm": 18.877840035858405, + "learning_rate": 3.250935328701229e-07, + "logits/generated": -2.1867141723632812, + "logits/real": -2.2638654708862305, + "logps/generated": -608.2249755859375, + "logps/real": -263.34625244140625, + "loss": 0.7761, + "rewards/accuracies": 1.0, + "rewards/generated": -2.450798511505127, + "rewards/margins": 21.54878044128418, + "rewards/real": 19.09798240661621, + "step": 3450 + }, + { + "epoch": 0.4160654160654161, + "grad_norm": 38.45033161024908, + "learning_rate": 3.244254409406734e-07, + "logits/generated": -2.297166585922241, + "logits/real": -2.295908212661743, + "logps/generated": -513.9171752929688, + "logps/real": -284.8858947753906, + "loss": 0.4688, + "rewards/accuracies": 1.0, + "rewards/generated": -1.543438196182251, + "rewards/margins": 23.487194061279297, + "rewards/real": 21.943756103515625, + "step": 3460 + }, + { + "epoch": 0.4172679172679173, + "grad_norm": 218.30264679323815, + "learning_rate": 3.2375734901122393e-07, + "logits/generated": -2.250934600830078, + "logits/real": -2.301692247390747, + "logps/generated": -411.31549072265625, + "logps/real": -227.10400390625, + "loss": 0.4153, + "rewards/accuracies": 1.0, + "rewards/generated": -3.8222877979278564, + "rewards/margins": 20.231616973876953, + "rewards/real": 16.409326553344727, + "step": 3470 + }, + { + "epoch": 0.4184704184704185, + "grad_norm": 105.4180123617748, + "learning_rate": 3.2308925708177443e-07, + "logits/generated": -2.2137606143951416, + "logits/real": -2.2348432540893555, + "logps/generated": -608.3657836914062, + "logps/real": -308.8596496582031, + "loss": 0.9847, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.6461491584777832, + "rewards/margins": 18.258991241455078, + "rewards/real": 18.905139923095703, + "step": 3480 + }, + { + "epoch": 0.4196729196729197, + "grad_norm": 5.056705023112445, + "learning_rate": 3.2242116515232494e-07, + "logits/generated": -2.2457330226898193, + "logits/real": -2.2896833419799805, + "logps/generated": -521.7301025390625, + "logps/real": -285.7955017089844, + "loss": 0.3993, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.4160698652267456, + "rewards/margins": 20.736743927001953, + "rewards/real": 22.152812957763672, + "step": 3490 + }, + { + "epoch": 0.4208754208754209, + "grad_norm": 54.09032063669176, + "learning_rate": 3.2175307322287545e-07, + "logits/generated": -2.2124075889587402, + "logits/real": -2.244499921798706, + "logps/generated": -446.72430419921875, + "logps/real": -201.4197540283203, + "loss": 0.6846, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.36500120162963867, + "rewards/margins": 15.436958312988281, + "rewards/real": 15.071955680847168, + "step": 3500 + }, + { + "epoch": 0.42207792207792205, + "grad_norm": 400.72282075452534, + "learning_rate": 3.21084981293426e-07, + "logits/generated": -2.237973690032959, + "logits/real": -2.262101888656616, + "logps/generated": -420.045654296875, + "logps/real": -225.34780883789062, + "loss": 0.4097, + "rewards/accuracies": 1.0, + "rewards/generated": -1.5613340139389038, + "rewards/margins": 18.243972778320312, + "rewards/real": 16.68263816833496, + "step": 3510 + }, + { + "epoch": 0.42328042328042326, + "grad_norm": 3.719521776829346, + "learning_rate": 3.2041688936397646e-07, + "logits/generated": -2.217008113861084, + "logits/real": -2.2482199668884277, + "logps/generated": -564.4690551757812, + "logps/real": -277.1367492675781, + "loss": 0.3615, + "rewards/accuracies": 1.0, + "rewards/generated": -1.776555061340332, + "rewards/margins": 22.30672836303711, + "rewards/real": 20.53017234802246, + "step": 3520 + }, + { + "epoch": 0.42448292448292446, + "grad_norm": 73.99862278530746, + "learning_rate": 3.1974879743452697e-07, + "logits/generated": -2.2452683448791504, + "logits/real": -2.2988827228546143, + "logps/generated": -420.21697998046875, + "logps/real": -247.0130615234375, + "loss": 0.6884, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 3.086369752883911, + "rewards/margins": 16.89458465576172, + "rewards/real": 19.980953216552734, + "step": 3530 + }, + { + "epoch": 0.42568542568542567, + "grad_norm": 531.4862807469356, + "learning_rate": 3.1908070550507747e-07, + "logits/generated": -2.206820011138916, + "logits/real": -2.348249912261963, + "logps/generated": -545.2318725585938, + "logps/real": -235.8459014892578, + "loss": 0.3821, + "rewards/accuracies": 1.0, + "rewards/generated": -3.8593764305114746, + "rewards/margins": 24.526968002319336, + "rewards/real": 20.667591094970703, + "step": 3540 + }, + { + "epoch": 0.42688792688792687, + "grad_norm": 17.662058717147133, + "learning_rate": 3.18412613575628e-07, + "logits/generated": -2.2284553050994873, + "logits/real": -2.302344799041748, + "logps/generated": -401.02239990234375, + "logps/real": -237.73046875, + "loss": 0.9592, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.375710129737854, + "rewards/margins": 17.94776725769043, + "rewards/real": 19.323474884033203, + "step": 3550 + }, + { + "epoch": 0.4280904280904281, + "grad_norm": 74.29450182082346, + "learning_rate": 3.177445216461785e-07, + "logits/generated": -2.011335849761963, + "logits/real": -2.1957554817199707, + "logps/generated": -577.8146362304688, + "logps/real": -302.3636474609375, + "loss": 0.5901, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.170976161956787, + "rewards/margins": 27.727657318115234, + "rewards/real": 20.556682586669922, + "step": 3560 + }, + { + "epoch": 0.4292929292929293, + "grad_norm": 54.82557002704048, + "learning_rate": 3.17076429716729e-07, + "logits/generated": -2.0680551528930664, + "logits/real": -2.0925698280334473, + "logps/generated": -490.8046875, + "logps/real": -186.53109741210938, + "loss": 0.4297, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.373655796051025, + "rewards/margins": 19.686954498291016, + "rewards/real": 12.313298225402832, + "step": 3570 + }, + { + "epoch": 0.4304954304954305, + "grad_norm": 269.07808979589714, + "learning_rate": 3.164083377872795e-07, + "logits/generated": -2.0910630226135254, + "logits/real": -2.130899667739868, + "logps/generated": -534.9212036132812, + "logps/real": -245.68911743164062, + "loss": 0.8762, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.103806495666504, + "rewards/margins": 20.604034423828125, + "rewards/real": 15.500228881835938, + "step": 3580 + }, + { + "epoch": 0.4316979316979317, + "grad_norm": 156.16286324114017, + "learning_rate": 3.1574024585783006e-07, + "logits/generated": -2.138777256011963, + "logits/real": -2.227443218231201, + "logps/generated": -550.2257080078125, + "logps/real": -302.7833251953125, + "loss": 0.4447, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.9097096920013428, + "rewards/margins": 22.903656005859375, + "rewards/real": 20.993946075439453, + "step": 3590 + }, + { + "epoch": 0.4329004329004329, + "grad_norm": 316.686145759838, + "learning_rate": 3.1507215392838057e-07, + "logits/generated": -2.1031007766723633, + "logits/real": -2.177794933319092, + "logps/generated": -618.0108642578125, + "logps/real": -300.5312194824219, + "loss": 0.4535, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.1155319213867188, + "rewards/margins": 24.038068771362305, + "rewards/real": 22.92254066467285, + "step": 3600 + }, + { + "epoch": 0.4341029341029341, + "grad_norm": 13.746064790512671, + "learning_rate": 3.1440406199893107e-07, + "logits/generated": -2.04689884185791, + "logits/real": -2.219418525695801, + "logps/generated": -584.58935546875, + "logps/real": -309.8716735839844, + "loss": 0.585, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.4922397136688232, + "rewards/margins": 26.610736846923828, + "rewards/real": 23.118497848510742, + "step": 3610 + }, + { + "epoch": 0.4353054353054353, + "grad_norm": 19.192953699515723, + "learning_rate": 3.137359700694815e-07, + "logits/generated": -2.0190205574035645, + "logits/real": -2.170254945755005, + "logps/generated": -444.9599609375, + "logps/real": -289.107421875, + "loss": 0.5915, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.893264055252075, + "rewards/margins": 22.015300750732422, + "rewards/real": 18.12203598022461, + "step": 3620 + }, + { + "epoch": 0.4365079365079365, + "grad_norm": 117.16286334872296, + "learning_rate": 3.1306787814003203e-07, + "logits/generated": -2.17712140083313, + "logits/real": -2.260840892791748, + "logps/generated": -581.529541015625, + "logps/real": -312.60406494140625, + "loss": 0.5468, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.3088008165359497, + "rewards/margins": 20.99437141418457, + "rewards/real": 19.685569763183594, + "step": 3630 + }, + { + "epoch": 0.4377104377104377, + "grad_norm": 84.84598138327165, + "learning_rate": 3.1239978621058254e-07, + "logits/generated": -2.0735323429107666, + "logits/real": -2.177410125732422, + "logps/generated": -622.9395751953125, + "logps/real": -268.78997802734375, + "loss": 0.4778, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.399700164794922, + "rewards/margins": 24.45461082458496, + "rewards/real": 17.054908752441406, + "step": 3640 + }, + { + "epoch": 0.4389129389129389, + "grad_norm": 211.7406557120953, + "learning_rate": 3.1173169428113304e-07, + "logits/generated": -2.113731861114502, + "logits/real": -2.238961696624756, + "logps/generated": -516.693115234375, + "logps/real": -262.43408203125, + "loss": 0.4155, + "rewards/accuracies": 1.0, + "rewards/generated": -3.0317835807800293, + "rewards/margins": 22.840307235717773, + "rewards/real": 19.808523178100586, + "step": 3650 + }, + { + "epoch": 0.4401154401154401, + "grad_norm": 55.42098247030718, + "learning_rate": 3.1106360235168355e-07, + "logits/generated": -2.196582078933716, + "logits/real": -2.285003423690796, + "logps/generated": -487.26318359375, + "logps/real": -242.1387481689453, + "loss": 0.469, + "rewards/accuracies": 1.0, + "rewards/generated": -3.281810760498047, + "rewards/margins": 19.729379653930664, + "rewards/real": 16.44757080078125, + "step": 3660 + }, + { + "epoch": 0.44131794131794133, + "grad_norm": 391.66853943369136, + "learning_rate": 3.103955104222341e-07, + "logits/generated": -2.206817388534546, + "logits/real": -2.2829651832580566, + "logps/generated": -533.7303466796875, + "logps/real": -289.9284973144531, + "loss": 0.8139, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.014866828918457, + "rewards/margins": 23.939260482788086, + "rewards/real": 19.924394607543945, + "step": 3670 + }, + { + "epoch": 0.44252044252044254, + "grad_norm": 6.573225525685875, + "learning_rate": 3.097274184927846e-07, + "logits/generated": -2.1602535247802734, + "logits/real": -2.2709264755249023, + "logps/generated": -467.51837158203125, + "logps/real": -187.49595642089844, + "loss": 0.3035, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.115992069244385, + "rewards/margins": 21.21573829650879, + "rewards/real": 14.099746704101562, + "step": 3680 + }, + { + "epoch": 0.44372294372294374, + "grad_norm": 7.0052738704130375, + "learning_rate": 3.090593265633351e-07, + "logits/generated": -2.2725062370300293, + "logits/real": -2.316488742828369, + "logps/generated": -489.68359375, + "logps/real": -240.08065795898438, + "loss": 0.5854, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.560263991355896, + "rewards/margins": 19.86050796508789, + "rewards/real": 21.420772552490234, + "step": 3690 + }, + { + "epoch": 0.44492544492544495, + "grad_norm": 2.4838609997369243, + "learning_rate": 3.0839123463388563e-07, + "logits/generated": -2.1661388874053955, + "logits/real": -2.229221820831299, + "logps/generated": -693.8267822265625, + "logps/real": -265.43084716796875, + "loss": 0.647, + "rewards/accuracies": 1.0, + "rewards/generated": -6.291512489318848, + "rewards/margins": 24.833316802978516, + "rewards/real": 18.541805267333984, + "step": 3700 + }, + { + "epoch": 0.44612794612794615, + "grad_norm": 21.20920717884601, + "learning_rate": 3.0772314270443614e-07, + "logits/generated": -2.2226345539093018, + "logits/real": -2.341580867767334, + "logps/generated": -478.16461181640625, + "logps/real": -295.2587585449219, + "loss": 0.5038, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.17221125960350037, + "rewards/margins": 24.30082893371582, + "rewards/real": 24.128616333007812, + "step": 3710 + }, + { + "epoch": 0.44733044733044736, + "grad_norm": 173.4538878199435, + "learning_rate": 3.070550507749866e-07, + "logits/generated": -2.0368051528930664, + "logits/real": -2.2117326259613037, + "logps/generated": -567.9580078125, + "logps/real": -254.87460327148438, + "loss": 0.5571, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.8097100257873535, + "rewards/margins": 26.1510066986084, + "rewards/real": 19.341297149658203, + "step": 3720 + }, + { + "epoch": 0.4485329485329485, + "grad_norm": 7.3825012679693165, + "learning_rate": 3.063869588455371e-07, + "logits/generated": -2.0971477031707764, + "logits/real": -2.150735855102539, + "logps/generated": -513.3101806640625, + "logps/real": -224.2945556640625, + "loss": 0.5287, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.508622646331787, + "rewards/margins": 23.58926773071289, + "rewards/real": 17.080646514892578, + "step": 3730 + }, + { + "epoch": 0.4497354497354497, + "grad_norm": 5.180936608677949, + "learning_rate": 3.057188669160876e-07, + "logits/generated": -2.1248841285705566, + "logits/real": -2.1839470863342285, + "logps/generated": -437.79107666015625, + "logps/real": -202.60720825195312, + "loss": 0.6053, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.073121547698975, + "rewards/margins": 22.3405704498291, + "rewards/real": 16.26744842529297, + "step": 3740 + }, + { + "epoch": 0.4509379509379509, + "grad_norm": 864.6075584973929, + "learning_rate": 3.0505077498663816e-07, + "logits/generated": -2.140291690826416, + "logits/real": -2.2139744758605957, + "logps/generated": -533.6538696289062, + "logps/real": -227.28787231445312, + "loss": 0.6704, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.38225793838501, + "rewards/margins": 23.71435546875, + "rewards/real": 18.332096099853516, + "step": 3750 + }, + { + "epoch": 0.4521404521404521, + "grad_norm": 216.3524497920051, + "learning_rate": 3.0438268305718867e-07, + "logits/generated": -2.2790138721466064, + "logits/real": -2.357851028442383, + "logps/generated": -640.957763671875, + "logps/real": -423.6236267089844, + "loss": 0.489, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 4.965917110443115, + "rewards/margins": 26.667720794677734, + "rewards/real": 31.63364028930664, + "step": 3760 + }, + { + "epoch": 0.4533429533429533, + "grad_norm": 159.65228346706937, + "learning_rate": 3.037145911277392e-07, + "logits/generated": -2.1005051136016846, + "logits/real": -2.237699031829834, + "logps/generated": -390.77569580078125, + "logps/real": -202.80323791503906, + "loss": 0.4214, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.24750280380249, + "rewards/margins": 22.559703826904297, + "rewards/real": 15.312200546264648, + "step": 3770 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 25.65213333469863, + "learning_rate": 3.030464991982897e-07, + "logits/generated": -2.240879535675049, + "logits/real": -2.274664878845215, + "logps/generated": -534.3191528320312, + "logps/real": -239.8083038330078, + "loss": 0.6036, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.094273090362549, + "rewards/margins": 20.56509017944336, + "rewards/real": 18.470813751220703, + "step": 3780 + }, + { + "epoch": 0.45574795574795574, + "grad_norm": 12.046752007252833, + "learning_rate": 3.023784072688402e-07, + "logits/generated": -2.2010159492492676, + "logits/real": -2.255561590194702, + "logps/generated": -407.95745849609375, + "logps/real": -196.4077911376953, + "loss": 0.3797, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.5161245465278625, + "rewards/margins": 14.831266403198242, + "rewards/real": 14.315142631530762, + "step": 3790 + }, + { + "epoch": 0.45695045695045694, + "grad_norm": 37.34126795951205, + "learning_rate": 3.017103153393907e-07, + "logits/generated": -2.237163782119751, + "logits/real": -2.309365749359131, + "logps/generated": -512.1398315429688, + "logps/real": -200.3365478515625, + "loss": 0.6084, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 0.10469992458820343, + "rewards/margins": 15.552648544311523, + "rewards/real": 15.6573486328125, + "step": 3800 + }, + { + "epoch": 0.45815295815295815, + "grad_norm": 397.40898206986094, + "learning_rate": 3.010422234099412e-07, + "logits/generated": -2.2593846321105957, + "logits/real": -2.2778258323669434, + "logps/generated": -539.22998046875, + "logps/real": -222.98300170898438, + "loss": 0.3289, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.4914422035217285, + "rewards/margins": 21.29345703125, + "rewards/real": 17.802013397216797, + "step": 3810 + }, + { + "epoch": 0.45935545935545935, + "grad_norm": 156.24867212632122, + "learning_rate": 3.0037413148049165e-07, + "logits/generated": -2.2712087631225586, + "logits/real": -2.3158586025238037, + "logps/generated": -462.0484924316406, + "logps/real": -261.26666259765625, + "loss": 0.531, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 3.1764886379241943, + "rewards/margins": 17.392261505126953, + "rewards/real": 20.56875228881836, + "step": 3820 + }, + { + "epoch": 0.46055796055796056, + "grad_norm": 283.6040019447855, + "learning_rate": 2.997060395510422e-07, + "logits/generated": -2.2225520610809326, + "logits/real": -2.327397346496582, + "logps/generated": -682.3387451171875, + "logps/real": -340.1170959472656, + "loss": 0.4905, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.2472844123840332, + "rewards/margins": 24.164981842041016, + "rewards/real": 24.412263870239258, + "step": 3830 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 20.895431585751865, + "learning_rate": 2.990379476215927e-07, + "logits/generated": -2.271763801574707, + "logits/real": -2.362795352935791, + "logps/generated": -500.9606018066406, + "logps/real": -245.93502807617188, + "loss": 0.586, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 2.9510433673858643, + "rewards/margins": 17.889434814453125, + "rewards/real": 20.84048080444336, + "step": 3840 + }, + { + "epoch": 0.46296296296296297, + "grad_norm": 3.300174115179123, + "learning_rate": 2.983698556921432e-07, + "logits/generated": -2.1928889751434326, + "logits/real": -2.233339786529541, + "logps/generated": -487.408447265625, + "logps/real": -275.59490966796875, + "loss": 0.4336, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.8288896679878235, + "rewards/margins": 20.417415618896484, + "rewards/real": 19.58852767944336, + "step": 3850 + }, + { + "epoch": 0.46416546416546417, + "grad_norm": 391.80759150042627, + "learning_rate": 2.9770176376269373e-07, + "logits/generated": -2.252769947052002, + "logits/real": -2.323683977127075, + "logps/generated": -461.66595458984375, + "logps/real": -276.63360595703125, + "loss": 0.4622, + "rewards/accuracies": 0.875, + "rewards/generated": 0.5805469751358032, + "rewards/margins": 18.976581573486328, + "rewards/real": 19.55712890625, + "step": 3860 + }, + { + "epoch": 0.4653679653679654, + "grad_norm": 510.6974364883727, + "learning_rate": 2.9703367183324424e-07, + "logits/generated": -2.3645882606506348, + "logits/real": -2.3484604358673096, + "logps/generated": -489.74993896484375, + "logps/real": -312.33343505859375, + "loss": 0.617, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.48322534561157227, + "rewards/margins": 19.076953887939453, + "rewards/real": 19.560176849365234, + "step": 3870 + }, + { + "epoch": 0.4665704665704666, + "grad_norm": 488.6204227927367, + "learning_rate": 2.9636557990379475e-07, + "logits/generated": -2.288896083831787, + "logits/real": -2.309587240219116, + "logps/generated": -503.6145935058594, + "logps/real": -268.85858154296875, + "loss": 0.7618, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.37723970413208, + "rewards/margins": 19.37235450744629, + "rewards/real": 21.74959373474121, + "step": 3880 + }, + { + "epoch": 0.4677729677729678, + "grad_norm": 13.171927751582817, + "learning_rate": 2.9569748797434525e-07, + "logits/generated": -2.223548173904419, + "logits/real": -2.239373207092285, + "logps/generated": -525.7351684570312, + "logps/real": -271.37725830078125, + "loss": 0.4788, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.04958176612854, + "rewards/margins": 18.80972671508789, + "rewards/real": 17.760143280029297, + "step": 3890 + }, + { + "epoch": 0.468975468975469, + "grad_norm": 167.68241170986718, + "learning_rate": 2.9502939604489576e-07, + "logits/generated": -2.185739040374756, + "logits/real": -2.288041591644287, + "logps/generated": -701.4237060546875, + "logps/real": -303.81219482421875, + "loss": 0.5354, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.0823731422424316, + "rewards/margins": 26.959665298461914, + "rewards/real": 25.877288818359375, + "step": 3900 + }, + { + "epoch": 0.4701779701779702, + "grad_norm": 4.075691179085272, + "learning_rate": 2.943613041154463e-07, + "logits/generated": -2.1385841369628906, + "logits/real": -2.1470489501953125, + "logps/generated": -504.74237060546875, + "logps/real": -223.2626190185547, + "loss": 0.4364, + "rewards/accuracies": 1.0, + "rewards/generated": -6.160588264465332, + "rewards/margins": 24.97075843811035, + "rewards/real": 18.810171127319336, + "step": 3910 + }, + { + "epoch": 0.4713804713804714, + "grad_norm": 204.0725652594611, + "learning_rate": 2.9369321218599677e-07, + "logits/generated": -2.1229546070098877, + "logits/real": -2.2415804862976074, + "logps/generated": -464.67620849609375, + "logps/real": -272.47393798828125, + "loss": 0.4992, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.2768571376800537, + "rewards/margins": 23.420005798339844, + "rewards/real": 21.143152236938477, + "step": 3920 + }, + { + "epoch": 0.4725829725829726, + "grad_norm": 3.7067046281291276, + "learning_rate": 2.930251202565473e-07, + "logits/generated": -2.0180985927581787, + "logits/real": -2.087268590927124, + "logps/generated": -467.0703125, + "logps/real": -187.01832580566406, + "loss": 0.3222, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.516222953796387, + "rewards/margins": 18.26113510131836, + "rewards/real": 12.744909286499023, + "step": 3930 + }, + { + "epoch": 0.4737854737854738, + "grad_norm": 50.38328578695344, + "learning_rate": 2.923570283270978e-07, + "logits/generated": -2.1050660610198975, + "logits/real": -2.141570568084717, + "logps/generated": -579.7655029296875, + "logps/real": -259.5626525878906, + "loss": 0.5136, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.849890232086182, + "rewards/margins": 25.93613624572754, + "rewards/real": 20.086244583129883, + "step": 3940 + }, + { + "epoch": 0.474987974987975, + "grad_norm": 16.680861667054387, + "learning_rate": 2.916889363976483e-07, + "logits/generated": -2.0800085067749023, + "logits/real": -2.2065744400024414, + "logps/generated": -606.1420288085938, + "logps/real": -282.26123046875, + "loss": 0.4021, + "rewards/accuracies": 1.0, + "rewards/generated": -5.822685718536377, + "rewards/margins": 26.468358993530273, + "rewards/real": 20.645671844482422, + "step": 3950 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 54.59701336045353, + "learning_rate": 2.910208444681988e-07, + "logits/generated": -2.2042269706726074, + "logits/real": -2.208150625228882, + "logps/generated": -578.5890502929688, + "logps/real": -277.19866943359375, + "loss": 0.4799, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.8450977206230164, + "rewards/margins": 21.83233070373535, + "rewards/real": 22.677427291870117, + "step": 3960 + }, + { + "epoch": 0.47739297739297737, + "grad_norm": 4.805880140636131, + "learning_rate": 2.903527525387493e-07, + "logits/generated": -2.2389702796936035, + "logits/real": -2.255767583847046, + "logps/generated": -519.0675659179688, + "logps/real": -219.37637329101562, + "loss": 0.4427, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.1465516090393066, + "rewards/margins": 19.872838973999023, + "rewards/real": 16.726289749145508, + "step": 3970 + }, + { + "epoch": 0.4785954785954786, + "grad_norm": 64.9203979827319, + "learning_rate": 2.896846606092998e-07, + "logits/generated": -2.1840157508850098, + "logits/real": -2.27323317527771, + "logps/generated": -549.7556762695312, + "logps/real": -334.9892272949219, + "loss": 0.6059, + "rewards/accuracies": 0.875, + "rewards/generated": 0.023430729284882545, + "rewards/margins": 22.822460174560547, + "rewards/real": 22.845895767211914, + "step": 3980 + }, + { + "epoch": 0.4797979797979798, + "grad_norm": 382.27072011663375, + "learning_rate": 2.8901656867985037e-07, + "logits/generated": -2.1771812438964844, + "logits/real": -2.262188196182251, + "logps/generated": -599.5274658203125, + "logps/real": -259.5793762207031, + "loss": 0.4793, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.041812896728516, + "rewards/margins": 28.17879867553711, + "rewards/real": 20.13698959350586, + "step": 3990 + }, + { + "epoch": 0.481000481000481, + "grad_norm": 10.010252032118698, + "learning_rate": 2.883484767504009e-07, + "logits/generated": -2.1737263202667236, + "logits/real": -2.2469887733459473, + "logps/generated": -543.7032470703125, + "logps/real": -244.39498901367188, + "loss": 0.5971, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.1522727012634277, + "rewards/margins": 21.82972526550293, + "rewards/real": 18.677452087402344, + "step": 4000 + }, + { + "epoch": 0.481000481000481, + "eval_logits/generated": -2.240758180618286, + "eval_logits/real": -2.287383556365967, + "eval_logps/generated": -494.74700927734375, + "eval_logps/real": -266.41497802734375, + "eval_loss": 0.43489810824394226, + "eval_rewards/accuracies": 0.9553571343421936, + "eval_rewards/generated": -0.49654555320739746, + "eval_rewards/margins": 20.982145309448242, + "eval_rewards/real": 20.485599517822266, + "eval_runtime": 159.3593, + "eval_samples_per_second": 6.275, + "eval_steps_per_second": 0.527, + "step": 4000 + }, + { + "epoch": 0.4822029822029822, + "grad_norm": 6.445596093537904, + "learning_rate": 2.876803848209514e-07, + "logits/generated": -2.178842067718506, + "logits/real": -2.245429277420044, + "logps/generated": -537.5115966796875, + "logps/real": -223.5152130126953, + "loss": 0.2816, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.587291717529297, + "rewards/margins": 20.84734535217285, + "rewards/real": 15.260052680969238, + "step": 4010 + }, + { + "epoch": 0.4834054834054834, + "grad_norm": 181.80066698601422, + "learning_rate": 2.8701229289150184e-07, + "logits/generated": -2.1754794120788574, + "logits/real": -2.26991605758667, + "logps/generated": -446.3489685058594, + "logps/real": -302.1445007324219, + "loss": 0.8848, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 1.6614997386932373, + "rewards/margins": 19.420612335205078, + "rewards/real": 21.08211326599121, + "step": 4020 + }, + { + "epoch": 0.4846079846079846, + "grad_norm": 10.999344866462478, + "learning_rate": 2.8634420096205234e-07, + "logits/generated": -2.2095115184783936, + "logits/real": -2.3496994972229004, + "logps/generated": -639.6976318359375, + "logps/real": -293.78436279296875, + "loss": 0.6402, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 4.128434181213379, + "rewards/margins": 23.227733612060547, + "rewards/real": 27.356164932250977, + "step": 4030 + }, + { + "epoch": 0.4858104858104858, + "grad_norm": 49.49335315176659, + "learning_rate": 2.8567610903260285e-07, + "logits/generated": -2.1898367404937744, + "logits/real": -2.2683277130126953, + "logps/generated": -567.0438232421875, + "logps/real": -238.8550262451172, + "loss": 0.6797, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.44430685043335, + "rewards/margins": 25.625652313232422, + "rewards/real": 21.181346893310547, + "step": 4040 + }, + { + "epoch": 0.487012987012987, + "grad_norm": 1352.6411990032718, + "learning_rate": 2.8500801710315336e-07, + "logits/generated": -2.189768075942993, + "logits/real": -2.24078106880188, + "logps/generated": -406.25115966796875, + "logps/real": -223.9019317626953, + "loss": 0.7146, + "rewards/accuracies": 0.875, + "rewards/generated": -4.249040126800537, + "rewards/margins": 18.82101058959961, + "rewards/real": 14.571968078613281, + "step": 4050 + }, + { + "epoch": 0.4882154882154882, + "grad_norm": 9.88780533710644, + "learning_rate": 2.8433992517370386e-07, + "logits/generated": -2.208832263946533, + "logits/real": -2.219021797180176, + "logps/generated": -428.3487243652344, + "logps/real": -189.39505004882812, + "loss": 0.4363, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.900750637054443, + "rewards/margins": 19.817745208740234, + "rewards/real": 13.916994094848633, + "step": 4060 + }, + { + "epoch": 0.4894179894179894, + "grad_norm": 141.58807713316912, + "learning_rate": 2.836718332442544e-07, + "logits/generated": -2.2209315299987793, + "logits/real": -2.262233257293701, + "logps/generated": -545.9734497070312, + "logps/real": -252.4385528564453, + "loss": 0.441, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.508567810058594, + "rewards/margins": 24.501379013061523, + "rewards/real": 19.992813110351562, + "step": 4070 + }, + { + "epoch": 0.4906204906204906, + "grad_norm": 9.10219444841313, + "learning_rate": 2.8300374131480493e-07, + "logits/generated": -2.22090744972229, + "logits/real": -2.2742881774902344, + "logps/generated": -505.38427734375, + "logps/real": -247.7169189453125, + "loss": 0.4201, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.9506261348724365, + "rewards/margins": 20.888425827026367, + "rewards/real": 18.937801361083984, + "step": 4080 + }, + { + "epoch": 0.49182299182299183, + "grad_norm": 2.509799430542211, + "learning_rate": 2.8233564938535543e-07, + "logits/generated": -2.089916229248047, + "logits/real": -2.238445997238159, + "logps/generated": -484.5831604003906, + "logps/real": -222.5631866455078, + "loss": 0.4783, + "rewards/accuracies": 1.0, + "rewards/generated": -7.338548183441162, + "rewards/margins": 24.402645111083984, + "rewards/real": 17.064098358154297, + "step": 4090 + }, + { + "epoch": 0.49302549302549303, + "grad_norm": 801.1080653021605, + "learning_rate": 2.8166755745590594e-07, + "logits/generated": -2.205252170562744, + "logits/real": -2.2826685905456543, + "logps/generated": -569.2166748046875, + "logps/real": -209.69137573242188, + "loss": 0.5325, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.447823524475098, + "rewards/margins": 21.286632537841797, + "rewards/real": 15.838808059692383, + "step": 4100 + }, + { + "epoch": 0.49422799422799424, + "grad_norm": 13.449589197643624, + "learning_rate": 2.8099946552645645e-07, + "logits/generated": -2.1758921146392822, + "logits/real": -2.2592873573303223, + "logps/generated": -606.7257690429688, + "logps/real": -278.8661804199219, + "loss": 0.5223, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.077622413635254, + "rewards/margins": 24.696016311645508, + "rewards/real": 19.61839485168457, + "step": 4110 + }, + { + "epoch": 0.49543049543049544, + "grad_norm": 41.94418584186682, + "learning_rate": 2.803313735970069e-07, + "logits/generated": -2.265984058380127, + "logits/real": -2.2622601985931396, + "logps/generated": -510.1703186035156, + "logps/real": -228.3834228515625, + "loss": 0.4545, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.8092894554138184, + "rewards/margins": 22.863197326660156, + "rewards/real": 19.053909301757812, + "step": 4120 + }, + { + "epoch": 0.49663299663299665, + "grad_norm": 76.52416670658293, + "learning_rate": 2.796632816675574e-07, + "logits/generated": -2.324951648712158, + "logits/real": -2.3724920749664307, + "logps/generated": -561.64892578125, + "logps/real": -283.00494384765625, + "loss": 1.0526, + "rewards/accuracies": 0.875, + "rewards/generated": 3.573587417602539, + "rewards/margins": 20.410449981689453, + "rewards/real": 23.984039306640625, + "step": 4130 + }, + { + "epoch": 0.49783549783549785, + "grad_norm": 241.36928343336322, + "learning_rate": 2.789951897381079e-07, + "logits/generated": -2.2231245040893555, + "logits/real": -2.29709529876709, + "logps/generated": -488.70989990234375, + "logps/real": -226.192138671875, + "loss": 0.4705, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -1.9012588262557983, + "rewards/margins": 20.033512115478516, + "rewards/real": 18.132253646850586, + "step": 4140 + }, + { + "epoch": 0.49903799903799906, + "grad_norm": 489.43241464733893, + "learning_rate": 2.7832709780865847e-07, + "logits/generated": -2.2443854808807373, + "logits/real": -2.2525110244750977, + "logps/generated": -504.917724609375, + "logps/real": -257.2982177734375, + "loss": 0.6455, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.7884670495986938, + "rewards/margins": 17.46521759033203, + "rewards/real": 19.25368309020996, + "step": 4150 + }, + { + "epoch": 0.5002405002405003, + "grad_norm": 12.460598594612183, + "learning_rate": 2.77659005879209e-07, + "logits/generated": -2.1264469623565674, + "logits/real": -2.263762950897217, + "logps/generated": -571.4362182617188, + "logps/real": -244.6605987548828, + "loss": 0.356, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.415511131286621, + "rewards/margins": 23.671628952026367, + "rewards/real": 18.256114959716797, + "step": 4160 + }, + { + "epoch": 0.5014430014430015, + "grad_norm": 103.6520781489663, + "learning_rate": 2.769909139497595e-07, + "logits/generated": -2.121946334838867, + "logits/real": -2.2184207439422607, + "logps/generated": -561.1491088867188, + "logps/real": -255.83102416992188, + "loss": 0.6462, + "rewards/accuracies": 1.0, + "rewards/generated": -0.4852834641933441, + "rewards/margins": 22.426822662353516, + "rewards/real": 21.941539764404297, + "step": 4170 + }, + { + "epoch": 0.5026455026455027, + "grad_norm": 6.917698758779508, + "learning_rate": 2.7632282202031e-07, + "logits/generated": -2.1365036964416504, + "logits/real": -2.297579526901245, + "logps/generated": -497.52410888671875, + "logps/real": -276.71240234375, + "loss": 0.3082, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -2.8837666511535645, + "rewards/margins": 25.9765682220459, + "rewards/real": 23.092803955078125, + "step": 4180 + }, + { + "epoch": 0.5038480038480039, + "grad_norm": 35.29472337904465, + "learning_rate": 2.756547300908605e-07, + "logits/generated": -2.231517791748047, + "logits/real": -2.3270859718322754, + "logps/generated": -512.0706176757812, + "logps/real": -302.6792297363281, + "loss": 0.5751, + "rewards/accuracies": 0.875, + "rewards/generated": 3.048867702484131, + "rewards/margins": 19.952205657958984, + "rewards/real": 23.00107192993164, + "step": 4190 + }, + { + "epoch": 0.5050505050505051, + "grad_norm": 5.824501053955008, + "learning_rate": 2.74986638161411e-07, + "logits/generated": -2.1285107135772705, + "logits/real": -2.1955060958862305, + "logps/generated": -407.77099609375, + "logps/real": -229.5274200439453, + "loss": 0.336, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.16152045130729675, + "rewards/margins": 18.308820724487305, + "rewards/real": 18.470340728759766, + "step": 4200 + }, + { + "epoch": 0.5062530062530063, + "grad_norm": 425.6269134016861, + "learning_rate": 2.743185462319615e-07, + "logits/generated": -2.1789069175720215, + "logits/real": -2.305410385131836, + "logps/generated": -587.2059326171875, + "logps/real": -291.29583740234375, + "loss": 0.6028, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 4.074764251708984, + "rewards/margins": 20.289831161499023, + "rewards/real": 24.36459732055664, + "step": 4210 + }, + { + "epoch": 0.5074555074555075, + "grad_norm": 17.987909722350626, + "learning_rate": 2.7365045430251197e-07, + "logits/generated": -2.1587955951690674, + "logits/real": -2.255331516265869, + "logps/generated": -565.9151000976562, + "logps/real": -317.25189208984375, + "loss": 0.8092, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.0316405296325684, + "rewards/margins": 25.426677703857422, + "rewards/real": 23.395034790039062, + "step": 4220 + }, + { + "epoch": 0.5086580086580087, + "grad_norm": 17.09561538762422, + "learning_rate": 2.729823623730625e-07, + "logits/generated": -2.098085641860962, + "logits/real": -2.2109179496765137, + "logps/generated": -608.9376831054688, + "logps/real": -244.25595092773438, + "loss": 0.3335, + "rewards/accuracies": 1.0, + "rewards/generated": -7.205714225769043, + "rewards/margins": 24.17291831970215, + "rewards/real": 16.96720314025879, + "step": 4230 + }, + { + "epoch": 0.5098605098605099, + "grad_norm": 4.895647233741109, + "learning_rate": 2.7231427044361303e-07, + "logits/generated": -2.158693790435791, + "logits/real": -2.3043694496154785, + "logps/generated": -489.5899353027344, + "logps/real": -231.9427032470703, + "loss": 0.4309, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.786470890045166, + "rewards/margins": 23.3662052154541, + "rewards/real": 20.579734802246094, + "step": 4240 + }, + { + "epoch": 0.5110630110630111, + "grad_norm": 61.37309275152032, + "learning_rate": 2.7164617851416354e-07, + "logits/generated": -2.2396254539489746, + "logits/real": -2.316911220550537, + "logps/generated": -544.9541625976562, + "logps/real": -302.1190185546875, + "loss": 0.7248, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": 2.833289623260498, + "rewards/margins": 22.39664077758789, + "rewards/real": 25.229928970336914, + "step": 4250 + }, + { + "epoch": 0.5122655122655123, + "grad_norm": 16.05886857939807, + "learning_rate": 2.7097808658471404e-07, + "logits/generated": -2.2014529705047607, + "logits/real": -2.3276166915893555, + "logps/generated": -540.337158203125, + "logps/real": -325.4220886230469, + "loss": 0.5844, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.1857694387435913, + "rewards/margins": 27.976749420166016, + "rewards/real": 26.79098129272461, + "step": 4260 + }, + { + "epoch": 0.5134680134680135, + "grad_norm": 154.12658595517766, + "learning_rate": 2.7030999465526455e-07, + "logits/generated": -2.1820855140686035, + "logits/real": -2.2535789012908936, + "logps/generated": -452.01446533203125, + "logps/real": -194.50796508789062, + "loss": 0.5686, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.419161319732666, + "rewards/margins": 18.813814163208008, + "rewards/real": 16.3946533203125, + "step": 4270 + }, + { + "epoch": 0.5146705146705147, + "grad_norm": 388.34247577646323, + "learning_rate": 2.6964190272581506e-07, + "logits/generated": -2.1291770935058594, + "logits/real": -2.2273473739624023, + "logps/generated": -515.061767578125, + "logps/real": -171.399658203125, + "loss": 0.7451, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -7.2243475914001465, + "rewards/margins": 21.68121910095215, + "rewards/real": 14.456872940063477, + "step": 4280 + }, + { + "epoch": 0.5158730158730159, + "grad_norm": 9.629887855782778, + "learning_rate": 2.6897381079636556e-07, + "logits/generated": -2.1529221534729004, + "logits/real": -2.2144412994384766, + "logps/generated": -592.399169921875, + "logps/real": -262.12939453125, + "loss": 0.8383, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.442110776901245, + "rewards/margins": 22.367305755615234, + "rewards/real": 19.925195693969727, + "step": 4290 + }, + { + "epoch": 0.517075517075517, + "grad_norm": 37.95328601503405, + "learning_rate": 2.6830571886691607e-07, + "logits/generated": -2.1695070266723633, + "logits/real": -2.266334056854248, + "logps/generated": -548.8338623046875, + "logps/real": -218.3822479248047, + "loss": 0.4624, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.688658237457275, + "rewards/margins": 21.59425163269043, + "rewards/real": 16.905593872070312, + "step": 4300 + }, + { + "epoch": 0.5182780182780182, + "grad_norm": 462.2514751934306, + "learning_rate": 2.6763762693746663e-07, + "logits/generated": -2.2629504203796387, + "logits/real": -2.304380416870117, + "logps/generated": -492.2303161621094, + "logps/real": -231.40292358398438, + "loss": 0.9736, + "rewards/accuracies": 0.875, + "rewards/generated": 0.7846146821975708, + "rewards/margins": 18.682939529418945, + "rewards/real": 19.467554092407227, + "step": 4310 + }, + { + "epoch": 0.5194805194805194, + "grad_norm": 642.0571785403265, + "learning_rate": 2.6696953500801714e-07, + "logits/generated": -2.064579486846924, + "logits/real": -2.1611812114715576, + "logps/generated": -471.14105224609375, + "logps/real": -195.31710815429688, + "loss": 0.5209, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.035947799682617, + "rewards/margins": 20.410633087158203, + "rewards/real": 14.37468433380127, + "step": 4320 + }, + { + "epoch": 0.5206830206830206, + "grad_norm": 4.065203395958874, + "learning_rate": 2.663014430785676e-07, + "logits/generated": -2.0566446781158447, + "logits/real": -2.177860975265503, + "logps/generated": -618.2171630859375, + "logps/real": -298.42803955078125, + "loss": 0.4679, + "rewards/accuracies": 1.0, + "rewards/generated": -10.0460844039917, + "rewards/margins": 29.6387996673584, + "rewards/real": 19.59271812438965, + "step": 4330 + }, + { + "epoch": 0.5218855218855218, + "grad_norm": 425.4068108907742, + "learning_rate": 2.656333511491181e-07, + "logits/generated": -2.126441478729248, + "logits/real": -2.2806851863861084, + "logps/generated": -589.7638549804688, + "logps/real": -284.98175048828125, + "loss": 0.3274, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.0182390213012695, + "rewards/margins": 29.235637664794922, + "rewards/real": 22.21739959716797, + "step": 4340 + }, + { + "epoch": 0.523088023088023, + "grad_norm": 63.78711095544259, + "learning_rate": 2.649652592196686e-07, + "logits/generated": -2.158569097518921, + "logits/real": -2.268720865249634, + "logps/generated": -575.865478515625, + "logps/real": -278.2831115722656, + "loss": 0.5682, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.7679094076156616, + "rewards/margins": 24.413375854492188, + "rewards/real": 22.645465850830078, + "step": 4350 + }, + { + "epoch": 0.5242905242905243, + "grad_norm": 32.803578430575214, + "learning_rate": 2.642971672902191e-07, + "logits/generated": -2.1932380199432373, + "logits/real": -2.247445821762085, + "logps/generated": -623.6664428710938, + "logps/real": -294.74267578125, + "loss": 0.4304, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.904718399047852, + "rewards/margins": 29.348217010498047, + "rewards/real": 24.443500518798828, + "step": 4360 + }, + { + "epoch": 0.5254930254930255, + "grad_norm": 13.421504285193922, + "learning_rate": 2.636290753607696e-07, + "logits/generated": -2.2893805503845215, + "logits/real": -2.3213396072387695, + "logps/generated": -472.1160583496094, + "logps/real": -254.22085571289062, + "loss": 0.3534, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.9855934977531433, + "rewards/margins": 19.296947479248047, + "rewards/real": 20.28253746032715, + "step": 4370 + }, + { + "epoch": 0.5266955266955267, + "grad_norm": 296.78305518652917, + "learning_rate": 2.629609834313201e-07, + "logits/generated": -2.2980878353118896, + "logits/real": -2.361016273498535, + "logps/generated": -516.3341674804688, + "logps/real": -303.3874206542969, + "loss": 0.5903, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 4.377728462219238, + "rewards/margins": 19.445972442626953, + "rewards/real": 23.823699951171875, + "step": 4380 + }, + { + "epoch": 0.5278980278980279, + "grad_norm": 233.02334201767775, + "learning_rate": 2.622928915018707e-07, + "logits/generated": -2.1487505435943604, + "logits/real": -2.2552433013916016, + "logps/generated": -536.1646118164062, + "logps/real": -255.44229125976562, + "loss": 0.4646, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -5.993595123291016, + "rewards/margins": 23.187070846557617, + "rewards/real": 17.1934757232666, + "step": 4390 + }, + { + "epoch": 0.5291005291005291, + "grad_norm": 28.309176553575497, + "learning_rate": 2.616247995724212e-07, + "logits/generated": -2.1555910110473633, + "logits/real": -2.258888006210327, + "logps/generated": -531.6666259765625, + "logps/real": -224.35879516601562, + "loss": 0.5644, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.229693412780762, + "rewards/margins": 24.458274841308594, + "rewards/real": 19.228580474853516, + "step": 4400 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 8.738143579495391, + "learning_rate": 2.609567076429717e-07, + "logits/generated": -2.0264885425567627, + "logits/real": -2.2401506900787354, + "logps/generated": -559.1680908203125, + "logps/real": -238.16748046875, + "loss": 0.2895, + "rewards/accuracies": 1.0, + "rewards/generated": -11.519235610961914, + "rewards/margins": 30.78560447692871, + "rewards/real": 19.266366958618164, + "step": 4410 + }, + { + "epoch": 0.5315055315055315, + "grad_norm": 45.80025387344876, + "learning_rate": 2.602886157135222e-07, + "logits/generated": -2.1714773178100586, + "logits/real": -2.288045883178711, + "logps/generated": -480.1744079589844, + "logps/real": -183.60899353027344, + "loss": 0.642, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.770855903625488, + "rewards/margins": 20.459308624267578, + "rewards/real": 15.688453674316406, + "step": 4420 + }, + { + "epoch": 0.5327080327080327, + "grad_norm": 315.0925130759843, + "learning_rate": 2.5962052378407265e-07, + "logits/generated": -2.2238011360168457, + "logits/real": -2.3348443508148193, + "logps/generated": -574.4429931640625, + "logps/real": -252.9526824951172, + "loss": 0.5518, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.0077054500579834, + "rewards/margins": 22.653966903686523, + "rewards/real": 21.646263122558594, + "step": 4430 + }, + { + "epoch": 0.5339105339105339, + "grad_norm": 3.65988090515442, + "learning_rate": 2.5895243185462316e-07, + "logits/generated": -2.117257595062256, + "logits/real": -2.20717191696167, + "logps/generated": -528.3685913085938, + "logps/real": -202.49948120117188, + "loss": 0.472, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.200154304504395, + "rewards/margins": 27.355255126953125, + "rewards/real": 17.155101776123047, + "step": 4440 + }, + { + "epoch": 0.5351130351130351, + "grad_norm": 27.958395736363162, + "learning_rate": 2.5828433992517367e-07, + "logits/generated": -2.157768726348877, + "logits/real": -2.287290096282959, + "logps/generated": -614.3516845703125, + "logps/real": -249.3728790283203, + "loss": 0.507, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.969240188598633, + "rewards/margins": 28.98443603515625, + "rewards/real": 18.015193939208984, + "step": 4450 + }, + { + "epoch": 0.5363155363155363, + "grad_norm": 145.6163028300804, + "learning_rate": 2.5761624799572417e-07, + "logits/generated": -2.1811509132385254, + "logits/real": -2.272597074508667, + "logps/generated": -468.0287170410156, + "logps/real": -245.0470733642578, + "loss": 0.8015, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 0.17038825154304504, + "rewards/margins": 21.22028160095215, + "rewards/real": 21.390670776367188, + "step": 4460 + }, + { + "epoch": 0.5375180375180375, + "grad_norm": 326.0530242436695, + "learning_rate": 2.5694815606627473e-07, + "logits/generated": -2.125600576400757, + "logits/real": -2.271019697189331, + "logps/generated": -618.333251953125, + "logps/real": -288.1407165527344, + "loss": 0.4697, + "rewards/accuracies": 0.875, + "rewards/generated": -10.342113494873047, + "rewards/margins": 30.96488380432129, + "rewards/real": 20.622770309448242, + "step": 4470 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 8.59878987701069, + "learning_rate": 2.5628006413682524e-07, + "logits/generated": -2.0594630241394043, + "logits/real": -2.1491947174072266, + "logps/generated": -571.4081420898438, + "logps/real": -204.54397583007812, + "loss": 0.6242, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -15.420013427734375, + "rewards/margins": 30.255401611328125, + "rewards/real": 14.8353910446167, + "step": 4480 + }, + { + "epoch": 0.5399230399230399, + "grad_norm": 18.588685320751384, + "learning_rate": 2.5561197220737575e-07, + "logits/generated": -2.122663974761963, + "logits/real": -2.1763739585876465, + "logps/generated": -630.9954223632812, + "logps/real": -239.72048950195312, + "loss": 0.7294, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -9.249320030212402, + "rewards/margins": 27.802181243896484, + "rewards/real": 18.552860260009766, + "step": 4490 + }, + { + "epoch": 0.5411255411255411, + "grad_norm": 7.443956790286945, + "learning_rate": 2.5494388027792625e-07, + "logits/generated": -2.0731163024902344, + "logits/real": -2.243588447570801, + "logps/generated": -573.4722900390625, + "logps/real": -237.854248046875, + "loss": 0.56, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.057721138000488, + "rewards/margins": 27.704166412353516, + "rewards/real": 18.646442413330078, + "step": 4500 + }, + { + "epoch": 0.5423280423280423, + "grad_norm": 590.8963263743082, + "learning_rate": 2.5427578834847676e-07, + "logits/generated": -2.1537926197052, + "logits/real": -2.278892993927002, + "logps/generated": -554.9658203125, + "logps/real": -272.32745361328125, + "loss": 0.4551, + "rewards/accuracies": 1.0, + "rewards/generated": -8.246979713439941, + "rewards/margins": 30.182857513427734, + "rewards/real": 21.935882568359375, + "step": 4510 + }, + { + "epoch": 0.5435305435305435, + "grad_norm": 9.00170587382552, + "learning_rate": 2.5360769641902726e-07, + "logits/generated": -2.189779281616211, + "logits/real": -2.1937317848205566, + "logps/generated": -582.6541748046875, + "logps/real": -279.0462646484375, + "loss": 0.75, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -4.090191841125488, + "rewards/margins": 24.9362850189209, + "rewards/real": 20.84609603881836, + "step": 4520 + }, + { + "epoch": 0.5447330447330447, + "grad_norm": 7.287777276537829, + "learning_rate": 2.529396044895777e-07, + "logits/generated": -2.156224012374878, + "logits/real": -2.2385141849517822, + "logps/generated": -508.9384765625, + "logps/real": -226.91232299804688, + "loss": 0.4841, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.3534417152404785, + "rewards/margins": 22.22745704650879, + "rewards/real": 17.874013900756836, + "step": 4530 + }, + { + "epoch": 0.5459355459355459, + "grad_norm": 9.560588297044037, + "learning_rate": 2.522715125601282e-07, + "logits/generated": -2.1847052574157715, + "logits/real": -2.221968412399292, + "logps/generated": -565.5045166015625, + "logps/real": -307.8379211425781, + "loss": 0.3461, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.35024070739746094, + "rewards/margins": 23.976543426513672, + "rewards/real": 23.626300811767578, + "step": 4540 + }, + { + "epoch": 0.5471380471380471, + "grad_norm": 855.0784316315297, + "learning_rate": 2.516034206306788e-07, + "logits/generated": -2.2074856758117676, + "logits/real": -2.2566702365875244, + "logps/generated": -506.1853942871094, + "logps/real": -253.6111297607422, + "loss": 0.6346, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 0.37258845567703247, + "rewards/margins": 21.079408645629883, + "rewards/real": 21.451997756958008, + "step": 4550 + }, + { + "epoch": 0.5483405483405484, + "grad_norm": 269.6334200133016, + "learning_rate": 2.509353287012293e-07, + "logits/generated": -2.172344207763672, + "logits/real": -2.2206785678863525, + "logps/generated": -537.7301025390625, + "logps/real": -306.47161865234375, + "loss": 0.8162, + "rewards/accuracies": 0.875, + "rewards/generated": -1.3855597972869873, + "rewards/margins": 20.728174209594727, + "rewards/real": 19.342613220214844, + "step": 4560 + }, + { + "epoch": 0.5495430495430496, + "grad_norm": 16.075733740721557, + "learning_rate": 2.502672367717798e-07, + "logits/generated": -2.1734321117401123, + "logits/real": -2.187448740005493, + "logps/generated": -473.0311584472656, + "logps/real": -198.8009796142578, + "loss": 0.66, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.780452728271484, + "rewards/margins": 21.286962509155273, + "rewards/real": 16.506511688232422, + "step": 4570 + }, + { + "epoch": 0.5507455507455508, + "grad_norm": 138.58626667433194, + "learning_rate": 2.495991448423303e-07, + "logits/generated": -2.16571307182312, + "logits/real": -2.2767515182495117, + "logps/generated": -563.0849609375, + "logps/real": -245.07321166992188, + "loss": 0.5309, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.1942694187164307, + "rewards/margins": 19.43132209777832, + "rewards/real": 18.237051010131836, + "step": 4580 + }, + { + "epoch": 0.551948051948052, + "grad_norm": 648.1946419644537, + "learning_rate": 2.489310529128808e-07, + "logits/generated": -2.1995575428009033, + "logits/real": -2.2965829372406006, + "logps/generated": -555.7725830078125, + "logps/real": -311.797607421875, + "loss": 0.8279, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.440613269805908, + "rewards/margins": 24.914764404296875, + "rewards/real": 22.474153518676758, + "step": 4590 + }, + { + "epoch": 0.5531505531505532, + "grad_norm": 647.4498063120639, + "learning_rate": 2.482629609834313e-07, + "logits/generated": -2.211308002471924, + "logits/real": -2.269942045211792, + "logps/generated": -506.86395263671875, + "logps/real": -261.13433837890625, + "loss": 0.4098, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.497762441635132, + "rewards/margins": 24.93470001220703, + "rewards/real": 22.436935424804688, + "step": 4600 + }, + { + "epoch": 0.5543530543530544, + "grad_norm": 200.72127308793227, + "learning_rate": 2.475948690539818e-07, + "logits/generated": -2.075261116027832, + "logits/real": -2.1782639026641846, + "logps/generated": -457.989990234375, + "logps/real": -176.88552856445312, + "loss": 0.5318, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.84760856628418, + "rewards/margins": 23.295930862426758, + "rewards/real": 14.448323249816895, + "step": 4610 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 27.45296345983844, + "learning_rate": 2.4692677712453233e-07, + "logits/generated": -2.098384380340576, + "logits/real": -2.219517469406128, + "logps/generated": -512.6092529296875, + "logps/real": -205.8456268310547, + "loss": 0.3834, + "rewards/accuracies": 1.0, + "rewards/generated": -9.539118766784668, + "rewards/margins": 27.148761749267578, + "rewards/real": 17.609642028808594, + "step": 4620 + }, + { + "epoch": 0.5567580567580568, + "grad_norm": 104.32724521467392, + "learning_rate": 2.4625868519508284e-07, + "logits/generated": -2.1575980186462402, + "logits/real": -2.1771247386932373, + "logps/generated": -608.8951416015625, + "logps/real": -234.56838989257812, + "loss": 0.7239, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.182953834533691, + "rewards/margins": 24.645427703857422, + "rewards/real": 18.462474822998047, + "step": 4630 + }, + { + "epoch": 0.557960557960558, + "grad_norm": 358.8584874258517, + "learning_rate": 2.4559059326563334e-07, + "logits/generated": -2.117887496948242, + "logits/real": -2.1815619468688965, + "logps/generated": -480.5862731933594, + "logps/real": -256.3266906738281, + "loss": 0.4473, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.405491828918457, + "rewards/margins": 24.457073211669922, + "rewards/real": 18.051584243774414, + "step": 4640 + }, + { + "epoch": 0.5591630591630592, + "grad_norm": 530.6903562987329, + "learning_rate": 2.4492250133618385e-07, + "logits/generated": -2.153294563293457, + "logits/real": -2.2766127586364746, + "logps/generated": -532.4159545898438, + "logps/real": -253.4622802734375, + "loss": 0.5976, + "rewards/accuracies": 0.875, + "rewards/generated": -3.4828033447265625, + "rewards/margins": 24.550065994262695, + "rewards/real": 21.067264556884766, + "step": 4650 + }, + { + "epoch": 0.5603655603655604, + "grad_norm": 114.18689365535329, + "learning_rate": 2.4425440940673435e-07, + "logits/generated": -2.113819122314453, + "logits/real": -2.2033469676971436, + "logps/generated": -414.91937255859375, + "logps/real": -219.30322265625, + "loss": 0.6139, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.890634536743164, + "rewards/margins": 19.421606063842773, + "rewards/real": 15.530969619750977, + "step": 4660 + }, + { + "epoch": 0.5615680615680616, + "grad_norm": 6.1113228792604755, + "learning_rate": 2.4358631747728486e-07, + "logits/generated": -2.1934733390808105, + "logits/real": -2.259469985961914, + "logps/generated": -512.7033081054688, + "logps/real": -256.7181701660156, + "loss": 0.4412, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.2184066772460938, + "rewards/margins": 25.44553565979004, + "rewards/real": 22.227128982543945, + "step": 4670 + }, + { + "epoch": 0.5627705627705628, + "grad_norm": 612.5756888336651, + "learning_rate": 2.4291822554783537e-07, + "logits/generated": -2.2330594062805176, + "logits/real": -2.2993900775909424, + "logps/generated": -575.5404052734375, + "logps/real": -278.91351318359375, + "loss": 0.5093, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.784644365310669, + "rewards/margins": 23.952312469482422, + "rewards/real": 21.167667388916016, + "step": 4680 + }, + { + "epoch": 0.563973063973064, + "grad_norm": 501.67042350771, + "learning_rate": 2.422501336183859e-07, + "logits/generated": -2.172921657562256, + "logits/real": -2.312257766723633, + "logps/generated": -566.7554321289062, + "logps/real": -301.3826599121094, + "loss": 0.5214, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.1573255062103271, + "rewards/margins": 25.16106605529785, + "rewards/real": 24.003740310668945, + "step": 4690 + }, + { + "epoch": 0.5651755651755652, + "grad_norm": 6.8216741272885395, + "learning_rate": 2.415820416889364e-07, + "logits/generated": -2.1608948707580566, + "logits/real": -2.173957109451294, + "logps/generated": -521.7059326171875, + "logps/real": -191.4599151611328, + "loss": 0.5399, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.457831382751465, + "rewards/margins": 20.805423736572266, + "rewards/real": 15.34759521484375, + "step": 4700 + }, + { + "epoch": 0.5663780663780664, + "grad_norm": 78.41588467368189, + "learning_rate": 2.409139497594869e-07, + "logits/generated": -2.216021776199341, + "logits/real": -2.2811038494110107, + "logps/generated": -452.0791015625, + "logps/real": -255.2354278564453, + "loss": 0.5296, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 2.0734400749206543, + "rewards/margins": 20.416227340698242, + "rewards/real": 22.489665985107422, + "step": 4710 + }, + { + "epoch": 0.5675805675805676, + "grad_norm": 346.7871605597502, + "learning_rate": 2.402458578300374e-07, + "logits/generated": -2.256971597671509, + "logits/real": -2.269321918487549, + "logps/generated": -584.9602661132812, + "logps/real": -244.5535430908203, + "loss": 0.6844, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.950667381286621, + "rewards/margins": 25.587900161743164, + "rewards/real": 20.637231826782227, + "step": 4720 + }, + { + "epoch": 0.5687830687830688, + "grad_norm": 10.61780206218379, + "learning_rate": 2.395777659005879e-07, + "logits/generated": -2.1691219806671143, + "logits/real": -2.277150869369507, + "logps/generated": -576.3107299804688, + "logps/real": -228.7597198486328, + "loss": 0.3825, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.529725074768066, + "rewards/margins": 27.679636001586914, + "rewards/real": 15.149909973144531, + "step": 4730 + }, + { + "epoch": 0.56998556998557, + "grad_norm": 48.30832593367765, + "learning_rate": 2.389096739711384e-07, + "logits/generated": -2.2049403190612793, + "logits/real": -2.2890119552612305, + "logps/generated": -489.464599609375, + "logps/real": -249.37661743164062, + "loss": 0.4363, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.34356689453125, + "rewards/margins": 23.882205963134766, + "rewards/real": 20.538637161254883, + "step": 4740 + }, + { + "epoch": 0.5711880711880711, + "grad_norm": 481.38532300937277, + "learning_rate": 2.3824158204168894e-07, + "logits/generated": -2.117734432220459, + "logits/real": -2.254347562789917, + "logps/generated": -598.91015625, + "logps/real": -262.19464111328125, + "loss": 0.594, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -10.40277099609375, + "rewards/margins": 33.866859436035156, + "rewards/real": 23.464086532592773, + "step": 4750 + }, + { + "epoch": 0.5723905723905723, + "grad_norm": 1172.5219283790636, + "learning_rate": 2.3757349011223942e-07, + "logits/generated": -2.2469725608825684, + "logits/real": -2.3247828483581543, + "logps/generated": -598.7657470703125, + "logps/real": -303.0007019042969, + "loss": 0.6705, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.37302571535110474, + "rewards/margins": 24.82962417602539, + "rewards/real": 25.20265007019043, + "step": 4760 + }, + { + "epoch": 0.5735930735930735, + "grad_norm": 118.60208214977199, + "learning_rate": 2.3690539818278993e-07, + "logits/generated": -2.162161111831665, + "logits/real": -2.188624620437622, + "logps/generated": -547.6588134765625, + "logps/real": -236.26162719726562, + "loss": 0.3569, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.1087751388549805, + "rewards/margins": 25.359819412231445, + "rewards/real": 20.251041412353516, + "step": 4770 + }, + { + "epoch": 0.5747955747955747, + "grad_norm": 140.3193831233246, + "learning_rate": 2.3623730625334046e-07, + "logits/generated": -2.1264591217041016, + "logits/real": -2.267110824584961, + "logps/generated": -461.0486755371094, + "logps/real": -199.8528594970703, + "loss": 0.6175, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.784832000732422, + "rewards/margins": 24.5319766998291, + "rewards/real": 16.74714469909668, + "step": 4780 + }, + { + "epoch": 0.575998075998076, + "grad_norm": 104.62016696163226, + "learning_rate": 2.3556921432389097e-07, + "logits/generated": -2.056239366531372, + "logits/real": -2.2162742614746094, + "logps/generated": -481.51568603515625, + "logps/real": -254.6533966064453, + "loss": 0.7163, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.585820198059082, + "rewards/margins": 27.165359497070312, + "rewards/real": 20.579540252685547, + "step": 4790 + }, + { + "epoch": 0.5772005772005772, + "grad_norm": 4.807259602897871, + "learning_rate": 2.3490112239444147e-07, + "logits/generated": -2.1776959896087646, + "logits/real": -2.2318732738494873, + "logps/generated": -485.2994079589844, + "logps/real": -257.42547607421875, + "loss": 0.4402, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.6174880862236023, + "rewards/margins": 22.350873947143555, + "rewards/real": 22.968364715576172, + "step": 4800 + }, + { + "epoch": 0.5784030784030784, + "grad_norm": 304.66402094080183, + "learning_rate": 2.3423303046499195e-07, + "logits/generated": -2.0449118614196777, + "logits/real": -2.196877956390381, + "logps/generated": -529.0111083984375, + "logps/real": -236.92764282226562, + "loss": 1.1566, + "rewards/accuracies": 1.0, + "rewards/generated": -5.8719611167907715, + "rewards/margins": 22.51728057861328, + "rewards/real": 16.64531898498535, + "step": 4810 + }, + { + "epoch": 0.5796055796055796, + "grad_norm": 45.365321601083735, + "learning_rate": 2.3356493853554248e-07, + "logits/generated": -2.1713552474975586, + "logits/real": -2.264538526535034, + "logps/generated": -631.4778442382812, + "logps/real": -281.14837646484375, + "loss": 0.4348, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.010066032409668, + "rewards/margins": 28.2640438079834, + "rewards/real": 24.253976821899414, + "step": 4820 + }, + { + "epoch": 0.5808080808080808, + "grad_norm": 275.7824791264617, + "learning_rate": 2.32896846606093e-07, + "logits/generated": -2.1254169940948486, + "logits/real": -2.2445826530456543, + "logps/generated": -427.7737731933594, + "logps/real": -225.2066192626953, + "loss": 0.6204, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -0.2635501027107239, + "rewards/margins": 19.014629364013672, + "rewards/real": 18.75107765197754, + "step": 4830 + }, + { + "epoch": 0.582010582010582, + "grad_norm": 11.846372987741605, + "learning_rate": 2.322287546766435e-07, + "logits/generated": -2.119321346282959, + "logits/real": -2.217468023300171, + "logps/generated": -502.81781005859375, + "logps/real": -218.6884307861328, + "loss": 0.4669, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.799521446228027, + "rewards/margins": 23.770450592041016, + "rewards/real": 16.970928192138672, + "step": 4840 + }, + { + "epoch": 0.5832130832130832, + "grad_norm": 61.45898220843556, + "learning_rate": 2.31560662747194e-07, + "logits/generated": -2.158651828765869, + "logits/real": -2.2841765880584717, + "logps/generated": -594.6893310546875, + "logps/real": -245.26876831054688, + "loss": 0.6698, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.5212578773498535, + "rewards/margins": 28.7156925201416, + "rewards/real": 23.194433212280273, + "step": 4850 + }, + { + "epoch": 0.5844155844155844, + "grad_norm": 239.29847546686884, + "learning_rate": 2.308925708177445e-07, + "logits/generated": -2.169623851776123, + "logits/real": -2.30018949508667, + "logps/generated": -599.7987670898438, + "logps/real": -324.08929443359375, + "loss": 0.5165, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.353647708892822, + "rewards/margins": 32.837196350097656, + "rewards/real": 27.48354721069336, + "step": 4860 + }, + { + "epoch": 0.5856180856180856, + "grad_norm": 30.769959529932, + "learning_rate": 2.3022447888829502e-07, + "logits/generated": -2.1507740020751953, + "logits/real": -2.128061532974243, + "logps/generated": -489.041748046875, + "logps/real": -224.7561492919922, + "loss": 0.4079, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.5154643058776855, + "rewards/margins": 23.196117401123047, + "rewards/real": 17.680652618408203, + "step": 4870 + }, + { + "epoch": 0.5868205868205868, + "grad_norm": 4.848906062900028, + "learning_rate": 2.2955638695884552e-07, + "logits/generated": -2.1520180702209473, + "logits/real": -2.237091064453125, + "logps/generated": -609.17041015625, + "logps/real": -242.2404022216797, + "loss": 0.2811, + "rewards/accuracies": 1.0, + "rewards/generated": -11.879868507385254, + "rewards/margins": 31.69455337524414, + "rewards/real": 19.81468391418457, + "step": 4880 + }, + { + "epoch": 0.588023088023088, + "grad_norm": 313.5974176423623, + "learning_rate": 2.2888829502939603e-07, + "logits/generated": -2.1980972290039062, + "logits/real": -2.259403705596924, + "logps/generated": -566.7474365234375, + "logps/real": -256.3862609863281, + "loss": 0.3452, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.331935167312622, + "rewards/margins": 24.445987701416016, + "rewards/real": 21.114049911499023, + "step": 4890 + }, + { + "epoch": 0.5892255892255892, + "grad_norm": 572.6774350234213, + "learning_rate": 2.2822020309994656e-07, + "logits/generated": -2.1327273845672607, + "logits/real": -2.1624417304992676, + "logps/generated": -560.4371948242188, + "logps/real": -244.71969604492188, + "loss": 0.7624, + "rewards/accuracies": 0.875, + "rewards/generated": -6.603463649749756, + "rewards/margins": 24.912017822265625, + "rewards/real": 18.308555603027344, + "step": 4900 + }, + { + "epoch": 0.5904280904280904, + "grad_norm": 37.134041407638364, + "learning_rate": 2.2755211117049704e-07, + "logits/generated": -2.1626598834991455, + "logits/real": -2.2314906120300293, + "logps/generated": -504.32177734375, + "logps/real": -207.5485382080078, + "loss": 0.5152, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.761170387268066, + "rewards/margins": 25.588455200195312, + "rewards/real": 17.827287673950195, + "step": 4910 + }, + { + "epoch": 0.5916305916305916, + "grad_norm": 410.62480019639355, + "learning_rate": 2.2688401924104755e-07, + "logits/generated": -2.0983004570007324, + "logits/real": -2.194437265396118, + "logps/generated": -481.0418395996094, + "logps/real": -225.505615234375, + "loss": 0.808, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.941951274871826, + "rewards/margins": 25.70985984802246, + "rewards/real": 20.76790428161621, + "step": 4920 + }, + { + "epoch": 0.5928330928330928, + "grad_norm": 3.2504897147129155, + "learning_rate": 2.2621592731159806e-07, + "logits/generated": -2.0753908157348633, + "logits/real": -2.1659369468688965, + "logps/generated": -538.7593994140625, + "logps/real": -209.56631469726562, + "loss": 0.6038, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.15286922454834, + "rewards/margins": 22.93928337097168, + "rewards/real": 17.786415100097656, + "step": 4930 + }, + { + "epoch": 0.594035594035594, + "grad_norm": 317.2347512715378, + "learning_rate": 2.255478353821486e-07, + "logits/generated": -2.15364408493042, + "logits/real": -2.2721619606018066, + "logps/generated": -505.0735778808594, + "logps/real": -261.0234375, + "loss": 0.8925, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.7675155401229858, + "rewards/margins": 21.99693489074707, + "rewards/real": 20.229415893554688, + "step": 4940 + }, + { + "epoch": 0.5952380952380952, + "grad_norm": 5.36587952872531, + "learning_rate": 2.248797434526991e-07, + "logits/generated": -2.0879244804382324, + "logits/real": -2.1579527854919434, + "logps/generated": -502.4501037597656, + "logps/real": -238.88949584960938, + "loss": 0.2876, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.519805908203125, + "rewards/margins": 25.457929611206055, + "rewards/real": 17.938121795654297, + "step": 4950 + }, + { + "epoch": 0.5964405964405964, + "grad_norm": 389.43453301017496, + "learning_rate": 2.2421165152324957e-07, + "logits/generated": -2.1876790523529053, + "logits/real": -2.260486125946045, + "logps/generated": -598.5149536132812, + "logps/real": -302.97760009765625, + "loss": 0.6785, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.6763100624084473, + "rewards/margins": 27.476821899414062, + "rewards/real": 24.80051040649414, + "step": 4960 + }, + { + "epoch": 0.5976430976430976, + "grad_norm": 249.65916655479336, + "learning_rate": 2.2354355959380008e-07, + "logits/generated": -2.1480464935302734, + "logits/real": -2.2145352363586426, + "logps/generated": -614.873046875, + "logps/real": -289.75042724609375, + "loss": 0.4912, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.20695424079895, + "rewards/margins": 25.05868148803711, + "rewards/real": 22.851726531982422, + "step": 4970 + }, + { + "epoch": 0.5988455988455988, + "grad_norm": 204.81001460914447, + "learning_rate": 2.2287546766435061e-07, + "logits/generated": -2.0724501609802246, + "logits/real": -2.2189583778381348, + "logps/generated": -591.15283203125, + "logps/real": -243.0890655517578, + "loss": 0.4145, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.009787559509277, + "rewards/margins": 32.64847183227539, + "rewards/real": 20.638683319091797, + "step": 4980 + }, + { + "epoch": 0.6000481000481, + "grad_norm": 4.543041941388648, + "learning_rate": 2.2220737573490112e-07, + "logits/generated": -2.071737766265869, + "logits/real": -2.21366286277771, + "logps/generated": -529.865966796875, + "logps/real": -186.53591918945312, + "loss": 0.2482, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.864470481872559, + "rewards/margins": 22.951242446899414, + "rewards/real": 15.086771011352539, + "step": 4990 + }, + { + "epoch": 0.6012506012506013, + "grad_norm": 14.838024605051249, + "learning_rate": 2.2153928380545163e-07, + "logits/generated": -2.2031359672546387, + "logits/real": -2.316646099090576, + "logps/generated": -499.90338134765625, + "logps/real": -267.6744079589844, + "loss": 0.418, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.471962928771973, + "rewards/margins": 26.8084659576416, + "rewards/real": 22.336502075195312, + "step": 5000 + }, + { + "epoch": 0.6012506012506013, + "eval_logits/generated": -2.207758903503418, + "eval_logits/real": -2.256751537322998, + "eval_logps/generated": -509.637451171875, + "eval_logps/real": -257.37213134765625, + "eval_loss": 0.4741517901420593, + "eval_rewards/accuracies": 0.961309552192688, + "eval_rewards/generated": -1.9855931997299194, + "eval_rewards/margins": 23.375478744506836, + "eval_rewards/real": 21.389888763427734, + "eval_runtime": 159.0112, + "eval_samples_per_second": 6.289, + "eval_steps_per_second": 0.528, + "step": 5000 + }, + { + "epoch": 0.6024531024531025, + "grad_norm": 112.36891804205959, + "learning_rate": 2.208711918760021e-07, + "logits/generated": -2.1393589973449707, + "logits/real": -2.206692934036255, + "logps/generated": -498.6484375, + "logps/real": -234.1495819091797, + "loss": 0.6687, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.348142623901367, + "rewards/margins": 23.36178207397461, + "rewards/real": 20.01363754272461, + "step": 5010 + }, + { + "epoch": 0.6036556036556037, + "grad_norm": 159.44117873507713, + "learning_rate": 2.2020309994655264e-07, + "logits/generated": -2.194477081298828, + "logits/real": -2.1287307739257812, + "logps/generated": -581.6893310546875, + "logps/real": -229.9732208251953, + "loss": 0.567, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -3.6853485107421875, + "rewards/margins": 23.484485626220703, + "rewards/real": 19.79913902282715, + "step": 5020 + }, + { + "epoch": 0.6048581048581049, + "grad_norm": 393.8714196973956, + "learning_rate": 2.1953500801710315e-07, + "logits/generated": -1.9988247156143188, + "logits/real": -2.2962424755096436, + "logps/generated": -581.6021118164062, + "logps/real": -289.51068115234375, + "loss": 0.541, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.950952053070068, + "rewards/margins": 30.75851821899414, + "rewards/real": 23.807567596435547, + "step": 5030 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 452.055430809275, + "learning_rate": 2.1886691608765365e-07, + "logits/generated": -2.1991615295410156, + "logits/real": -2.1468417644500732, + "logps/generated": -572.5194091796875, + "logps/real": -186.25791931152344, + "loss": 0.316, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.603403091430664, + "rewards/margins": 25.295948028564453, + "rewards/real": 16.69254493713379, + "step": 5040 + }, + { + "epoch": 0.6072631072631073, + "grad_norm": 4.622813566085611, + "learning_rate": 2.1819882415820416e-07, + "logits/generated": -2.07625150680542, + "logits/real": -2.1684768199920654, + "logps/generated": -437.0848083496094, + "logps/real": -192.69497680664062, + "loss": 0.2383, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.420557022094727, + "rewards/margins": 23.884294509887695, + "rewards/real": 15.463737487792969, + "step": 5050 + }, + { + "epoch": 0.6084656084656085, + "grad_norm": 696.8993519314927, + "learning_rate": 2.1753073222875467e-07, + "logits/generated": -2.1094517707824707, + "logits/real": -2.2138452529907227, + "logps/generated": -641.497314453125, + "logps/real": -246.94497680664062, + "loss": 0.7344, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -7.214853763580322, + "rewards/margins": 28.655588150024414, + "rewards/real": 21.440731048583984, + "step": 5060 + }, + { + "epoch": 0.6096681096681097, + "grad_norm": 26.17801486404397, + "learning_rate": 2.1686264029930517e-07, + "logits/generated": -2.01373291015625, + "logits/real": -2.134265422821045, + "logps/generated": -577.6567993164062, + "logps/real": -246.38558959960938, + "loss": 0.4124, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.266793251037598, + "rewards/margins": 29.215808868408203, + "rewards/real": 18.949010848999023, + "step": 5070 + }, + { + "epoch": 0.6108706108706109, + "grad_norm": 20.446915382607806, + "learning_rate": 2.1619454836985568e-07, + "logits/generated": -2.1533255577087402, + "logits/real": -2.224644184112549, + "logps/generated": -678.8082885742188, + "logps/real": -314.3406677246094, + "loss": 0.4912, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.711610317230225, + "rewards/margins": 28.086917877197266, + "rewards/real": 22.375307083129883, + "step": 5080 + }, + { + "epoch": 0.6120731120731121, + "grad_norm": 13.903932402505381, + "learning_rate": 2.1552645644040619e-07, + "logits/generated": -2.196721315383911, + "logits/real": -2.222590923309326, + "logps/generated": -572.3262939453125, + "logps/real": -248.94873046875, + "loss": 0.507, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.296078205108643, + "rewards/margins": 28.022445678710938, + "rewards/real": 21.726369857788086, + "step": 5090 + }, + { + "epoch": 0.6132756132756133, + "grad_norm": 10.818548609026534, + "learning_rate": 2.1485836451095672e-07, + "logits/generated": -2.0338661670684814, + "logits/real": -2.21252179145813, + "logps/generated": -399.80047607421875, + "logps/real": -232.8976287841797, + "loss": 0.6458, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.923448085784912, + "rewards/margins": 21.71235466003418, + "rewards/real": 18.78890609741211, + "step": 5100 + }, + { + "epoch": 0.6144781144781145, + "grad_norm": 45.33669880957139, + "learning_rate": 2.141902725815072e-07, + "logits/generated": -1.9484567642211914, + "logits/real": -2.158186435699463, + "logps/generated": -486.42034912109375, + "logps/real": -195.52969360351562, + "loss": 0.5066, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.924624443054199, + "rewards/margins": 25.316829681396484, + "rewards/real": 17.3922061920166, + "step": 5110 + }, + { + "epoch": 0.6156806156806157, + "grad_norm": 13.4415581964778, + "learning_rate": 2.135221806520577e-07, + "logits/generated": -2.100008010864258, + "logits/real": -2.19741153717041, + "logps/generated": -582.730224609375, + "logps/real": -227.67434692382812, + "loss": 0.3412, + "rewards/accuracies": 1.0, + "rewards/generated": -9.561922073364258, + "rewards/margins": 27.262313842773438, + "rewards/real": 17.70039176940918, + "step": 5120 + }, + { + "epoch": 0.6168831168831169, + "grad_norm": 94.93494353194716, + "learning_rate": 2.128540887226082e-07, + "logits/generated": -2.14306902885437, + "logits/real": -2.1899149417877197, + "logps/generated": -427.8895568847656, + "logps/real": -209.77548217773438, + "loss": 0.3483, + "rewards/accuracies": 1.0, + "rewards/generated": -2.1006529331207275, + "rewards/margins": 20.818208694458008, + "rewards/real": 18.71755599975586, + "step": 5130 + }, + { + "epoch": 0.6180856180856181, + "grad_norm": 206.2417095504339, + "learning_rate": 2.1218599679315874e-07, + "logits/generated": -2.167590379714966, + "logits/real": -2.1929023265838623, + "logps/generated": -542.5330810546875, + "logps/real": -215.34066772460938, + "loss": 0.5407, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.4143515825271606, + "rewards/margins": 19.591306686401367, + "rewards/real": 18.17695426940918, + "step": 5140 + }, + { + "epoch": 0.6192881192881193, + "grad_norm": 10.163706195238307, + "learning_rate": 2.1151790486370925e-07, + "logits/generated": -2.1078526973724365, + "logits/real": -2.1608290672302246, + "logps/generated": -559.0823364257812, + "logps/real": -183.32273864746094, + "loss": 0.4086, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.760201454162598, + "rewards/margins": 26.649608612060547, + "rewards/real": 15.889409065246582, + "step": 5150 + }, + { + "epoch": 0.6204906204906205, + "grad_norm": 25.25433949615869, + "learning_rate": 2.1084981293425973e-07, + "logits/generated": -2.070712089538574, + "logits/real": -2.1418638229370117, + "logps/generated": -536.2274169921875, + "logps/real": -262.9801940917969, + "loss": 0.4258, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -1.6793304681777954, + "rewards/margins": 22.470706939697266, + "rewards/real": 20.79137420654297, + "step": 5160 + }, + { + "epoch": 0.6216931216931217, + "grad_norm": 4.333580297827752, + "learning_rate": 2.1018172100481024e-07, + "logits/generated": -2.023705244064331, + "logits/real": -2.0775601863861084, + "logps/generated": -642.1519775390625, + "logps/real": -283.9187316894531, + "loss": 0.5697, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -6.47070837020874, + "rewards/margins": 29.55684471130371, + "rewards/real": 23.086135864257812, + "step": 5170 + }, + { + "epoch": 0.622895622895623, + "grad_norm": 25.221501546308797, + "learning_rate": 2.0951362907536077e-07, + "logits/generated": -2.1672415733337402, + "logits/real": -2.0736351013183594, + "logps/generated": -578.4718017578125, + "logps/real": -255.50955200195312, + "loss": 0.4278, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.31916409730911255, + "rewards/margins": 20.810937881469727, + "rewards/real": 20.49177360534668, + "step": 5180 + }, + { + "epoch": 0.6240981240981242, + "grad_norm": 8.732133139172033, + "learning_rate": 2.0884553714591128e-07, + "logits/generated": -2.047583818435669, + "logits/real": -2.1747946739196777, + "logps/generated": -596.5800170898438, + "logps/real": -201.64271545410156, + "loss": 0.5734, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.516444206237793, + "rewards/margins": 28.915258407592773, + "rewards/real": 18.398815155029297, + "step": 5190 + }, + { + "epoch": 0.6253006253006252, + "grad_norm": 370.3254087257309, + "learning_rate": 2.0817744521646178e-07, + "logits/generated": -2.069950580596924, + "logits/real": -2.2123332023620605, + "logps/generated": -665.2952880859375, + "logps/real": -226.219482421875, + "loss": 0.5487, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -14.587949752807617, + "rewards/margins": 34.09016799926758, + "rewards/real": 19.50221824645996, + "step": 5200 + }, + { + "epoch": 0.6265031265031265, + "grad_norm": 5.007378874706131, + "learning_rate": 2.0750935328701226e-07, + "logits/generated": -2.0197086334228516, + "logits/real": -2.12208890914917, + "logps/generated": -512.1553955078125, + "logps/real": -155.11798095703125, + "loss": 0.3861, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.422812461853027, + "rewards/margins": 25.80704689025879, + "rewards/real": 13.384236335754395, + "step": 5210 + }, + { + "epoch": 0.6277056277056277, + "grad_norm": 135.436747139612, + "learning_rate": 2.068412613575628e-07, + "logits/generated": -2.103518009185791, + "logits/real": -2.1637232303619385, + "logps/generated": -487.9015197753906, + "logps/real": -233.65478515625, + "loss": 0.7396, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -2.6520657539367676, + "rewards/margins": 24.12938117980957, + "rewards/real": 21.477313995361328, + "step": 5220 + }, + { + "epoch": 0.6289081289081289, + "grad_norm": 26.49116652112335, + "learning_rate": 2.061731694281133e-07, + "logits/generated": -2.0646491050720215, + "logits/real": -2.1591761112213135, + "logps/generated": -614.2501220703125, + "logps/real": -311.14678955078125, + "loss": 0.5308, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -5.64684534072876, + "rewards/margins": 30.806594848632812, + "rewards/real": 25.15974998474121, + "step": 5230 + }, + { + "epoch": 0.6301106301106301, + "grad_norm": 36.768821984186204, + "learning_rate": 2.055050774986638e-07, + "logits/generated": -2.1185431480407715, + "logits/real": -2.1948776245117188, + "logps/generated": -635.8701171875, + "logps/real": -278.8142395019531, + "loss": 0.3912, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.881353378295898, + "rewards/margins": 28.80423927307129, + "rewards/real": 19.92288589477539, + "step": 5240 + }, + { + "epoch": 0.6313131313131313, + "grad_norm": 447.741760926687, + "learning_rate": 2.0483698556921431e-07, + "logits/generated": -2.1760456562042236, + "logits/real": -2.197248935699463, + "logps/generated": -524.3167114257812, + "logps/real": -197.69581604003906, + "loss": 0.7793, + "rewards/accuracies": 0.875, + "rewards/generated": -1.4320223331451416, + "rewards/margins": 19.761924743652344, + "rewards/real": 18.32990264892578, + "step": 5250 + }, + { + "epoch": 0.6325156325156325, + "grad_norm": 150.43983607994468, + "learning_rate": 2.0416889363976482e-07, + "logits/generated": -2.1038804054260254, + "logits/real": -2.1359784603118896, + "logps/generated": -433.695556640625, + "logps/real": -245.3255615234375, + "loss": 0.7071, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 1.2940962314605713, + "rewards/margins": 18.276500701904297, + "rewards/real": 19.570598602294922, + "step": 5260 + }, + { + "epoch": 0.6337181337181337, + "grad_norm": 13.51250696223638, + "learning_rate": 2.0350080171031533e-07, + "logits/generated": -2.062173843383789, + "logits/real": -2.139639377593994, + "logps/generated": -596.686279296875, + "logps/real": -313.1416931152344, + "loss": 0.3623, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.309140682220459, + "rewards/margins": 27.867406845092773, + "rewards/real": 22.558265686035156, + "step": 5270 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 13.454090853723232, + "learning_rate": 2.0283270978086583e-07, + "logits/generated": -2.088893413543701, + "logits/real": -2.1333022117614746, + "logps/generated": -550.6292724609375, + "logps/real": -226.11404418945312, + "loss": 0.5424, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.3938703536987305, + "rewards/margins": 26.277690887451172, + "rewards/real": 18.883819580078125, + "step": 5280 + }, + { + "epoch": 0.6361231361231361, + "grad_norm": 162.15397128407542, + "learning_rate": 2.0216461785141634e-07, + "logits/generated": -2.1805367469787598, + "logits/real": -2.2109501361846924, + "logps/generated": -442.6311950683594, + "logps/real": -217.99612426757812, + "loss": 0.3433, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.4469165802001953, + "rewards/margins": 21.494651794433594, + "rewards/real": 18.047733306884766, + "step": 5290 + }, + { + "epoch": 0.6373256373256373, + "grad_norm": 43.28127747297377, + "learning_rate": 2.0149652592196687e-07, + "logits/generated": -2.2064998149871826, + "logits/real": -2.196180820465088, + "logps/generated": -554.3094482421875, + "logps/real": -271.9448547363281, + "loss": 0.4625, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.1922428607940674, + "rewards/margins": 26.602619171142578, + "rewards/real": 23.41037940979004, + "step": 5300 + }, + { + "epoch": 0.6385281385281385, + "grad_norm": 355.1036412273798, + "learning_rate": 2.0082843399251735e-07, + "logits/generated": -2.1402595043182373, + "logits/real": -2.250767946243286, + "logps/generated": -532.1314697265625, + "logps/real": -248.40756225585938, + "loss": 0.4621, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.158871650695801, + "rewards/margins": 24.802583694458008, + "rewards/real": 19.64371109008789, + "step": 5310 + }, + { + "epoch": 0.6397306397306397, + "grad_norm": 43.81793349573058, + "learning_rate": 2.0016034206306786e-07, + "logits/generated": -2.138998031616211, + "logits/real": -2.1637184619903564, + "logps/generated": -660.0232543945312, + "logps/real": -312.3558349609375, + "loss": 0.5127, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.3712105751037598, + "rewards/margins": 29.01308822631836, + "rewards/real": 25.64188003540039, + "step": 5320 + }, + { + "epoch": 0.6409331409331409, + "grad_norm": 28.48245777540848, + "learning_rate": 1.9949225013361837e-07, + "logits/generated": -2.059576988220215, + "logits/real": -2.113861560821533, + "logps/generated": -755.4419555664062, + "logps/real": -264.72528076171875, + "loss": 0.5073, + "rewards/accuracies": 1.0, + "rewards/generated": -18.074052810668945, + "rewards/margins": 38.974754333496094, + "rewards/real": 20.900699615478516, + "step": 5330 + }, + { + "epoch": 0.6421356421356421, + "grad_norm": 8.245086448679869, + "learning_rate": 1.988241582041689e-07, + "logits/generated": -2.0729799270629883, + "logits/real": -2.182405471801758, + "logps/generated": -580.021484375, + "logps/real": -287.5953063964844, + "loss": 0.4583, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.655449867248535, + "rewards/margins": 27.83284568786621, + "rewards/real": 22.17739486694336, + "step": 5340 + }, + { + "epoch": 0.6433381433381433, + "grad_norm": 1149.7886967956752, + "learning_rate": 1.981560662747194e-07, + "logits/generated": -2.096813201904297, + "logits/real": -2.152869939804077, + "logps/generated": -550.7825317382812, + "logps/real": -268.78656005859375, + "loss": 0.5668, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.9959911108016968, + "rewards/margins": 22.33914566040039, + "rewards/real": 20.343156814575195, + "step": 5350 + }, + { + "epoch": 0.6445406445406445, + "grad_norm": 195.37705731684127, + "learning_rate": 1.9748797434526989e-07, + "logits/generated": -2.0178475379943848, + "logits/real": -2.0801384449005127, + "logps/generated": -467.127197265625, + "logps/real": -196.71151733398438, + "loss": 0.4073, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -6.072965621948242, + "rewards/margins": 22.643278121948242, + "rewards/real": 16.570314407348633, + "step": 5360 + }, + { + "epoch": 0.6457431457431457, + "grad_norm": 653.2545289632761, + "learning_rate": 1.968198824158204e-07, + "logits/generated": -2.115100383758545, + "logits/real": -2.1691863536834717, + "logps/generated": -491.2147521972656, + "logps/real": -195.5085906982422, + "loss": 0.6017, + "rewards/accuracies": 1.0, + "rewards/generated": -4.717627048492432, + "rewards/margins": 22.232973098754883, + "rewards/real": 17.515344619750977, + "step": 5370 + }, + { + "epoch": 0.6469456469456469, + "grad_norm": 4.235068886873702, + "learning_rate": 1.9615179048637093e-07, + "logits/generated": -2.1100916862487793, + "logits/real": -2.1362085342407227, + "logps/generated": -632.2305908203125, + "logps/real": -290.26898193359375, + "loss": 0.3229, + "rewards/accuracies": 1.0, + "rewards/generated": -7.390206336975098, + "rewards/margins": 30.07211685180664, + "rewards/real": 22.68191146850586, + "step": 5380 + }, + { + "epoch": 0.6481481481481481, + "grad_norm": 758.1775530985985, + "learning_rate": 1.9548369855692143e-07, + "logits/generated": -2.063777446746826, + "logits/real": -2.129456043243408, + "logps/generated": -422.287109375, + "logps/real": -223.12283325195312, + "loss": 0.5461, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.549731254577637, + "rewards/margins": 21.80051612854004, + "rewards/real": 17.25078582763672, + "step": 5390 + }, + { + "epoch": 0.6493506493506493, + "grad_norm": 4.045416311092152, + "learning_rate": 1.9481560662747194e-07, + "logits/generated": -2.0610225200653076, + "logits/real": -2.075570821762085, + "logps/generated": -612.0631103515625, + "logps/real": -253.8446044921875, + "loss": 0.9237, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.8615546226501465, + "rewards/margins": 24.515655517578125, + "rewards/real": 16.654102325439453, + "step": 5400 + }, + { + "epoch": 0.6505531505531505, + "grad_norm": 296.0250028944842, + "learning_rate": 1.9414751469802242e-07, + "logits/generated": -1.991396188735962, + "logits/real": -2.116002321243286, + "logps/generated": -589.1071166992188, + "logps/real": -270.3807067871094, + "loss": 0.2863, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.402120590209961, + "rewards/margins": 30.735759735107422, + "rewards/real": 22.33364486694336, + "step": 5410 + }, + { + "epoch": 0.6517556517556518, + "grad_norm": 501.9845126673603, + "learning_rate": 1.9347942276857295e-07, + "logits/generated": -2.101869821548462, + "logits/real": -2.1424288749694824, + "logps/generated": -531.9818115234375, + "logps/real": -191.2215118408203, + "loss": 0.5794, + "rewards/accuracies": 1.0, + "rewards/generated": -5.969061851501465, + "rewards/margins": 24.978412628173828, + "rewards/real": 19.00935173034668, + "step": 5420 + }, + { + "epoch": 0.652958152958153, + "grad_norm": 15.897220212742806, + "learning_rate": 1.9281133083912346e-07, + "logits/generated": -2.0708134174346924, + "logits/real": -2.177319049835205, + "logps/generated": -529.0983276367188, + "logps/real": -257.287109375, + "loss": 0.4172, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.855011940002441, + "rewards/margins": 27.917526245117188, + "rewards/real": 22.062515258789062, + "step": 5430 + }, + { + "epoch": 0.6541606541606542, + "grad_norm": 167.7270009301335, + "learning_rate": 1.9214323890967396e-07, + "logits/generated": -2.202213764190674, + "logits/real": -2.2001771926879883, + "logps/generated": -585.6710815429688, + "logps/real": -242.23291015625, + "loss": 0.5382, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.8997087478637695, + "rewards/margins": 27.777307510375977, + "rewards/real": 21.87760353088379, + "step": 5440 + }, + { + "epoch": 0.6553631553631554, + "grad_norm": 296.6462020101403, + "learning_rate": 1.9147514698022447e-07, + "logits/generated": -2.138925075531006, + "logits/real": -2.197624921798706, + "logps/generated": -473.5198669433594, + "logps/real": -224.0499267578125, + "loss": 0.6335, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.7679665088653564, + "rewards/margins": 21.625890731811523, + "rewards/real": 17.857921600341797, + "step": 5450 + }, + { + "epoch": 0.6565656565656566, + "grad_norm": 4.355353127300036, + "learning_rate": 1.9080705505077498e-07, + "logits/generated": -2.145355701446533, + "logits/real": -2.248900890350342, + "logps/generated": -412.97637939453125, + "logps/real": -194.98733520507812, + "loss": 0.3381, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.5382111072540283, + "rewards/margins": 20.178089141845703, + "rewards/real": 16.639875411987305, + "step": 5460 + }, + { + "epoch": 0.6577681577681578, + "grad_norm": 4.318768270434199, + "learning_rate": 1.9013896312132548e-07, + "logits/generated": -2.0456652641296387, + "logits/real": -2.139618396759033, + "logps/generated": -522.5445556640625, + "logps/real": -227.39199829101562, + "loss": 0.444, + "rewards/accuracies": 1.0, + "rewards/generated": -5.697910785675049, + "rewards/margins": 26.31026840209961, + "rewards/real": 20.61235809326172, + "step": 5470 + }, + { + "epoch": 0.658970658970659, + "grad_norm": 6.296367735745215, + "learning_rate": 1.89470871191876e-07, + "logits/generated": -2.0240321159362793, + "logits/real": -2.1139636039733887, + "logps/generated": -528.0909423828125, + "logps/real": -231.0218963623047, + "loss": 0.4341, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.761052131652832, + "rewards/margins": 27.318090438842773, + "rewards/real": 19.557037353515625, + "step": 5480 + }, + { + "epoch": 0.6601731601731602, + "grad_norm": 373.84841874812463, + "learning_rate": 1.888027792624265e-07, + "logits/generated": -2.052645444869995, + "logits/real": -2.149437427520752, + "logps/generated": -547.0812377929688, + "logps/real": -232.49789428710938, + "loss": 0.3688, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.1844024658203125, + "rewards/margins": 25.815088272094727, + "rewards/real": 18.630680084228516, + "step": 5490 + }, + { + "epoch": 0.6613756613756614, + "grad_norm": 550.8932954693026, + "learning_rate": 1.8813468733297703e-07, + "logits/generated": -2.090024709701538, + "logits/real": -2.1519649028778076, + "logps/generated": -482.130615234375, + "logps/real": -256.81280517578125, + "loss": 0.5207, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -0.7443699836730957, + "rewards/margins": 20.62835693359375, + "rewards/real": 19.883983612060547, + "step": 5500 + }, + { + "epoch": 0.6625781625781626, + "grad_norm": 10.982656623342988, + "learning_rate": 1.874665954035275e-07, + "logits/generated": -2.1782007217407227, + "logits/real": -2.216689109802246, + "logps/generated": -604.4592895507812, + "logps/real": -289.37969970703125, + "loss": 0.4007, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.4544551372528076, + "rewards/margins": 26.7467041015625, + "rewards/real": 25.292251586914062, + "step": 5510 + }, + { + "epoch": 0.6637806637806638, + "grad_norm": 632.2383449502196, + "learning_rate": 1.8679850347407802e-07, + "logits/generated": -2.1914610862731934, + "logits/real": -2.211108684539795, + "logps/generated": -595.5673217773438, + "logps/real": -277.1487731933594, + "loss": 0.6555, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.866485595703125, + "rewards/margins": 23.17861557006836, + "rewards/real": 24.045101165771484, + "step": 5520 + }, + { + "epoch": 0.664983164983165, + "grad_norm": 22.145461961198414, + "learning_rate": 1.8613041154462852e-07, + "logits/generated": -1.9987192153930664, + "logits/real": -2.1111345291137695, + "logps/generated": -458.30291748046875, + "logps/real": -181.0555419921875, + "loss": 0.4282, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.222566604614258, + "rewards/margins": 23.844867706298828, + "rewards/real": 14.622297286987305, + "step": 5530 + }, + { + "epoch": 0.6661856661856662, + "grad_norm": 31.09422927151055, + "learning_rate": 1.8546231961517905e-07, + "logits/generated": -2.0543627738952637, + "logits/real": -2.140010356903076, + "logps/generated": -481.1874084472656, + "logps/real": -235.50100708007812, + "loss": 0.3559, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.622293472290039, + "rewards/margins": 23.292308807373047, + "rewards/real": 18.670015335083008, + "step": 5540 + }, + { + "epoch": 0.6673881673881674, + "grad_norm": 182.68321796237728, + "learning_rate": 1.8479422768572956e-07, + "logits/generated": -2.1397509574890137, + "logits/real": -2.166752815246582, + "logps/generated": -607.8590698242188, + "logps/real": -323.15216064453125, + "loss": 0.8327, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.7382205128669739, + "rewards/margins": 29.84052085876465, + "rewards/real": 29.1023006439209, + "step": 5550 + }, + { + "epoch": 0.6685906685906686, + "grad_norm": 624.3960294411138, + "learning_rate": 1.8412613575628004e-07, + "logits/generated": -1.9738527536392212, + "logits/real": -2.0829389095306396, + "logps/generated": -466.9283752441406, + "logps/real": -216.74368286132812, + "loss": 0.413, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.393265724182129, + "rewards/margins": 23.90040397644043, + "rewards/real": 17.507137298583984, + "step": 5560 + }, + { + "epoch": 0.6697931697931698, + "grad_norm": 200.27427354551966, + "learning_rate": 1.8345804382683055e-07, + "logits/generated": -2.095628499984741, + "logits/real": -2.0717434883117676, + "logps/generated": -503.68194580078125, + "logps/real": -270.31744384765625, + "loss": 0.494, + "rewards/accuracies": 0.800000011920929, + "rewards/generated": 2.8924455642700195, + "rewards/margins": 18.55295753479004, + "rewards/real": 21.445402145385742, + "step": 5570 + }, + { + "epoch": 0.670995670995671, + "grad_norm": 147.98607420805476, + "learning_rate": 1.8278995189738108e-07, + "logits/generated": -2.0196242332458496, + "logits/real": -2.0777719020843506, + "logps/generated": -483.0901794433594, + "logps/real": -207.4373321533203, + "loss": 0.3298, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.44189739227295, + "rewards/margins": 24.731090545654297, + "rewards/real": 16.289196014404297, + "step": 5580 + }, + { + "epoch": 0.6721981721981722, + "grad_norm": 8.402665709870938, + "learning_rate": 1.821218599679316e-07, + "logits/generated": -2.0823845863342285, + "logits/real": -2.113119602203369, + "logps/generated": -497.16619873046875, + "logps/real": -248.0476837158203, + "loss": 0.5447, + "rewards/accuracies": 1.0, + "rewards/generated": -3.768075466156006, + "rewards/margins": 23.414453506469727, + "rewards/real": 19.646379470825195, + "step": 5590 + }, + { + "epoch": 0.6734006734006734, + "grad_norm": 188.20614427776823, + "learning_rate": 1.814537680384821e-07, + "logits/generated": -2.0918502807617188, + "logits/real": -2.226760149002075, + "logps/generated": -531.3416137695312, + "logps/real": -247.00234985351562, + "loss": 0.3956, + "rewards/accuracies": 1.0, + "rewards/generated": -6.44674825668335, + "rewards/margins": 27.317821502685547, + "rewards/real": 20.871074676513672, + "step": 5600 + }, + { + "epoch": 0.6746031746031746, + "grad_norm": 377.66966547315417, + "learning_rate": 1.8078567610903257e-07, + "logits/generated": -2.034510850906372, + "logits/real": -2.0546746253967285, + "logps/generated": -477.59014892578125, + "logps/real": -204.1554718017578, + "loss": 0.4414, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.174881935119629, + "rewards/margins": 20.559953689575195, + "rewards/real": 16.385068893432617, + "step": 5610 + }, + { + "epoch": 0.6758056758056759, + "grad_norm": 17.852773970762474, + "learning_rate": 1.801175841795831e-07, + "logits/generated": -1.9572585821151733, + "logits/real": -2.0911784172058105, + "logps/generated": -576.0623779296875, + "logps/real": -236.39163208007812, + "loss": 0.3946, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.064367294311523, + "rewards/margins": 31.457971572875977, + "rewards/real": 20.393604278564453, + "step": 5620 + }, + { + "epoch": 0.6770081770081771, + "grad_norm": 12.508713084195344, + "learning_rate": 1.794494922501336e-07, + "logits/generated": -2.0246353149414062, + "logits/real": -2.0877811908721924, + "logps/generated": -526.1453857421875, + "logps/real": -233.11831665039062, + "loss": 0.6675, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.685117244720459, + "rewards/margins": 26.600414276123047, + "rewards/real": 19.91529655456543, + "step": 5630 + }, + { + "epoch": 0.6782106782106783, + "grad_norm": 3.189240337896366, + "learning_rate": 1.7878140032068412e-07, + "logits/generated": -2.022733211517334, + "logits/real": -2.0900216102600098, + "logps/generated": -552.0997924804688, + "logps/real": -228.74978637695312, + "loss": 0.323, + "rewards/accuracies": 1.0, + "rewards/generated": -3.4965484142303467, + "rewards/margins": 24.248926162719727, + "rewards/real": 20.752376556396484, + "step": 5640 + }, + { + "epoch": 0.6794131794131794, + "grad_norm": 622.0812126260017, + "learning_rate": 1.7811330839123463e-07, + "logits/generated": -2.0119502544403076, + "logits/real": -2.050553798675537, + "logps/generated": -435.39068603515625, + "logps/real": -227.7612762451172, + "loss": 0.551, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -5.234732151031494, + "rewards/margins": 22.789058685302734, + "rewards/real": 17.5543270111084, + "step": 5650 + }, + { + "epoch": 0.6806156806156806, + "grad_norm": 5.240742719763742, + "learning_rate": 1.7744521646178516e-07, + "logits/generated": -2.0209696292877197, + "logits/real": -2.0807480812072754, + "logps/generated": -569.5696411132812, + "logps/real": -262.31939697265625, + "loss": 0.4188, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.3068060874938965, + "rewards/margins": 27.74716567993164, + "rewards/real": 24.44036102294922, + "step": 5660 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 21.301646869618097, + "learning_rate": 1.7677712453233564e-07, + "logits/generated": -2.172571897506714, + "logits/real": -2.1529417037963867, + "logps/generated": -620.6819458007812, + "logps/real": -294.45037841796875, + "loss": 0.3965, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.7491583824157715, + "rewards/margins": 26.50079917907715, + "rewards/real": 25.75164222717285, + "step": 5670 + }, + { + "epoch": 0.683020683020683, + "grad_norm": 271.4522301801874, + "learning_rate": 1.7610903260288615e-07, + "logits/generated": -2.031538724899292, + "logits/real": -2.1985344886779785, + "logps/generated": -451.4507751464844, + "logps/real": -258.37646484375, + "loss": 0.5469, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -1.760457992553711, + "rewards/margins": 22.76521873474121, + "rewards/real": 21.004764556884766, + "step": 5680 + }, + { + "epoch": 0.6842231842231842, + "grad_norm": 99.75632907600144, + "learning_rate": 1.7544094067343665e-07, + "logits/generated": -2.0461623668670654, + "logits/real": -2.0558812618255615, + "logps/generated": -610.9781494140625, + "logps/real": -274.8094787597656, + "loss": 0.7229, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -5.510365962982178, + "rewards/margins": 26.819034576416016, + "rewards/real": 21.308670043945312, + "step": 5690 + }, + { + "epoch": 0.6854256854256854, + "grad_norm": 50.96943824689933, + "learning_rate": 1.7477284874398718e-07, + "logits/generated": -2.001626491546631, + "logits/real": -2.144613742828369, + "logps/generated": -557.7528686523438, + "logps/real": -265.2804260253906, + "loss": 0.5109, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.805296897888184, + "rewards/margins": 26.44843101501465, + "rewards/real": 20.643136978149414, + "step": 5700 + }, + { + "epoch": 0.6866281866281866, + "grad_norm": 5.990396117820854, + "learning_rate": 1.741047568145377e-07, + "logits/generated": -2.0240745544433594, + "logits/real": -2.1288633346557617, + "logps/generated": -508.2197265625, + "logps/real": -285.49688720703125, + "loss": 0.7271, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": 0.6658209562301636, + "rewards/margins": 24.800508499145508, + "rewards/real": 25.46632957458496, + "step": 5710 + }, + { + "epoch": 0.6878306878306878, + "grad_norm": 3.140266926337619, + "learning_rate": 1.7343666488508817e-07, + "logits/generated": -2.002182722091675, + "logits/real": -2.0924344062805176, + "logps/generated": -647.659912109375, + "logps/real": -244.3607940673828, + "loss": 0.5404, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.544800758361816, + "rewards/margins": 30.63179588317871, + "rewards/real": 23.08699607849121, + "step": 5720 + }, + { + "epoch": 0.689033189033189, + "grad_norm": 567.7624407796745, + "learning_rate": 1.7276857295563868e-07, + "logits/generated": -1.9848625659942627, + "logits/real": -2.1359121799468994, + "logps/generated": -606.2243041992188, + "logps/real": -305.01019287109375, + "loss": 0.6091, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.167637348175049, + "rewards/margins": 31.583202362060547, + "rewards/real": 25.415569305419922, + "step": 5730 + }, + { + "epoch": 0.6902356902356902, + "grad_norm": 322.53393331117076, + "learning_rate": 1.721004810261892e-07, + "logits/generated": -2.008922576904297, + "logits/real": -2.085409641265869, + "logps/generated": -479.20367431640625, + "logps/real": -213.8471221923828, + "loss": 0.5368, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.688708782196045, + "rewards/margins": 21.00283432006836, + "rewards/real": 16.314123153686523, + "step": 5740 + }, + { + "epoch": 0.6914381914381914, + "grad_norm": 9.76122340799412, + "learning_rate": 1.7143238909673972e-07, + "logits/generated": -2.0706915855407715, + "logits/real": -2.057990312576294, + "logps/generated": -579.1304321289062, + "logps/real": -279.77874755859375, + "loss": 0.6366, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.460413932800293, + "rewards/margins": 25.602603912353516, + "rewards/real": 19.142189025878906, + "step": 5750 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 18.230797953612804, + "learning_rate": 1.7076429716729022e-07, + "logits/generated": -1.9432156085968018, + "logits/real": -2.0407614707946777, + "logps/generated": -468.81231689453125, + "logps/real": -285.3664245605469, + "loss": 0.452, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.701001167297363, + "rewards/margins": 25.964786529541016, + "rewards/real": 21.263784408569336, + "step": 5760 + }, + { + "epoch": 0.6938431938431938, + "grad_norm": 2.952253383717668, + "learning_rate": 1.700962052378407e-07, + "logits/generated": -1.9354089498519897, + "logits/real": -2.0565319061279297, + "logps/generated": -650.11083984375, + "logps/real": -210.1114044189453, + "loss": 0.5012, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -14.890890121459961, + "rewards/margins": 33.58374786376953, + "rewards/real": 18.692859649658203, + "step": 5770 + }, + { + "epoch": 0.695045695045695, + "grad_norm": 108.8064878680952, + "learning_rate": 1.6942811330839124e-07, + "logits/generated": -1.940023422241211, + "logits/real": -2.0163187980651855, + "logps/generated": -510.94744873046875, + "logps/real": -232.2707977294922, + "loss": 0.5536, + "rewards/accuracies": 0.875, + "rewards/generated": -3.8307113647460938, + "rewards/margins": 23.60468101501465, + "rewards/real": 19.773967742919922, + "step": 5780 + }, + { + "epoch": 0.6962481962481962, + "grad_norm": 164.3427417398238, + "learning_rate": 1.6876002137894174e-07, + "logits/generated": -2.051631212234497, + "logits/real": -2.044851779937744, + "logps/generated": -502.2870178222656, + "logps/real": -201.31678771972656, + "loss": 0.3162, + "rewards/accuracies": 1.0, + "rewards/generated": -7.455059051513672, + "rewards/margins": 25.29348373413086, + "rewards/real": 17.838424682617188, + "step": 5790 + }, + { + "epoch": 0.6974506974506974, + "grad_norm": 29.78272376689156, + "learning_rate": 1.6809192944949225e-07, + "logits/generated": -2.0994935035705566, + "logits/real": -2.1238558292388916, + "logps/generated": -623.0423583984375, + "logps/real": -267.8221435546875, + "loss": 0.6061, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.353062152862549, + "rewards/margins": 28.156200408935547, + "rewards/real": 24.80313491821289, + "step": 5800 + }, + { + "epoch": 0.6986531986531986, + "grad_norm": 4.69845125660451, + "learning_rate": 1.6742383752004276e-07, + "logits/generated": -1.9189367294311523, + "logits/real": -2.0586776733398438, + "logps/generated": -522.8191528320312, + "logps/real": -200.33062744140625, + "loss": 0.6415, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.614595413208008, + "rewards/margins": 27.717693328857422, + "rewards/real": 17.103099822998047, + "step": 5810 + }, + { + "epoch": 0.6998556998556998, + "grad_norm": 90.75671553332367, + "learning_rate": 1.6675574559059326e-07, + "logits/generated": -2.0881330966949463, + "logits/real": -2.0540738105773926, + "logps/generated": -507.21990966796875, + "logps/real": -233.40072631835938, + "loss": 0.3856, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.9376654624938965, + "rewards/margins": 24.910873413085938, + "rewards/real": 20.973209381103516, + "step": 5820 + }, + { + "epoch": 0.701058201058201, + "grad_norm": 132.21204600637103, + "learning_rate": 1.6608765366114377e-07, + "logits/generated": -2.0674076080322266, + "logits/real": -2.0564589500427246, + "logps/generated": -470.0575256347656, + "logps/real": -280.09295654296875, + "loss": 0.4871, + "rewards/accuracies": 1.0, + "rewards/generated": -3.8033595085144043, + "rewards/margins": 26.579105377197266, + "rewards/real": 22.775747299194336, + "step": 5830 + }, + { + "epoch": 0.7022607022607023, + "grad_norm": 89.54335729475466, + "learning_rate": 1.6541956173169427e-07, + "logits/generated": -1.9924952983856201, + "logits/real": -2.0267462730407715, + "logps/generated": -573.144775390625, + "logps/real": -223.96987915039062, + "loss": 0.5523, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.361065864562988, + "rewards/margins": 23.596874237060547, + "rewards/real": 18.235807418823242, + "step": 5840 + }, + { + "epoch": 0.7034632034632035, + "grad_norm": 320.41593254069187, + "learning_rate": 1.6475146980224478e-07, + "logits/generated": -2.0367844104766846, + "logits/real": -2.0753395557403564, + "logps/generated": -585.0662841796875, + "logps/real": -220.3904571533203, + "loss": 0.469, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -12.774155616760254, + "rewards/margins": 27.506397247314453, + "rewards/real": 14.732243537902832, + "step": 5850 + }, + { + "epoch": 0.7046657046657047, + "grad_norm": 6.764828870520677, + "learning_rate": 1.640833778727953e-07, + "logits/generated": -2.030487060546875, + "logits/real": -2.0653622150421143, + "logps/generated": -505.8202209472656, + "logps/real": -328.5468444824219, + "loss": 0.5061, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.1627795696258545, + "rewards/margins": 24.476404190063477, + "rewards/real": 24.313623428344727, + "step": 5860 + }, + { + "epoch": 0.7058682058682059, + "grad_norm": 16.987320333356795, + "learning_rate": 1.634152859433458e-07, + "logits/generated": -1.9706943035125732, + "logits/real": -2.079078435897827, + "logps/generated": -696.0484619140625, + "logps/real": -298.5466003417969, + "loss": 0.8071, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -8.095951080322266, + "rewards/margins": 31.24196434020996, + "rewards/real": 23.146013259887695, + "step": 5870 + }, + { + "epoch": 0.7070707070707071, + "grad_norm": 15.059796604406513, + "learning_rate": 1.627471940138963e-07, + "logits/generated": -1.966286063194275, + "logits/real": -2.106269121170044, + "logps/generated": -605.654296875, + "logps/real": -273.3966979980469, + "loss": 0.6209, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -8.536871910095215, + "rewards/margins": 28.432214736938477, + "rewards/real": 19.895343780517578, + "step": 5880 + }, + { + "epoch": 0.7082732082732083, + "grad_norm": 27.996792266130523, + "learning_rate": 1.620791020844468e-07, + "logits/generated": -2.0093941688537598, + "logits/real": -2.0758864879608154, + "logps/generated": -580.8387451171875, + "logps/real": -225.3408966064453, + "loss": 0.4954, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.541053295135498, + "rewards/margins": 24.935813903808594, + "rewards/real": 19.39476203918457, + "step": 5890 + }, + { + "epoch": 0.7094757094757095, + "grad_norm": 593.1716641091133, + "learning_rate": 1.6141101015499731e-07, + "logits/generated": -2.03402042388916, + "logits/real": -2.0171828269958496, + "logps/generated": -601.9954833984375, + "logps/real": -259.0872497558594, + "loss": 0.5209, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.456506252288818, + "rewards/margins": 29.47256088256836, + "rewards/real": 23.016056060791016, + "step": 5900 + }, + { + "epoch": 0.7106782106782107, + "grad_norm": 6.097200252474049, + "learning_rate": 1.6074291822554785e-07, + "logits/generated": -1.9546695947647095, + "logits/real": -2.1085476875305176, + "logps/generated": -585.7703857421875, + "logps/real": -260.1246337890625, + "loss": 0.3801, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.080541133880615, + "rewards/margins": 30.35806655883789, + "rewards/real": 23.277523040771484, + "step": 5910 + }, + { + "epoch": 0.7118807118807119, + "grad_norm": 5.425167454398412, + "learning_rate": 1.6007482629609833e-07, + "logits/generated": -2.052748203277588, + "logits/real": -2.085552215576172, + "logps/generated": -553.0166625976562, + "logps/real": -243.44729614257812, + "loss": 0.3871, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.7129673957824707, + "rewards/margins": 22.420289993286133, + "rewards/real": 19.707324981689453, + "step": 5920 + }, + { + "epoch": 0.7130832130832131, + "grad_norm": 194.37528617332168, + "learning_rate": 1.5940673436664883e-07, + "logits/generated": -1.8565423488616943, + "logits/real": -2.048894166946411, + "logps/generated": -611.2785034179688, + "logps/real": -222.39208984375, + "loss": 0.5031, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.9310884475708, + "rewards/margins": 30.078760147094727, + "rewards/real": 18.14767074584961, + "step": 5930 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 584.9361748337963, + "learning_rate": 1.5873864243719934e-07, + "logits/generated": -2.0264651775360107, + "logits/real": -1.9897197484970093, + "logps/generated": -498.8528747558594, + "logps/real": -231.07693481445312, + "loss": 0.5602, + "rewards/accuracies": 0.875, + "rewards/generated": -3.90936541557312, + "rewards/margins": 22.191932678222656, + "rewards/real": 18.28256607055664, + "step": 5940 + }, + { + "epoch": 0.7154882154882155, + "grad_norm": 73.77401089746371, + "learning_rate": 1.5807055050774987e-07, + "logits/generated": -1.9922844171524048, + "logits/real": -2.053990602493286, + "logps/generated": -582.9013671875, + "logps/real": -260.50164794921875, + "loss": 0.2838, + "rewards/accuracies": 1.0, + "rewards/generated": -3.423548936843872, + "rewards/margins": 27.149404525756836, + "rewards/real": 23.72585678100586, + "step": 5950 + }, + { + "epoch": 0.7166907166907167, + "grad_norm": 264.109642798798, + "learning_rate": 1.5740245857830038e-07, + "logits/generated": -1.9862995147705078, + "logits/real": -1.9136476516723633, + "logps/generated": -516.9673461914062, + "logps/real": -205.4317626953125, + "loss": 0.3064, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.891024589538574, + "rewards/margins": 24.194900512695312, + "rewards/real": 16.303874969482422, + "step": 5960 + }, + { + "epoch": 0.7178932178932179, + "grad_norm": 82.02801608322811, + "learning_rate": 1.5673436664885086e-07, + "logits/generated": -2.0520498752593994, + "logits/real": -2.0354788303375244, + "logps/generated": -523.3507690429688, + "logps/real": -227.84402465820312, + "loss": 0.565, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": 0.1284686028957367, + "rewards/margins": 19.374645233154297, + "rewards/real": 19.503116607666016, + "step": 5970 + }, + { + "epoch": 0.7190957190957191, + "grad_norm": 202.0180463553884, + "learning_rate": 1.5606627471940136e-07, + "logits/generated": -2.0054705142974854, + "logits/real": -2.126939058303833, + "logps/generated": -569.4356689453125, + "logps/real": -259.4375, + "loss": 0.4142, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.6010572910308838, + "rewards/margins": 21.88002586364746, + "rewards/real": 22.4810848236084, + "step": 5980 + }, + { + "epoch": 0.7202982202982203, + "grad_norm": 16.238480948127794, + "learning_rate": 1.553981827899519e-07, + "logits/generated": -1.9652553796768188, + "logits/real": -1.9859685897827148, + "logps/generated": -503.29608154296875, + "logps/real": -235.278564453125, + "loss": 0.3261, + "rewards/accuracies": 1.0, + "rewards/generated": -4.676265716552734, + "rewards/margins": 25.252643585205078, + "rewards/real": 20.576377868652344, + "step": 5990 + }, + { + "epoch": 0.7215007215007215, + "grad_norm": 4.694603966980558, + "learning_rate": 1.547300908605024e-07, + "logits/generated": -1.9758752584457397, + "logits/real": -2.0604944229125977, + "logps/generated": -581.5227661132812, + "logps/real": -266.998291015625, + "loss": 0.4272, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.795146942138672, + "rewards/margins": 28.78839111328125, + "rewards/real": 21.993244171142578, + "step": 6000 + }, + { + "epoch": 0.7215007215007215, + "eval_logits/generated": -2.024104595184326, + "eval_logits/real": -2.0559768676757812, + "eval_logps/generated": -516.4866333007812, + "eval_logps/real": -255.58375549316406, + "eval_loss": 0.4181888997554779, + "eval_rewards/accuracies": 0.9583333134651184, + "eval_rewards/generated": -2.6705148220062256, + "eval_rewards/margins": 24.239242553710938, + "eval_rewards/real": 21.568729400634766, + "eval_runtime": 158.085, + "eval_samples_per_second": 6.326, + "eval_steps_per_second": 0.531, + "step": 6000 + }, + { + "epoch": 0.7227032227032227, + "grad_norm": 4.4590254558840705, + "learning_rate": 1.540619989310529e-07, + "logits/generated": -2.055990219116211, + "logits/real": -2.114842176437378, + "logps/generated": -518.754638671875, + "logps/real": -254.91616821289062, + "loss": 0.6621, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.7546546459198, + "rewards/margins": 24.697486877441406, + "rewards/real": 21.94283103942871, + "step": 6010 + }, + { + "epoch": 0.7239057239057239, + "grad_norm": 200.0553975932058, + "learning_rate": 1.533939070016034e-07, + "logits/generated": -1.9855819940567017, + "logits/real": -2.0946459770202637, + "logps/generated": -540.9707641601562, + "logps/real": -222.64013671875, + "loss": 0.365, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.403491973876953, + "rewards/margins": 29.19797706604004, + "rewards/real": 19.794485092163086, + "step": 6020 + }, + { + "epoch": 0.7251082251082251, + "grad_norm": 3.003949807765099, + "learning_rate": 1.5272581507215392e-07, + "logits/generated": -1.8843629360198975, + "logits/real": -2.0339131355285645, + "logps/generated": -535.8517456054688, + "logps/real": -227.3661346435547, + "loss": 0.4721, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -5.808887958526611, + "rewards/margins": 24.485149383544922, + "rewards/real": 18.676258087158203, + "step": 6030 + }, + { + "epoch": 0.7263107263107264, + "grad_norm": 232.73246617854906, + "learning_rate": 1.5205772314270443e-07, + "logits/generated": -2.076704978942871, + "logits/real": -2.051879405975342, + "logps/generated": -525.3424072265625, + "logps/real": -320.03985595703125, + "loss": 0.3817, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -0.6194555163383484, + "rewards/margins": 25.497285842895508, + "rewards/real": 24.87782859802246, + "step": 6040 + }, + { + "epoch": 0.7275132275132276, + "grad_norm": 955.3834726679851, + "learning_rate": 1.5138963121325494e-07, + "logits/generated": -2.004138946533203, + "logits/real": -2.0520410537719727, + "logps/generated": -532.3648071289062, + "logps/real": -284.3950500488281, + "loss": 0.4399, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.6230058670043945, + "rewards/margins": 29.23556137084961, + "rewards/real": 23.612552642822266, + "step": 6050 + }, + { + "epoch": 0.7287157287157288, + "grad_norm": 428.49457126504836, + "learning_rate": 1.5072153928380544e-07, + "logits/generated": -2.0264859199523926, + "logits/real": -1.9557311534881592, + "logps/generated": -502.14862060546875, + "logps/real": -221.5615997314453, + "loss": 0.4106, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.6217753887176514, + "rewards/margins": 22.318477630615234, + "rewards/real": 18.696704864501953, + "step": 6060 + }, + { + "epoch": 0.72991822991823, + "grad_norm": 5.379462742272412, + "learning_rate": 1.5005344735435595e-07, + "logits/generated": -2.019275426864624, + "logits/real": -2.112255334854126, + "logps/generated": -654.1192016601562, + "logps/real": -278.43084716796875, + "loss": 0.3359, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.167357921600342, + "rewards/margins": 29.26619529724121, + "rewards/real": 25.09884262084961, + "step": 6070 + }, + { + "epoch": 0.7311207311207312, + "grad_norm": 4.860276357003056, + "learning_rate": 1.4938535542490646e-07, + "logits/generated": -2.0853078365325928, + "logits/real": -2.122804641723633, + "logps/generated": -506.30419921875, + "logps/real": -212.27352905273438, + "loss": 0.5013, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.887076377868652, + "rewards/margins": 28.033214569091797, + "rewards/real": 19.146141052246094, + "step": 6080 + }, + { + "epoch": 0.7323232323232324, + "grad_norm": 113.56043168777462, + "learning_rate": 1.4871726349545696e-07, + "logits/generated": -2.0797107219696045, + "logits/real": -2.0907530784606934, + "logps/generated": -433.6893615722656, + "logps/real": -209.5296173095703, + "loss": 0.6372, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -2.3338398933410645, + "rewards/margins": 21.73061752319336, + "rewards/real": 19.396774291992188, + "step": 6090 + }, + { + "epoch": 0.7335257335257336, + "grad_norm": 178.6721479700718, + "learning_rate": 1.4804917156600747e-07, + "logits/generated": -1.9706804752349854, + "logits/real": -2.0371265411376953, + "logps/generated": -474.51690673828125, + "logps/real": -227.3787384033203, + "loss": 0.6643, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.39561653137207, + "rewards/margins": 23.835596084594727, + "rewards/real": 19.439977645874023, + "step": 6100 + }, + { + "epoch": 0.7347282347282347, + "grad_norm": 6.262325763848138, + "learning_rate": 1.47381079636558e-07, + "logits/generated": -1.9313364028930664, + "logits/real": -1.927351951599121, + "logps/generated": -519.8493041992188, + "logps/real": -231.4251251220703, + "loss": 0.5785, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.794744491577148, + "rewards/margins": 22.81560707092285, + "rewards/real": 18.020862579345703, + "step": 6110 + }, + { + "epoch": 0.7359307359307359, + "grad_norm": 626.9184452960253, + "learning_rate": 1.4671298770710848e-07, + "logits/generated": -1.9691321849822998, + "logits/real": -2.0907559394836426, + "logps/generated": -637.7929077148438, + "logps/real": -257.151123046875, + "loss": 0.5389, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.425045013427734, + "rewards/margins": 28.295806884765625, + "rewards/real": 22.87076187133789, + "step": 6120 + }, + { + "epoch": 0.7371332371332371, + "grad_norm": 188.77303204853564, + "learning_rate": 1.46044895777659e-07, + "logits/generated": -1.9789931774139404, + "logits/real": -2.029228687286377, + "logps/generated": -568.4287719726562, + "logps/real": -198.3484344482422, + "loss": 0.3117, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.4860520362854, + "rewards/margins": 23.19948959350586, + "rewards/real": 16.713438034057617, + "step": 6130 + }, + { + "epoch": 0.7383357383357383, + "grad_norm": 7.017986930229006, + "learning_rate": 1.453768038482095e-07, + "logits/generated": -1.9300180673599243, + "logits/real": -2.0610289573669434, + "logps/generated": -556.9735107421875, + "logps/real": -248.61489868164062, + "loss": 0.3547, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.820444583892822, + "rewards/margins": 23.19576644897461, + "rewards/real": 18.375324249267578, + "step": 6140 + }, + { + "epoch": 0.7395382395382395, + "grad_norm": 13.35008192716633, + "learning_rate": 1.4470871191876003e-07, + "logits/generated": -1.946213722229004, + "logits/real": -2.075281858444214, + "logps/generated": -557.5587768554688, + "logps/real": -218.5151824951172, + "loss": 0.6689, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.02005386352539, + "rewards/margins": 28.047130584716797, + "rewards/real": 20.027080535888672, + "step": 6150 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 9.684037448105407, + "learning_rate": 1.4404061998931053e-07, + "logits/generated": -2.0908665657043457, + "logits/real": -2.102464199066162, + "logps/generated": -444.47808837890625, + "logps/real": -203.1186981201172, + "loss": 0.6941, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.7162132263183594, + "rewards/margins": 19.0086612701416, + "rewards/real": 18.29244613647461, + "step": 6160 + }, + { + "epoch": 0.7419432419432419, + "grad_norm": 10.34774445413632, + "learning_rate": 1.4337252805986101e-07, + "logits/generated": -1.8997472524642944, + "logits/real": -2.0007100105285645, + "logps/generated": -667.5238037109375, + "logps/real": -193.0680694580078, + "loss": 0.3097, + "rewards/accuracies": 1.0, + "rewards/generated": -13.620000839233398, + "rewards/margins": 33.0391731262207, + "rewards/real": 19.419170379638672, + "step": 6170 + }, + { + "epoch": 0.7431457431457431, + "grad_norm": 561.9992672748349, + "learning_rate": 1.4270443613041152e-07, + "logits/generated": -2.072580337524414, + "logits/real": -2.018799304962158, + "logps/generated": -526.4044189453125, + "logps/real": -262.51361083984375, + "loss": 0.6514, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.1388444900512695, + "rewards/margins": 26.15079116821289, + "rewards/real": 21.011945724487305, + "step": 6180 + }, + { + "epoch": 0.7443482443482443, + "grad_norm": 4.51774798412955, + "learning_rate": 1.4203634420096205e-07, + "logits/generated": -1.9605543613433838, + "logits/real": -2.016939163208008, + "logps/generated": -458.9302673339844, + "logps/real": -176.44093322753906, + "loss": 0.2359, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.735542297363281, + "rewards/margins": 25.70003318786621, + "rewards/real": 15.964492797851562, + "step": 6190 + }, + { + "epoch": 0.7455507455507455, + "grad_norm": 805.110846002169, + "learning_rate": 1.4136825227151256e-07, + "logits/generated": -2.067617893218994, + "logits/real": -2.074852466583252, + "logps/generated": -521.3026123046875, + "logps/real": -248.1642608642578, + "loss": 0.3533, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -0.04916229099035263, + "rewards/margins": 21.618560791015625, + "rewards/real": 21.569400787353516, + "step": 6200 + }, + { + "epoch": 0.7467532467532467, + "grad_norm": 4.917719176714294, + "learning_rate": 1.4070016034206307e-07, + "logits/generated": -2.026477813720703, + "logits/real": -2.0135531425476074, + "logps/generated": -603.4595947265625, + "logps/real": -240.67446899414062, + "loss": 0.2961, + "rewards/accuracies": 1.0, + "rewards/generated": -9.173086166381836, + "rewards/margins": 29.656009674072266, + "rewards/real": 20.482921600341797, + "step": 6210 + }, + { + "epoch": 0.7479557479557479, + "grad_norm": 30.80535543136375, + "learning_rate": 1.4003206841261355e-07, + "logits/generated": -1.9997466802597046, + "logits/real": -2.16223406791687, + "logps/generated": -482.9454040527344, + "logps/real": -225.31100463867188, + "loss": 0.4796, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.104976177215576, + "rewards/margins": 26.778528213500977, + "rewards/real": 20.67354965209961, + "step": 6220 + }, + { + "epoch": 0.7491582491582491, + "grad_norm": 5.194576848457341, + "learning_rate": 1.3936397648316408e-07, + "logits/generated": -1.9211280345916748, + "logits/real": -2.0602774620056152, + "logps/generated": -482.47930908203125, + "logps/real": -195.07928466796875, + "loss": 0.5361, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.157212257385254, + "rewards/margins": 25.305757522583008, + "rewards/real": 16.14854621887207, + "step": 6230 + }, + { + "epoch": 0.7503607503607503, + "grad_norm": 12.592927666077152, + "learning_rate": 1.3869588455371459e-07, + "logits/generated": -1.9879329204559326, + "logits/real": -2.105809211730957, + "logps/generated": -668.959228515625, + "logps/real": -240.8298797607422, + "loss": 0.5272, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.408947944641113, + "rewards/margins": 31.326852798461914, + "rewards/real": 20.917903900146484, + "step": 6240 + }, + { + "epoch": 0.7515632515632515, + "grad_norm": 6.367074419526923, + "learning_rate": 1.380277926242651e-07, + "logits/generated": -1.926006555557251, + "logits/real": -2.0523173809051514, + "logps/generated": -558.9520263671875, + "logps/real": -211.4641571044922, + "loss": 0.3938, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.068964004516602, + "rewards/margins": 25.008941650390625, + "rewards/real": 18.939977645874023, + "step": 6250 + }, + { + "epoch": 0.7527657527657527, + "grad_norm": 9.75024619425914, + "learning_rate": 1.373597006948156e-07, + "logits/generated": -2.0039355754852295, + "logits/real": -2.078279495239258, + "logps/generated": -633.4993286132812, + "logps/real": -246.19046020507812, + "loss": 0.7311, + "rewards/accuracies": 1.0, + "rewards/generated": -6.896474361419678, + "rewards/margins": 29.550867080688477, + "rewards/real": 22.654390335083008, + "step": 6260 + }, + { + "epoch": 0.753968253968254, + "grad_norm": 6.1874289646490634, + "learning_rate": 1.366916087653661e-07, + "logits/generated": -2.0454373359680176, + "logits/real": -2.014486789703369, + "logps/generated": -567.9034423828125, + "logps/real": -252.9257049560547, + "loss": 0.351, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -3.65690279006958, + "rewards/margins": 25.702701568603516, + "rewards/real": 22.045795440673828, + "step": 6270 + }, + { + "epoch": 0.7551707551707552, + "grad_norm": 78.83575585100137, + "learning_rate": 1.360235168359166e-07, + "logits/generated": -1.9779586791992188, + "logits/real": -2.050172805786133, + "logps/generated": -512.7911987304688, + "logps/real": -200.3998565673828, + "loss": 0.4225, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.451817512512207, + "rewards/margins": 28.826004028320312, + "rewards/real": 19.37418556213379, + "step": 6280 + }, + { + "epoch": 0.7563732563732564, + "grad_norm": 151.34186107944922, + "learning_rate": 1.3535542490646712e-07, + "logits/generated": -1.965846061706543, + "logits/real": -2.0169053077697754, + "logps/generated": -538.800048828125, + "logps/real": -193.9132537841797, + "loss": 0.9546, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.19506549835205, + "rewards/margins": 27.961584091186523, + "rewards/real": 18.766517639160156, + "step": 6290 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 36.86441070717078, + "learning_rate": 1.3468733297701762e-07, + "logits/generated": -1.937940001487732, + "logits/real": -1.9921560287475586, + "logps/generated": -608.825439453125, + "logps/real": -214.90573120117188, + "loss": 0.2837, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.684560775756836, + "rewards/margins": 28.25661849975586, + "rewards/real": 17.572059631347656, + "step": 6300 + }, + { + "epoch": 0.7587782587782588, + "grad_norm": 5.9247245043509045, + "learning_rate": 1.3401924104756816e-07, + "logits/generated": -1.9313808679580688, + "logits/real": -1.9859815835952759, + "logps/generated": -597.9343872070312, + "logps/real": -250.38735961914062, + "loss": 0.5041, + "rewards/accuracies": 1.0, + "rewards/generated": -15.1850004196167, + "rewards/margins": 33.82495880126953, + "rewards/real": 18.639957427978516, + "step": 6310 + }, + { + "epoch": 0.75998075998076, + "grad_norm": 57.69450815352928, + "learning_rate": 1.3335114911811864e-07, + "logits/generated": -2.0614192485809326, + "logits/real": -2.0687270164489746, + "logps/generated": -633.4412841796875, + "logps/real": -255.1488037109375, + "loss": 0.8344, + "rewards/accuracies": 1.0, + "rewards/generated": -7.370565891265869, + "rewards/margins": 29.701574325561523, + "rewards/real": 22.331008911132812, + "step": 6320 + }, + { + "epoch": 0.7611832611832612, + "grad_norm": 17.046417261383656, + "learning_rate": 1.3268305718866914e-07, + "logits/generated": -1.9685719013214111, + "logits/real": -2.101987361907959, + "logps/generated": -700.80517578125, + "logps/real": -268.86553955078125, + "loss": 0.5641, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -11.582005500793457, + "rewards/margins": 31.284893035888672, + "rewards/real": 19.702882766723633, + "step": 6330 + }, + { + "epoch": 0.7623857623857624, + "grad_norm": 57.379866714393906, + "learning_rate": 1.3201496525921965e-07, + "logits/generated": -1.9597723484039307, + "logits/real": -2.05348801612854, + "logps/generated": -749.7882080078125, + "logps/real": -304.0247802734375, + "loss": 0.421, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.726692199707031, + "rewards/margins": 34.836952209472656, + "rewards/real": 23.11026382446289, + "step": 6340 + }, + { + "epoch": 0.7635882635882636, + "grad_norm": 9.697832231116402, + "learning_rate": 1.3134687332977018e-07, + "logits/generated": -2.0030665397644043, + "logits/real": -2.068018913269043, + "logps/generated": -565.5142822265625, + "logps/real": -237.0439453125, + "loss": 0.5869, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.754905700683594, + "rewards/margins": 25.79233169555664, + "rewards/real": 20.037425994873047, + "step": 6350 + }, + { + "epoch": 0.7647907647907648, + "grad_norm": 15.4845204333541, + "learning_rate": 1.306787814003207e-07, + "logits/generated": -2.0362257957458496, + "logits/real": -2.0825929641723633, + "logps/generated": -501.44171142578125, + "logps/real": -245.36727905273438, + "loss": 0.342, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.3118367195129395, + "rewards/margins": 25.564722061157227, + "rewards/real": 21.252885818481445, + "step": 6360 + }, + { + "epoch": 0.765993265993266, + "grad_norm": 11.618511890856688, + "learning_rate": 1.3001068947087117e-07, + "logits/generated": -1.9474092721939087, + "logits/real": -2.0226938724517822, + "logps/generated": -556.8031005859375, + "logps/real": -235.0730743408203, + "loss": 0.4303, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.593226432800293, + "rewards/margins": 26.619098663330078, + "rewards/real": 20.02587127685547, + "step": 6370 + }, + { + "epoch": 0.7671957671957672, + "grad_norm": 63.097235994103166, + "learning_rate": 1.2934259754142168e-07, + "logits/generated": -1.989145278930664, + "logits/real": -2.0506412982940674, + "logps/generated": -560.9464111328125, + "logps/real": -222.3338623046875, + "loss": 0.7063, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.6073455810546875, + "rewards/margins": 24.4576358795166, + "rewards/real": 17.850292205810547, + "step": 6380 + }, + { + "epoch": 0.7683982683982684, + "grad_norm": 1148.8564040017468, + "learning_rate": 1.286745056119722e-07, + "logits/generated": -2.025768756866455, + "logits/real": -2.137629270553589, + "logps/generated": -731.1598510742188, + "logps/real": -310.55816650390625, + "loss": 0.402, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.420269966125488, + "rewards/margins": 33.447731018066406, + "rewards/real": 28.027462005615234, + "step": 6390 + }, + { + "epoch": 0.7696007696007696, + "grad_norm": 933.7753958864845, + "learning_rate": 1.2800641368252272e-07, + "logits/generated": -2.009885311126709, + "logits/real": -2.0939135551452637, + "logps/generated": -620.836181640625, + "logps/real": -272.7360534667969, + "loss": 0.4951, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.9595947265625, + "rewards/margins": 29.82866859436035, + "rewards/real": 22.86907196044922, + "step": 6400 + }, + { + "epoch": 0.7708032708032708, + "grad_norm": 7.801546521060971, + "learning_rate": 1.2733832175307322e-07, + "logits/generated": -2.013359785079956, + "logits/real": -2.1400272846221924, + "logps/generated": -516.2764892578125, + "logps/real": -220.3167724609375, + "loss": 0.6196, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.015575885772705, + "rewards/margins": 25.577045440673828, + "rewards/real": 19.56147003173828, + "step": 6410 + }, + { + "epoch": 0.772005772005772, + "grad_norm": 230.38571589406325, + "learning_rate": 1.266702298236237e-07, + "logits/generated": -1.9715086221694946, + "logits/real": -2.0108389854431152, + "logps/generated": -586.1846923828125, + "logps/real": -239.9268798828125, + "loss": 0.5405, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.519183158874512, + "rewards/margins": 26.368633270263672, + "rewards/real": 18.84945297241211, + "step": 6420 + }, + { + "epoch": 0.7732082732082732, + "grad_norm": 130.59965528022764, + "learning_rate": 1.2600213789417423e-07, + "logits/generated": -2.043574810028076, + "logits/real": -2.1122307777404785, + "logps/generated": -652.2904663085938, + "logps/real": -296.2471008300781, + "loss": 0.6708, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -1.0932037830352783, + "rewards/margins": 26.344066619873047, + "rewards/real": 25.25086212158203, + "step": 6430 + }, + { + "epoch": 0.7744107744107744, + "grad_norm": 182.6011016842905, + "learning_rate": 1.2533404596472474e-07, + "logits/generated": -2.0451924800872803, + "logits/real": -2.1155965328216553, + "logps/generated": -623.5161743164062, + "logps/real": -239.1260986328125, + "loss": 0.4863, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.482655048370361, + "rewards/margins": 26.55521011352539, + "rewards/real": 21.072551727294922, + "step": 6440 + }, + { + "epoch": 0.7756132756132756, + "grad_norm": 4.40352858864226, + "learning_rate": 1.2466595403527525e-07, + "logits/generated": -2.032482147216797, + "logits/real": -2.0970304012298584, + "logps/generated": -574.2525024414062, + "logps/real": -230.63027954101562, + "loss": 0.5059, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.142547607421875, + "rewards/margins": 33.791221618652344, + "rewards/real": 20.648677825927734, + "step": 6450 + }, + { + "epoch": 0.7768157768157768, + "grad_norm": 978.9515729424127, + "learning_rate": 1.2399786210582575e-07, + "logits/generated": -2.05214262008667, + "logits/real": -2.0589637756347656, + "logps/generated": -510.7826232910156, + "logps/real": -182.57005310058594, + "loss": 0.787, + "rewards/accuracies": 0.875, + "rewards/generated": -3.8935463428497314, + "rewards/margins": 19.70840835571289, + "rewards/real": 15.814860343933105, + "step": 6460 + }, + { + "epoch": 0.778018278018278, + "grad_norm": 900.5029847031657, + "learning_rate": 1.2332977017637626e-07, + "logits/generated": -2.0314135551452637, + "logits/real": -2.061591148376465, + "logps/generated": -541.0048217773438, + "logps/real": -246.12844848632812, + "loss": 0.5588, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -3.4981799125671387, + "rewards/margins": 23.609556198120117, + "rewards/real": 20.111377716064453, + "step": 6470 + }, + { + "epoch": 0.7792207792207793, + "grad_norm": 350.450392517772, + "learning_rate": 1.2266167824692677e-07, + "logits/generated": -1.9987430572509766, + "logits/real": -2.0528876781463623, + "logps/generated": -507.3895568847656, + "logps/real": -253.5883331298828, + "loss": 0.3899, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.739121437072754, + "rewards/margins": 27.059429168701172, + "rewards/real": 22.320308685302734, + "step": 6480 + }, + { + "epoch": 0.7804232804232805, + "grad_norm": 2.570811632585443, + "learning_rate": 1.2199358631747727e-07, + "logits/generated": -2.061544179916382, + "logits/real": -1.9877605438232422, + "logps/generated": -541.1376342773438, + "logps/real": -254.91421508789062, + "loss": 0.2376, + "rewards/accuracies": 1.0, + "rewards/generated": -3.4496307373046875, + "rewards/margins": 26.540197372436523, + "rewards/real": 23.090566635131836, + "step": 6490 + }, + { + "epoch": 0.7816257816257817, + "grad_norm": 400.89249109207947, + "learning_rate": 1.2132549438802778e-07, + "logits/generated": -2.032627582550049, + "logits/real": -2.1158928871154785, + "logps/generated": -471.20208740234375, + "logps/real": -218.10458374023438, + "loss": 0.5599, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.873850345611572, + "rewards/margins": 26.938655853271484, + "rewards/real": 20.064807891845703, + "step": 6500 + }, + { + "epoch": 0.7828282828282829, + "grad_norm": 226.3122224179574, + "learning_rate": 1.2065740245857829e-07, + "logits/generated": -1.9897022247314453, + "logits/real": -2.034402370452881, + "logps/generated": -495.94775390625, + "logps/real": -235.3659210205078, + "loss": 0.4043, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.257264137268066, + "rewards/margins": 25.510311126708984, + "rewards/real": 21.253047943115234, + "step": 6510 + }, + { + "epoch": 0.7840307840307841, + "grad_norm": 18.683962626117665, + "learning_rate": 1.1998931052912882e-07, + "logits/generated": -1.992257833480835, + "logits/real": -2.17830228805542, + "logps/generated": -511.45526123046875, + "logps/real": -207.0403289794922, + "loss": 0.8142, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.154990196228027, + "rewards/margins": 26.8048095703125, + "rewards/real": 19.649816513061523, + "step": 6520 + }, + { + "epoch": 0.7852332852332853, + "grad_norm": 40.479713613093054, + "learning_rate": 1.193212185996793e-07, + "logits/generated": -1.9591795206069946, + "logits/real": -2.120176315307617, + "logps/generated": -476.0768127441406, + "logps/real": -264.5736389160156, + "loss": 0.3787, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.221365451812744, + "rewards/margins": 26.010305404663086, + "rewards/real": 20.788936614990234, + "step": 6530 + }, + { + "epoch": 0.7864357864357865, + "grad_norm": 546.0181649497696, + "learning_rate": 1.1865312667022982e-07, + "logits/generated": -1.9258826971054077, + "logits/real": -2.137423276901245, + "logps/generated": -565.8904418945312, + "logps/real": -247.72500610351562, + "loss": 0.5775, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -8.614789962768555, + "rewards/margins": 28.28069496154785, + "rewards/real": 19.665903091430664, + "step": 6540 + }, + { + "epoch": 0.7876382876382877, + "grad_norm": 201.09913616241806, + "learning_rate": 1.1798503474078033e-07, + "logits/generated": -1.8074251413345337, + "logits/real": -2.046027421951294, + "logps/generated": -423.1866760253906, + "logps/real": -219.33309936523438, + "loss": 0.4462, + "rewards/accuracies": 1.0, + "rewards/generated": -9.052923202514648, + "rewards/margins": 29.064559936523438, + "rewards/real": 20.01163101196289, + "step": 6550 + }, + { + "epoch": 0.7888407888407888, + "grad_norm": 770.987711097659, + "learning_rate": 1.1731694281133083e-07, + "logits/generated": -2.021021604537964, + "logits/real": -2.1250627040863037, + "logps/generated": -503.29962158203125, + "logps/real": -233.9160919189453, + "loss": 0.452, + "rewards/accuracies": 0.875, + "rewards/generated": -11.509394645690918, + "rewards/margins": 31.916366577148438, + "rewards/real": 20.406970977783203, + "step": 6560 + }, + { + "epoch": 0.79004329004329, + "grad_norm": 9.47549186282315, + "learning_rate": 1.1664885088188134e-07, + "logits/generated": -1.9661697149276733, + "logits/real": -2.0995280742645264, + "logps/generated": -666.216552734375, + "logps/real": -290.75347900390625, + "loss": 0.5366, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.03934097290039, + "rewards/margins": 35.27936553955078, + "rewards/real": 26.240026473999023, + "step": 6570 + }, + { + "epoch": 0.7912457912457912, + "grad_norm": 573.4073702302659, + "learning_rate": 1.1598075895243186e-07, + "logits/generated": -2.035170078277588, + "logits/real": -2.088874340057373, + "logps/generated": -565.6022338867188, + "logps/real": -191.28726196289062, + "loss": 0.5497, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -12.782261848449707, + "rewards/margins": 30.251855850219727, + "rewards/real": 17.469593048095703, + "step": 6580 + }, + { + "epoch": 0.7924482924482924, + "grad_norm": 6.662413515690038, + "learning_rate": 1.1531266702298235e-07, + "logits/generated": -1.9405333995819092, + "logits/real": -1.9991796016693115, + "logps/generated": -465.4478454589844, + "logps/real": -166.42901611328125, + "loss": 0.5633, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.728341102600098, + "rewards/margins": 22.969757080078125, + "rewards/real": 15.241415023803711, + "step": 6590 + }, + { + "epoch": 0.7936507936507936, + "grad_norm": 7.935999121687919, + "learning_rate": 1.1464457509353287e-07, + "logits/generated": -2.0232961177825928, + "logits/real": -2.1475448608398438, + "logps/generated": -708.168701171875, + "logps/real": -335.34228515625, + "loss": 0.6386, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.621035575866699, + "rewards/margins": 35.099361419677734, + "rewards/real": 27.47833251953125, + "step": 6600 + }, + { + "epoch": 0.7948532948532948, + "grad_norm": 168.3026198119923, + "learning_rate": 1.1397648316408336e-07, + "logits/generated": -1.8957293033599854, + "logits/real": -2.0679030418395996, + "logps/generated": -671.3610229492188, + "logps/real": -263.46990966796875, + "loss": 0.6755, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -11.244115829467773, + "rewards/margins": 35.23192596435547, + "rewards/real": 23.987808227539062, + "step": 6610 + }, + { + "epoch": 0.796055796055796, + "grad_norm": 9.840976990538804, + "learning_rate": 1.1330839123463388e-07, + "logits/generated": -1.9226858615875244, + "logits/real": -2.0272715091705322, + "logps/generated": -622.4281616210938, + "logps/real": -232.2010955810547, + "loss": 0.4167, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.895415306091309, + "rewards/margins": 32.337223052978516, + "rewards/real": 22.441804885864258, + "step": 6620 + }, + { + "epoch": 0.7972582972582972, + "grad_norm": 3.6905020279777365, + "learning_rate": 1.1264029930518439e-07, + "logits/generated": -1.980645775794983, + "logits/real": -2.062753438949585, + "logps/generated": -537.4622802734375, + "logps/real": -228.3039093017578, + "loss": 0.4354, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.7731428146362305, + "rewards/margins": 26.594839096069336, + "rewards/real": 20.821697235107422, + "step": 6630 + }, + { + "epoch": 0.7984607984607984, + "grad_norm": 222.54171218343046, + "learning_rate": 1.119722073757349e-07, + "logits/generated": -1.9163379669189453, + "logits/real": -1.9841057062149048, + "logps/generated": -630.6839599609375, + "logps/real": -251.23843383789062, + "loss": 0.55, + "rewards/accuracies": 1.0, + "rewards/generated": -11.036982536315918, + "rewards/margins": 32.203826904296875, + "rewards/real": 21.16684341430664, + "step": 6640 + }, + { + "epoch": 0.7996632996632996, + "grad_norm": 758.4916729267836, + "learning_rate": 1.113041154462854e-07, + "logits/generated": -1.8351774215698242, + "logits/real": -1.9700706005096436, + "logps/generated": -513.4570922851562, + "logps/real": -160.77664184570312, + "loss": 0.3732, + "rewards/accuracies": 1.0, + "rewards/generated": -14.325859069824219, + "rewards/margins": 29.440387725830078, + "rewards/real": 15.114524841308594, + "step": 6650 + }, + { + "epoch": 0.8008658008658008, + "grad_norm": 130.25800665654907, + "learning_rate": 1.1063602351683592e-07, + "logits/generated": -1.9696983098983765, + "logits/real": -2.020036458969116, + "logps/generated": -462.839111328125, + "logps/real": -241.97744750976562, + "loss": 0.795, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.808574199676514, + "rewards/margins": 26.586395263671875, + "rewards/real": 21.77781867980957, + "step": 6660 + }, + { + "epoch": 0.802068302068302, + "grad_norm": 16.445419819966222, + "learning_rate": 1.0996793158738642e-07, + "logits/generated": -1.9043128490447998, + "logits/real": -2.0672061443328857, + "logps/generated": -650.8314208984375, + "logps/real": -262.8508605957031, + "loss": 0.384, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.162294387817383, + "rewards/margins": 29.417739868164062, + "rewards/real": 21.25544548034668, + "step": 6670 + }, + { + "epoch": 0.8032708032708032, + "grad_norm": 7.624439658847343, + "learning_rate": 1.0929983965793694e-07, + "logits/generated": -1.993038535118103, + "logits/real": -2.048717498779297, + "logps/generated": -583.8704833984375, + "logps/real": -280.97979736328125, + "loss": 0.3869, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.1804001331329346, + "rewards/margins": 25.349430084228516, + "rewards/real": 23.169029235839844, + "step": 6680 + }, + { + "epoch": 0.8044733044733045, + "grad_norm": 933.4362052887401, + "learning_rate": 1.0863174772848743e-07, + "logits/generated": -1.9302880764007568, + "logits/real": -2.0569405555725098, + "logps/generated": -593.2938232421875, + "logps/real": -216.85385131835938, + "loss": 0.5222, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -10.829951286315918, + "rewards/margins": 31.20162582397461, + "rewards/real": 20.371673583984375, + "step": 6690 + }, + { + "epoch": 0.8056758056758057, + "grad_norm": 4.893149792148093, + "learning_rate": 1.0796365579903795e-07, + "logits/generated": -1.9423173666000366, + "logits/real": -2.0856854915618896, + "logps/generated": -676.6719360351562, + "logps/real": -263.2870178222656, + "loss": 0.4651, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.902135848999023, + "rewards/margins": 33.78864669799805, + "rewards/real": 23.88651466369629, + "step": 6700 + }, + { + "epoch": 0.8068783068783069, + "grad_norm": 11.948304545504657, + "learning_rate": 1.0729556386958845e-07, + "logits/generated": -1.8900234699249268, + "logits/real": -2.0345757007598877, + "logps/generated": -510.62908935546875, + "logps/real": -217.3412628173828, + "loss": 0.4978, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.852546691894531, + "rewards/margins": 27.93526840209961, + "rewards/real": 19.08272361755371, + "step": 6710 + }, + { + "epoch": 0.8080808080808081, + "grad_norm": 864.3594472666391, + "learning_rate": 1.0662747194013896e-07, + "logits/generated": -1.904443383216858, + "logits/real": -2.0020639896392822, + "logps/generated": -621.1746826171875, + "logps/real": -263.2376403808594, + "loss": 0.8638, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -3.8390984535217285, + "rewards/margins": 24.18063735961914, + "rewards/real": 20.34153938293457, + "step": 6720 + }, + { + "epoch": 0.8092833092833093, + "grad_norm": 158.72252802279118, + "learning_rate": 1.0595938001068947e-07, + "logits/generated": -1.909276008605957, + "logits/real": -2.0260133743286133, + "logps/generated": -518.3134765625, + "logps/real": -230.59768676757812, + "loss": 0.55, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.227375984191895, + "rewards/margins": 30.50686264038086, + "rewards/real": 20.279489517211914, + "step": 6730 + }, + { + "epoch": 0.8104858104858105, + "grad_norm": 7.650011866937278, + "learning_rate": 1.0529128808123997e-07, + "logits/generated": -1.8908302783966064, + "logits/real": -2.0459790229797363, + "logps/generated": -572.00390625, + "logps/real": -218.95510864257812, + "loss": 0.3082, + "rewards/accuracies": 1.0, + "rewards/generated": -9.059887886047363, + "rewards/margins": 27.71695899963379, + "rewards/real": 18.657072067260742, + "step": 6740 + }, + { + "epoch": 0.8116883116883117, + "grad_norm": 115.32391375793355, + "learning_rate": 1.0462319615179048e-07, + "logits/generated": -1.9581515789031982, + "logits/real": -2.0542149543762207, + "logps/generated": -551.6161499023438, + "logps/real": -274.6429443359375, + "loss": 0.2775, + "rewards/accuracies": 1.0, + "rewards/generated": -6.853793144226074, + "rewards/margins": 29.69968605041504, + "rewards/real": 22.845888137817383, + "step": 6750 + }, + { + "epoch": 0.8128908128908129, + "grad_norm": 74.47718182874267, + "learning_rate": 1.03955104222341e-07, + "logits/generated": -2.052121639251709, + "logits/real": -2.0548155307769775, + "logps/generated": -547.2852783203125, + "logps/real": -237.0869140625, + "loss": 0.3278, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.021565437316895, + "rewards/margins": 28.7548828125, + "rewards/real": 20.73331642150879, + "step": 6760 + }, + { + "epoch": 0.8140933140933141, + "grad_norm": 152.04333408446354, + "learning_rate": 1.032870122928915e-07, + "logits/generated": -2.066861867904663, + "logits/real": -2.1651744842529297, + "logps/generated": -582.880615234375, + "logps/real": -310.56732177734375, + "loss": 0.5398, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": 2.206866502761841, + "rewards/margins": 24.49967384338379, + "rewards/real": 26.706539154052734, + "step": 6770 + }, + { + "epoch": 0.8152958152958153, + "grad_norm": 9.084553097851261, + "learning_rate": 1.0261892036344201e-07, + "logits/generated": -1.9865939617156982, + "logits/real": -2.064363479614258, + "logps/generated": -642.2066650390625, + "logps/real": -245.4627227783203, + "loss": 0.4214, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.984448432922363, + "rewards/margins": 35.89861297607422, + "rewards/real": 22.91416358947754, + "step": 6780 + }, + { + "epoch": 0.8164983164983165, + "grad_norm": 580.8818222282044, + "learning_rate": 1.019508284339925e-07, + "logits/generated": -2.004951000213623, + "logits/real": -2.1178276538848877, + "logps/generated": -463.5888671875, + "logps/real": -223.80380249023438, + "loss": 0.5307, + "rewards/accuracies": 0.875, + "rewards/generated": -3.36588978767395, + "rewards/margins": 22.291778564453125, + "rewards/real": 18.925886154174805, + "step": 6790 + }, + { + "epoch": 0.8177008177008177, + "grad_norm": 21.60433737742114, + "learning_rate": 1.0128273650454303e-07, + "logits/generated": -1.9284942150115967, + "logits/real": -2.1252195835113525, + "logps/generated": -642.3855590820312, + "logps/real": -238.59030151367188, + "loss": 0.4324, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -12.920016288757324, + "rewards/margins": 36.168033599853516, + "rewards/real": 23.24801254272461, + "step": 6800 + }, + { + "epoch": 0.8189033189033189, + "grad_norm": 5.858153105500669, + "learning_rate": 1.0061464457509353e-07, + "logits/generated": -1.8915159702301025, + "logits/real": -2.036672592163086, + "logps/generated": -510.6033630371094, + "logps/real": -263.3202209472656, + "loss": 0.6308, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -10.245492935180664, + "rewards/margins": 29.829025268554688, + "rewards/real": 19.58353042602539, + "step": 6810 + }, + { + "epoch": 0.8201058201058201, + "grad_norm": 225.78300369364672, + "learning_rate": 9.994655264564404e-08, + "logits/generated": -1.978859543800354, + "logits/real": -1.9988905191421509, + "logps/generated": -479.1241149902344, + "logps/real": -235.0752716064453, + "loss": 0.455, + "rewards/accuracies": 1.0, + "rewards/generated": -4.077695369720459, + "rewards/margins": 27.0776424407959, + "rewards/real": 22.999948501586914, + "step": 6820 + }, + { + "epoch": 0.8213083213083213, + "grad_norm": 133.61865275911646, + "learning_rate": 9.927846071619455e-08, + "logits/generated": -1.9701054096221924, + "logits/real": -2.0301175117492676, + "logps/generated": -533.6475219726562, + "logps/real": -220.45571899414062, + "loss": 0.5627, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.806885719299316, + "rewards/margins": 25.719934463500977, + "rewards/real": 18.913049697875977, + "step": 6830 + }, + { + "epoch": 0.8225108225108225, + "grad_norm": 170.27248987133945, + "learning_rate": 9.861036878674505e-08, + "logits/generated": -1.928148865699768, + "logits/real": -2.001552104949951, + "logps/generated": -599.8140258789062, + "logps/real": -286.3150329589844, + "loss": 0.3573, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.314204216003418, + "rewards/margins": 30.91522216796875, + "rewards/real": 23.601016998291016, + "step": 6840 + }, + { + "epoch": 0.8237133237133237, + "grad_norm": 4.315270534391476, + "learning_rate": 9.794227685729556e-08, + "logits/generated": -1.9817625284194946, + "logits/real": -2.1318881511688232, + "logps/generated": -531.9886474609375, + "logps/real": -194.29922485351562, + "loss": 0.2382, + "rewards/accuracies": 1.0, + "rewards/generated": -10.49198055267334, + "rewards/margins": 29.082393646240234, + "rewards/real": 18.590412139892578, + "step": 6850 + }, + { + "epoch": 0.8249158249158249, + "grad_norm": 86.45065458537489, + "learning_rate": 9.727418492784608e-08, + "logits/generated": -1.9709510803222656, + "logits/real": -2.0084948539733887, + "logps/generated": -638.9036254882812, + "logps/real": -209.0804901123047, + "loss": 0.5148, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -13.607638359069824, + "rewards/margins": 31.35004234313965, + "rewards/real": 17.74240493774414, + "step": 6860 + }, + { + "epoch": 0.8261183261183261, + "grad_norm": 4.1963448755801105, + "learning_rate": 9.660609299839657e-08, + "logits/generated": -2.0052900314331055, + "logits/real": -2.0138003826141357, + "logps/generated": -534.0217895507812, + "logps/real": -248.12564086914062, + "loss": 0.3906, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -7.690367698669434, + "rewards/margins": 29.35007667541504, + "rewards/real": 21.659709930419922, + "step": 6870 + }, + { + "epoch": 0.8273208273208273, + "grad_norm": 62.15658277048181, + "learning_rate": 9.593800106894709e-08, + "logits/generated": -2.0607285499572754, + "logits/real": -2.098620653152466, + "logps/generated": -441.8622131347656, + "logps/real": -192.32180786132812, + "loss": 0.374, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": 0.18954944610595703, + "rewards/margins": 17.867490768432617, + "rewards/real": 18.057039260864258, + "step": 6880 + }, + { + "epoch": 0.8285233285233286, + "grad_norm": 804.5373140504205, + "learning_rate": 9.526990913949758e-08, + "logits/generated": -1.9349445104599, + "logits/real": -2.020285129547119, + "logps/generated": -498.94610595703125, + "logps/real": -209.37149047851562, + "loss": 0.4807, + "rewards/accuracies": 1.0, + "rewards/generated": -9.69743537902832, + "rewards/margins": 29.25558853149414, + "rewards/real": 19.558155059814453, + "step": 6890 + }, + { + "epoch": 0.8297258297258298, + "grad_norm": 75.33035304692565, + "learning_rate": 9.46018172100481e-08, + "logits/generated": -2.0229721069335938, + "logits/real": -2.096458911895752, + "logps/generated": -484.26904296875, + "logps/real": -217.74746704101562, + "loss": 0.3872, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.0623698234558105, + "rewards/margins": 24.061389923095703, + "rewards/real": 20.999019622802734, + "step": 6900 + }, + { + "epoch": 0.830928330928331, + "grad_norm": 97.15901976461026, + "learning_rate": 9.393372528059861e-08, + "logits/generated": -1.9885648488998413, + "logits/real": -2.04494571685791, + "logps/generated": -567.73974609375, + "logps/real": -232.93212890625, + "loss": 0.7692, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -6.393340110778809, + "rewards/margins": 28.214279174804688, + "rewards/real": 21.820940017700195, + "step": 6910 + }, + { + "epoch": 0.8321308321308322, + "grad_norm": 35.0444948605439, + "learning_rate": 9.326563335114912e-08, + "logits/generated": -2.0096848011016846, + "logits/real": -2.067962169647217, + "logps/generated": -633.2785034179688, + "logps/real": -288.0771484375, + "loss": 0.4973, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.354552268981934, + "rewards/margins": 31.78200340270996, + "rewards/real": 23.42745018005371, + "step": 6920 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 123.22927466145892, + "learning_rate": 9.259754142169962e-08, + "logits/generated": -1.9568450450897217, + "logits/real": -2.014069080352783, + "logps/generated": -572.5548095703125, + "logps/real": -266.781982421875, + "loss": 0.6136, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -7.571218967437744, + "rewards/margins": 29.4760799407959, + "rewards/real": 21.904863357543945, + "step": 6930 + }, + { + "epoch": 0.8345358345358346, + "grad_norm": 122.64015784986167, + "learning_rate": 9.192944949225013e-08, + "logits/generated": -1.8873875141143799, + "logits/real": -2.0744128227233887, + "logps/generated": -655.6689453125, + "logps/real": -219.0276336669922, + "loss": 0.4071, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -16.227237701416016, + "rewards/margins": 32.821048736572266, + "rewards/real": 16.59380531311035, + "step": 6940 + }, + { + "epoch": 0.8357383357383358, + "grad_norm": 494.9675003166132, + "learning_rate": 9.126135756280064e-08, + "logits/generated": -1.9662196636199951, + "logits/real": -2.0359504222869873, + "logps/generated": -632.2025146484375, + "logps/real": -303.913330078125, + "loss": 0.4842, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.139708518981934, + "rewards/margins": 30.668176651000977, + "rewards/real": 25.52846336364746, + "step": 6950 + }, + { + "epoch": 0.836940836940837, + "grad_norm": 244.69796644920746, + "learning_rate": 9.059326563335116e-08, + "logits/generated": -1.9423549175262451, + "logits/real": -2.023282527923584, + "logps/generated": -599.820068359375, + "logps/real": -240.3815460205078, + "loss": 0.5156, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.402791023254395, + "rewards/margins": 30.808792114257812, + "rewards/real": 20.4060001373291, + "step": 6960 + }, + { + "epoch": 0.8381433381433382, + "grad_norm": 159.58700442465454, + "learning_rate": 8.992517370390165e-08, + "logits/generated": -1.9688711166381836, + "logits/real": -2.034064769744873, + "logps/generated": -570.5164794921875, + "logps/real": -255.22421264648438, + "loss": 0.4113, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.9481892585754395, + "rewards/margins": 27.642398834228516, + "rewards/real": 22.694210052490234, + "step": 6970 + }, + { + "epoch": 0.8393458393458394, + "grad_norm": 12.64567192145013, + "learning_rate": 8.925708177445217e-08, + "logits/generated": -1.9905973672866821, + "logits/real": -2.032841205596924, + "logps/generated": -592.6995849609375, + "logps/real": -225.61178588867188, + "loss": 0.302, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -6.645966529846191, + "rewards/margins": 25.207401275634766, + "rewards/real": 18.56143569946289, + "step": 6980 + }, + { + "epoch": 0.8405483405483406, + "grad_norm": 271.33612272282033, + "learning_rate": 8.858898984500266e-08, + "logits/generated": -1.8832422494888306, + "logits/real": -2.047926187515259, + "logps/generated": -593.6348876953125, + "logps/real": -221.31332397460938, + "loss": 0.575, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -12.091665267944336, + "rewards/margins": 30.641738891601562, + "rewards/real": 18.55006980895996, + "step": 6990 + }, + { + "epoch": 0.8417508417508418, + "grad_norm": 93.43620222046046, + "learning_rate": 8.792089791555318e-08, + "logits/generated": -1.8331258296966553, + "logits/real": -1.9706172943115234, + "logps/generated": -571.3441162109375, + "logps/real": -230.90087890625, + "loss": 0.408, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.930742263793945, + "rewards/margins": 31.25909996032715, + "rewards/real": 20.328359603881836, + "step": 7000 + }, + { + "epoch": 0.8417508417508418, + "eval_logits/generated": -1.9645313024520874, + "eval_logits/real": -2.0342915058135986, + "eval_logps/generated": -586.2899169921875, + "eval_logps/real": -257.3894958496094, + "eval_loss": 0.38710272312164307, + "eval_rewards/accuracies": 0.9732142686843872, + "eval_rewards/generated": -9.650843620300293, + "eval_rewards/margins": 31.038997650146484, + "eval_rewards/real": 21.388153076171875, + "eval_runtime": 158.3613, + "eval_samples_per_second": 6.315, + "eval_steps_per_second": 0.53, + "step": 7000 + }, + { + "epoch": 0.8429533429533429, + "grad_norm": 517.0074744751448, + "learning_rate": 8.725280598610369e-08, + "logits/generated": -1.8653123378753662, + "logits/real": -1.953324556350708, + "logps/generated": -561.4413452148438, + "logps/real": -200.07469177246094, + "loss": 0.6815, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -13.99188232421875, + "rewards/margins": 30.130197525024414, + "rewards/real": 16.13831901550293, + "step": 7010 + }, + { + "epoch": 0.8441558441558441, + "grad_norm": 41.85935550831186, + "learning_rate": 8.65847140566542e-08, + "logits/generated": -1.9038314819335938, + "logits/real": -1.9930827617645264, + "logps/generated": -547.8190307617188, + "logps/real": -272.4012145996094, + "loss": 0.5132, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -4.457758903503418, + "rewards/margins": 28.457626342773438, + "rewards/real": 23.999866485595703, + "step": 7020 + }, + { + "epoch": 0.8453583453583453, + "grad_norm": 44.89850728430392, + "learning_rate": 8.59166221272047e-08, + "logits/generated": -1.8842484951019287, + "logits/real": -2.023623466491699, + "logps/generated": -588.6026000976562, + "logps/real": -243.3715057373047, + "loss": 0.4327, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -14.154241561889648, + "rewards/margins": 35.011322021484375, + "rewards/real": 20.857078552246094, + "step": 7030 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 471.8481253389293, + "learning_rate": 8.524853019775521e-08, + "logits/generated": -1.8785381317138672, + "logits/real": -2.0542819499969482, + "logps/generated": -779.9952392578125, + "logps/real": -300.3929748535156, + "loss": 0.4442, + "rewards/accuracies": 1.0, + "rewards/generated": -14.369863510131836, + "rewards/margins": 42.48217010498047, + "rewards/real": 28.112308502197266, + "step": 7040 + }, + { + "epoch": 0.8477633477633477, + "grad_norm": 166.0510317451011, + "learning_rate": 8.458043826830571e-08, + "logits/generated": -1.9051055908203125, + "logits/real": -1.9138116836547852, + "logps/generated": -497.93084716796875, + "logps/real": -223.7330780029297, + "loss": 0.3103, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.9573392868042, + "rewards/margins": 27.659826278686523, + "rewards/real": 18.702489852905273, + "step": 7050 + }, + { + "epoch": 0.8489658489658489, + "grad_norm": 10.610026437075513, + "learning_rate": 8.391234633885623e-08, + "logits/generated": -1.8697566986083984, + "logits/real": -1.965914011001587, + "logps/generated": -640.02001953125, + "logps/real": -240.3203887939453, + "loss": 0.5067, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -13.202654838562012, + "rewards/margins": 33.456077575683594, + "rewards/real": 20.253421783447266, + "step": 7060 + }, + { + "epoch": 0.8501683501683501, + "grad_norm": 64.04734108668225, + "learning_rate": 8.324425440940673e-08, + "logits/generated": -1.9457658529281616, + "logits/real": -2.020012140274048, + "logps/generated": -608.0465087890625, + "logps/real": -256.51568603515625, + "loss": 0.2771, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.455371856689453, + "rewards/margins": 29.595605850219727, + "rewards/real": 22.14023208618164, + "step": 7070 + }, + { + "epoch": 0.8513708513708513, + "grad_norm": 734.7277589875729, + "learning_rate": 8.257616247995723e-08, + "logits/generated": -1.9419653415679932, + "logits/real": -1.950111985206604, + "logps/generated": -528.2342529296875, + "logps/real": -194.86752319335938, + "loss": 0.6012, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.547017574310303, + "rewards/margins": 24.086828231811523, + "rewards/real": 17.539810180664062, + "step": 7080 + }, + { + "epoch": 0.8525733525733525, + "grad_norm": 10.21439558684444, + "learning_rate": 8.190807055050774e-08, + "logits/generated": -1.960397720336914, + "logits/real": -2.0412917137145996, + "logps/generated": -646.4866943359375, + "logps/real": -210.7541961669922, + "loss": 0.5506, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.799673080444336, + "rewards/margins": 29.200973510742188, + "rewards/real": 20.40129852294922, + "step": 7090 + }, + { + "epoch": 0.8537758537758537, + "grad_norm": 8.265386169096248, + "learning_rate": 8.123997862105825e-08, + "logits/generated": -1.9347922801971436, + "logits/real": -1.9989608526229858, + "logps/generated": -509.216796875, + "logps/real": -190.09078979492188, + "loss": 0.5294, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.542779922485352, + "rewards/margins": 26.825023651123047, + "rewards/real": 16.28224754333496, + "step": 7100 + }, + { + "epoch": 0.854978354978355, + "grad_norm": 94.99077031729618, + "learning_rate": 8.057188669160877e-08, + "logits/generated": -1.9375168085098267, + "logits/real": -2.0451226234436035, + "logps/generated": -736.7534790039062, + "logps/real": -355.28924560546875, + "loss": 0.4881, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.175879955291748, + "rewards/margins": 34.601112365722656, + "rewards/real": 30.425235748291016, + "step": 7110 + }, + { + "epoch": 0.8561808561808562, + "grad_norm": 326.2487012399304, + "learning_rate": 7.990379476215926e-08, + "logits/generated": -1.9523429870605469, + "logits/real": -2.1103038787841797, + "logps/generated": -689.837158203125, + "logps/real": -298.9279479980469, + "loss": 0.432, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.586548805236816, + "rewards/margins": 35.03469467163086, + "rewards/real": 24.44814682006836, + "step": 7120 + }, + { + "epoch": 0.8573833573833574, + "grad_norm": 588.9535954751599, + "learning_rate": 7.923570283270978e-08, + "logits/generated": -2.0439341068267822, + "logits/real": -2.042471408843994, + "logps/generated": -539.2032470703125, + "logps/real": -226.07455444335938, + "loss": 0.6732, + "rewards/accuracies": 0.875, + "rewards/generated": -7.877067565917969, + "rewards/margins": 27.782094955444336, + "rewards/real": 19.905027389526367, + "step": 7130 + }, + { + "epoch": 0.8585858585858586, + "grad_norm": 8.447035038625407, + "learning_rate": 7.856761090326027e-08, + "logits/generated": -1.9612295627593994, + "logits/real": -1.9542171955108643, + "logps/generated": -550.6262817382812, + "logps/real": -221.4468994140625, + "loss": 0.4235, + "rewards/accuracies": 0.875, + "rewards/generated": -2.9779059886932373, + "rewards/margins": 23.170961380004883, + "rewards/real": 20.19305419921875, + "step": 7140 + }, + { + "epoch": 0.8597883597883598, + "grad_norm": 10.612418017872224, + "learning_rate": 7.789951897381079e-08, + "logits/generated": -1.9437658786773682, + "logits/real": -2.0600814819335938, + "logps/generated": -625.6292724609375, + "logps/real": -257.81756591796875, + "loss": 0.3105, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -13.496139526367188, + "rewards/margins": 36.835514068603516, + "rewards/real": 23.339374542236328, + "step": 7150 + }, + { + "epoch": 0.860990860990861, + "grad_norm": 63.352885973480475, + "learning_rate": 7.72314270443613e-08, + "logits/generated": -1.8945993185043335, + "logits/real": -2.0037083625793457, + "logps/generated": -586.4344482421875, + "logps/real": -275.34893798828125, + "loss": 0.791, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.72334098815918, + "rewards/margins": 27.0490665435791, + "rewards/real": 22.325727462768555, + "step": 7160 + }, + { + "epoch": 0.8621933621933622, + "grad_norm": 2.8121678032177697, + "learning_rate": 7.65633351149118e-08, + "logits/generated": -1.8705205917358398, + "logits/real": -2.000699520111084, + "logps/generated": -539.0193481445312, + "logps/real": -236.5692138671875, + "loss": 0.4325, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.5588555335998535, + "rewards/margins": 25.52818489074707, + "rewards/real": 20.969329833984375, + "step": 7170 + }, + { + "epoch": 0.8633958633958634, + "grad_norm": 8.795268136151748, + "learning_rate": 7.589524318546231e-08, + "logits/generated": -1.9034866094589233, + "logits/real": -1.9636344909667969, + "logps/generated": -595.8299560546875, + "logps/real": -272.61981201171875, + "loss": 0.5842, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.489290237426758, + "rewards/margins": 33.71501541137695, + "rewards/real": 23.225725173950195, + "step": 7180 + }, + { + "epoch": 0.8645983645983646, + "grad_norm": 3.5256842285820325, + "learning_rate": 7.522715125601283e-08, + "logits/generated": -1.8724931478500366, + "logits/real": -1.9148505926132202, + "logps/generated": -525.8731689453125, + "logps/real": -235.45309448242188, + "loss": 0.2558, + "rewards/accuracies": 1.0, + "rewards/generated": -12.958587646484375, + "rewards/margins": 34.816864013671875, + "rewards/real": 21.858278274536133, + "step": 7190 + }, + { + "epoch": 0.8658008658008658, + "grad_norm": 16.04433358783389, + "learning_rate": 7.455905932656332e-08, + "logits/generated": -1.9577831029891968, + "logits/real": -2.068490982055664, + "logps/generated": -503.7447814941406, + "logps/real": -236.80325317382812, + "loss": 0.4185, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.415589332580566, + "rewards/margins": 26.97598648071289, + "rewards/real": 19.56039810180664, + "step": 7200 + }, + { + "epoch": 0.867003367003367, + "grad_norm": 18.83334364700047, + "learning_rate": 7.389096739711384e-08, + "logits/generated": -1.9357426166534424, + "logits/real": -2.0584323406219482, + "logps/generated": -594.2290649414062, + "logps/real": -190.7951202392578, + "loss": 0.561, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.759885787963867, + "rewards/margins": 29.231525421142578, + "rewards/real": 18.471637725830078, + "step": 7210 + }, + { + "epoch": 0.8682058682058682, + "grad_norm": 5.678415180008009, + "learning_rate": 7.322287546766434e-08, + "logits/generated": -1.943680763244629, + "logits/real": -1.9875980615615845, + "logps/generated": -566.5198364257812, + "logps/real": -239.02627563476562, + "loss": 0.5686, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -10.269927978515625, + "rewards/margins": 30.888427734375, + "rewards/real": 20.61849594116211, + "step": 7220 + }, + { + "epoch": 0.8694083694083694, + "grad_norm": 12.09045317910382, + "learning_rate": 7.255478353821486e-08, + "logits/generated": -1.8886387348175049, + "logits/real": -1.9483449459075928, + "logps/generated": -642.1652221679688, + "logps/real": -291.56207275390625, + "loss": 0.6902, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -9.111310958862305, + "rewards/margins": 31.13655662536621, + "rewards/real": 22.025249481201172, + "step": 7230 + }, + { + "epoch": 0.8706108706108706, + "grad_norm": 254.62835920882608, + "learning_rate": 7.188669160876536e-08, + "logits/generated": -1.8878610134124756, + "logits/real": -2.057114362716675, + "logps/generated": -629.2080078125, + "logps/real": -266.1337585449219, + "loss": 0.4407, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.400242805480957, + "rewards/margins": 33.62870407104492, + "rewards/real": 23.22845458984375, + "step": 7240 + }, + { + "epoch": 0.8718133718133718, + "grad_norm": 13.901591824064043, + "learning_rate": 7.121859967931587e-08, + "logits/generated": -1.9008684158325195, + "logits/real": -2.0401580333709717, + "logps/generated": -579.7138671875, + "logps/real": -243.6083984375, + "loss": 0.6432, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.450406074523926, + "rewards/margins": 33.217533111572266, + "rewards/real": 22.76712417602539, + "step": 7250 + }, + { + "epoch": 0.873015873015873, + "grad_norm": 2613.4291397945603, + "learning_rate": 7.055050774986638e-08, + "logits/generated": -1.8321186304092407, + "logits/real": -1.9275792837142944, + "logps/generated": -502.3748474121094, + "logps/real": -176.3705596923828, + "loss": 0.5526, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.413863182067871, + "rewards/margins": 28.312484741210938, + "rewards/real": 16.898624420166016, + "step": 7260 + }, + { + "epoch": 0.8742183742183742, + "grad_norm": 161.14984363106092, + "learning_rate": 6.988241582041688e-08, + "logits/generated": -1.9329860210418701, + "logits/real": -1.9756215810775757, + "logps/generated": -531.51953125, + "logps/real": -193.31625366210938, + "loss": 0.3499, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.719414710998535, + "rewards/margins": 25.512435913085938, + "rewards/real": 17.793020248413086, + "step": 7270 + }, + { + "epoch": 0.8754208754208754, + "grad_norm": 7.54855355781832, + "learning_rate": 6.921432389096739e-08, + "logits/generated": -1.894049048423767, + "logits/real": -1.9347572326660156, + "logps/generated": -717.5528564453125, + "logps/real": -291.5484313964844, + "loss": 0.3911, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.796369552612305, + "rewards/margins": 31.634693145751953, + "rewards/real": 25.83832359313965, + "step": 7280 + }, + { + "epoch": 0.8766233766233766, + "grad_norm": 4.585649150995327, + "learning_rate": 6.854623196151791e-08, + "logits/generated": -1.8500807285308838, + "logits/real": -1.949877142906189, + "logps/generated": -584.8020629882812, + "logps/real": -191.82269287109375, + "loss": 0.4532, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -14.867227554321289, + "rewards/margins": 33.48359298706055, + "rewards/real": 18.616365432739258, + "step": 7290 + }, + { + "epoch": 0.8778258778258778, + "grad_norm": 163.66992629619799, + "learning_rate": 6.78781400320684e-08, + "logits/generated": -1.8061542510986328, + "logits/real": -1.9497833251953125, + "logps/generated": -601.3792724609375, + "logps/real": -195.39517211914062, + "loss": 0.4238, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -13.47065258026123, + "rewards/margins": 31.295124053955078, + "rewards/real": 17.82447052001953, + "step": 7300 + }, + { + "epoch": 0.879028379028379, + "grad_norm": 874.8283124911345, + "learning_rate": 6.721004810261892e-08, + "logits/generated": -1.912121057510376, + "logits/real": -2.057476282119751, + "logps/generated": -625.40869140625, + "logps/real": -265.7558288574219, + "loss": 0.6074, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.613348960876465, + "rewards/margins": 30.51369857788086, + "rewards/real": 23.90035057067871, + "step": 7310 + }, + { + "epoch": 0.8802308802308803, + "grad_norm": 192.46196828072283, + "learning_rate": 6.654195617316941e-08, + "logits/generated": -1.8323675394058228, + "logits/real": -2.0080084800720215, + "logps/generated": -778.146484375, + "logps/real": -275.4057922363281, + "loss": 0.3373, + "rewards/accuracies": 1.0, + "rewards/generated": -20.505359649658203, + "rewards/margins": 46.92961883544922, + "rewards/real": 26.42426109313965, + "step": 7320 + }, + { + "epoch": 0.8814333814333815, + "grad_norm": 16.746936291128744, + "learning_rate": 6.587386424371993e-08, + "logits/generated": -1.8217490911483765, + "logits/real": -1.9621708393096924, + "logps/generated": -692.5020751953125, + "logps/real": -257.6358642578125, + "loss": 0.4704, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -15.981904983520508, + "rewards/margins": 39.987735748291016, + "rewards/real": 24.005828857421875, + "step": 7330 + }, + { + "epoch": 0.8826358826358827, + "grad_norm": 4.272381400943576, + "learning_rate": 6.520577231427044e-08, + "logits/generated": -1.9419548511505127, + "logits/real": -2.0278639793395996, + "logps/generated": -532.7462158203125, + "logps/real": -272.96588134765625, + "loss": 0.4588, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -4.291725158691406, + "rewards/margins": 27.473276138305664, + "rewards/real": 23.181550979614258, + "step": 7340 + }, + { + "epoch": 0.8838383838383839, + "grad_norm": 619.0048583791896, + "learning_rate": 6.453768038482095e-08, + "logits/generated": -1.829918622970581, + "logits/real": -1.9880342483520508, + "logps/generated": -636.5155029296875, + "logps/real": -252.5404815673828, + "loss": 0.5431, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.412005424499512, + "rewards/margins": 34.718605041503906, + "rewards/real": 22.306598663330078, + "step": 7350 + }, + { + "epoch": 0.8850408850408851, + "grad_norm": 3.541707656089414, + "learning_rate": 6.386958845537145e-08, + "logits/generated": -1.8724817037582397, + "logits/real": -1.9666862487792969, + "logps/generated": -564.6494140625, + "logps/real": -180.87533569335938, + "loss": 0.3055, + "rewards/accuracies": 1.0, + "rewards/generated": -13.268491744995117, + "rewards/margins": 29.77277183532715, + "rewards/real": 16.5042781829834, + "step": 7360 + }, + { + "epoch": 0.8862433862433863, + "grad_norm": 45.22433734634177, + "learning_rate": 6.320149652592196e-08, + "logits/generated": -1.825797438621521, + "logits/real": -1.95993971824646, + "logps/generated": -462.71942138671875, + "logps/real": -206.0211944580078, + "loss": 0.3297, + "rewards/accuracies": 1.0, + "rewards/generated": -6.641558647155762, + "rewards/margins": 24.676057815551758, + "rewards/real": 18.034500122070312, + "step": 7370 + }, + { + "epoch": 0.8874458874458875, + "grad_norm": 3.263328103984659, + "learning_rate": 6.253340459647247e-08, + "logits/generated": -1.9000494480133057, + "logits/real": -2.037306547164917, + "logps/generated": -552.0850830078125, + "logps/real": -226.86886596679688, + "loss": 0.6875, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.1590576171875, + "rewards/margins": 30.00253677368164, + "rewards/real": 21.84347915649414, + "step": 7380 + }, + { + "epoch": 0.8886483886483887, + "grad_norm": 1317.5802959768355, + "learning_rate": 6.186531266702299e-08, + "logits/generated": -1.9330966472625732, + "logits/real": -1.9535415172576904, + "logps/generated": -689.3192138671875, + "logps/real": -240.9399871826172, + "loss": 0.3784, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -17.82880401611328, + "rewards/margins": 36.90077209472656, + "rewards/real": 19.071969985961914, + "step": 7390 + }, + { + "epoch": 0.8898508898508899, + "grad_norm": 291.43035513035505, + "learning_rate": 6.119722073757349e-08, + "logits/generated": -1.9878549575805664, + "logits/real": -2.045534610748291, + "logps/generated": -672.8856201171875, + "logps/real": -267.2503356933594, + "loss": 0.6009, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.401240348815918, + "rewards/margins": 34.384620666503906, + "rewards/real": 25.983383178710938, + "step": 7400 + }, + { + "epoch": 0.8910533910533911, + "grad_norm": 2.316927988260796, + "learning_rate": 6.0529128808124e-08, + "logits/generated": -1.83743155002594, + "logits/real": -1.9109361171722412, + "logps/generated": -617.5614624023438, + "logps/real": -201.71026611328125, + "loss": 0.3709, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.857139587402344, + "rewards/margins": 32.52408981323242, + "rewards/real": 18.66695213317871, + "step": 7410 + }, + { + "epoch": 0.8922558922558923, + "grad_norm": 28.72886021301842, + "learning_rate": 5.98610368786745e-08, + "logits/generated": -1.8223955631256104, + "logits/real": -1.9261900186538696, + "logps/generated": -563.7001953125, + "logps/real": -244.0134735107422, + "loss": 0.3791, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -8.443937301635742, + "rewards/margins": 30.967792510986328, + "rewards/real": 22.523853302001953, + "step": 7420 + }, + { + "epoch": 0.8934583934583935, + "grad_norm": 10.349477439848279, + "learning_rate": 5.919294494922501e-08, + "logits/generated": -1.8162206411361694, + "logits/real": -1.994471549987793, + "logps/generated": -745.6394653320312, + "logps/real": -283.2781066894531, + "loss": 0.53, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -14.154869079589844, + "rewards/margins": 39.75368881225586, + "rewards/real": 25.598817825317383, + "step": 7430 + }, + { + "epoch": 0.8946608946608947, + "grad_norm": 15.26625109062255, + "learning_rate": 5.852485301977552e-08, + "logits/generated": -1.8924280405044556, + "logits/real": -1.9609463214874268, + "logps/generated": -615.1890869140625, + "logps/real": -264.7192687988281, + "loss": 0.5444, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.55783224105835, + "rewards/margins": 32.70935821533203, + "rewards/real": 25.15152931213379, + "step": 7440 + }, + { + "epoch": 0.8958633958633959, + "grad_norm": 3.263604527573491, + "learning_rate": 5.785676109032603e-08, + "logits/generated": -1.8957017660140991, + "logits/real": -1.939389944076538, + "logps/generated": -651.6373901367188, + "logps/real": -247.1461944580078, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/generated": -14.57429313659668, + "rewards/margins": 37.14098358154297, + "rewards/real": 22.56669044494629, + "step": 7450 + }, + { + "epoch": 0.897065897065897, + "grad_norm": 2.635712388647341, + "learning_rate": 5.718866916087654e-08, + "logits/generated": -1.8295533657073975, + "logits/real": -1.9486722946166992, + "logps/generated": -727.4974975585938, + "logps/real": -242.79702758789062, + "loss": 0.5288, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -21.53809356689453, + "rewards/margins": 41.07743835449219, + "rewards/real": 19.53934097290039, + "step": 7460 + }, + { + "epoch": 0.8982683982683982, + "grad_norm": 177.86797665233868, + "learning_rate": 5.6520577231427044e-08, + "logits/generated": -1.800366759300232, + "logits/real": -1.9663546085357666, + "logps/generated": -892.2824096679688, + "logps/real": -278.8819274902344, + "loss": 0.4458, + "rewards/accuracies": 1.0, + "rewards/generated": -21.799272537231445, + "rewards/margins": 46.44826889038086, + "rewards/real": 24.648998260498047, + "step": 7470 + }, + { + "epoch": 0.8994708994708994, + "grad_norm": 10.941101545300283, + "learning_rate": 5.585248530197755e-08, + "logits/generated": -1.7807782888412476, + "logits/real": -1.8585500717163086, + "logps/generated": -591.650146484375, + "logps/real": -203.64344787597656, + "loss": 0.4428, + "rewards/accuracies": 1.0, + "rewards/generated": -16.4709415435791, + "rewards/margins": 34.7379150390625, + "rewards/real": 18.26697540283203, + "step": 7480 + }, + { + "epoch": 0.9006734006734006, + "grad_norm": 3.097175905320357, + "learning_rate": 5.518439337252806e-08, + "logits/generated": -1.7874730825424194, + "logits/real": -2.030160903930664, + "logps/generated": -834.2674560546875, + "logps/real": -270.4729919433594, + "loss": 0.4874, + "rewards/accuracies": 1.0, + "rewards/generated": -25.003660202026367, + "rewards/margins": 50.28822326660156, + "rewards/real": 25.284561157226562, + "step": 7490 + }, + { + "epoch": 0.9018759018759018, + "grad_norm": 224.99376236215403, + "learning_rate": 5.451630144307857e-08, + "logits/generated": -1.7494442462921143, + "logits/real": -2.023285150527954, + "logps/generated": -550.2077026367188, + "logps/real": -246.26657104492188, + "loss": 0.4641, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -9.512105941772461, + "rewards/margins": 34.51152420043945, + "rewards/real": 24.999414443969727, + "step": 7500 + }, + { + "epoch": 0.903078403078403, + "grad_norm": 4.694829544789701, + "learning_rate": 5.384820951362908e-08, + "logits/generated": -1.9265758991241455, + "logits/real": -1.9717557430267334, + "logps/generated": -588.18994140625, + "logps/real": -232.59988403320312, + "loss": 0.3716, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -11.967580795288086, + "rewards/margins": 32.77628707885742, + "rewards/real": 20.808704376220703, + "step": 7510 + }, + { + "epoch": 0.9042809042809042, + "grad_norm": 3.5710623458466215, + "learning_rate": 5.3180117584179583e-08, + "logits/generated": -1.8794788122177124, + "logits/real": -1.8858184814453125, + "logps/generated": -442.54931640625, + "logps/real": -209.28738403320312, + "loss": 0.5693, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.389479637145996, + "rewards/margins": 21.84439468383789, + "rewards/real": 16.454912185668945, + "step": 7520 + }, + { + "epoch": 0.9054834054834054, + "grad_norm": 6.677030844505423, + "learning_rate": 5.251202565473009e-08, + "logits/generated": -1.841382384300232, + "logits/real": -1.9387576580047607, + "logps/generated": -678.8121337890625, + "logps/real": -250.5919952392578, + "loss": 0.4173, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.574440002441406, + "rewards/margins": 35.772891998291016, + "rewards/real": 22.19845199584961, + "step": 7530 + }, + { + "epoch": 0.9066859066859067, + "grad_norm": 6.179269175547231, + "learning_rate": 5.1843933725280596e-08, + "logits/generated": -1.8453378677368164, + "logits/real": -1.8640903234481812, + "logps/generated": -596.0718994140625, + "logps/real": -291.5093688964844, + "loss": 0.3728, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.117281913757324, + "rewards/margins": 32.634605407714844, + "rewards/real": 23.517322540283203, + "step": 7540 + }, + { + "epoch": 0.9078884078884079, + "grad_norm": 107.4651123020714, + "learning_rate": 5.117584179583111e-08, + "logits/generated": -1.8386691808700562, + "logits/real": -1.8639602661132812, + "logps/generated": -549.8564453125, + "logps/real": -174.96145629882812, + "loss": 0.3114, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.235156059265137, + "rewards/margins": 30.93783950805664, + "rewards/real": 17.702680587768555, + "step": 7550 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 2.703509066798271, + "learning_rate": 5.0507749866381616e-08, + "logits/generated": -1.8066171407699585, + "logits/real": -1.8202998638153076, + "logps/generated": -568.2048950195312, + "logps/real": -237.21194458007812, + "loss": 0.3881, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.227745056152344, + "rewards/margins": 29.165557861328125, + "rewards/real": 21.93781089782715, + "step": 7560 + }, + { + "epoch": 0.9102934102934103, + "grad_norm": 403.3692528199429, + "learning_rate": 4.983965793693212e-08, + "logits/generated": -1.8202968835830688, + "logits/real": -1.9496692419052124, + "logps/generated": -574.8258056640625, + "logps/real": -235.11007690429688, + "loss": 0.4766, + "rewards/accuracies": 0.875, + "rewards/generated": -9.05565071105957, + "rewards/margins": 29.5675106048584, + "rewards/real": 20.511859893798828, + "step": 7570 + }, + { + "epoch": 0.9114959114959115, + "grad_norm": 860.5112240891744, + "learning_rate": 4.917156600748263e-08, + "logits/generated": -1.8284162282943726, + "logits/real": -1.9748990535736084, + "logps/generated": -445.96661376953125, + "logps/real": -191.8043975830078, + "loss": 0.3949, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -11.082816123962402, + "rewards/margins": 26.514028549194336, + "rewards/real": 15.4312105178833, + "step": 7580 + }, + { + "epoch": 0.9126984126984127, + "grad_norm": 8.460906879650542, + "learning_rate": 4.8503474078033135e-08, + "logits/generated": -1.8548015356063843, + "logits/real": -1.9636313915252686, + "logps/generated": -608.8326416015625, + "logps/real": -298.0274963378906, + "loss": 0.4868, + "rewards/accuracies": 1.0, + "rewards/generated": -5.996362209320068, + "rewards/margins": 34.14162063598633, + "rewards/real": 28.1452579498291, + "step": 7590 + }, + { + "epoch": 0.9139009139009139, + "grad_norm": 67.33088596415548, + "learning_rate": 4.783538214858365e-08, + "logits/generated": -1.8178812265396118, + "logits/real": -1.9490362405776978, + "logps/generated": -563.5987548828125, + "logps/real": -217.73654174804688, + "loss": 0.3139, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.452391624450684, + "rewards/margins": 32.99445724487305, + "rewards/real": 21.542064666748047, + "step": 7600 + }, + { + "epoch": 0.9151034151034151, + "grad_norm": 19.193494222713156, + "learning_rate": 4.7167290219134155e-08, + "logits/generated": -1.8279447555541992, + "logits/real": -1.9227333068847656, + "logps/generated": -546.6016845703125, + "logps/real": -231.05239868164062, + "loss": 0.3154, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.202812194824219, + "rewards/margins": 26.593332290649414, + "rewards/real": 19.390522003173828, + "step": 7610 + }, + { + "epoch": 0.9163059163059163, + "grad_norm": 289.4963148912311, + "learning_rate": 4.649919828968466e-08, + "logits/generated": -1.9175913333892822, + "logits/real": -1.9148343801498413, + "logps/generated": -547.1802978515625, + "logps/real": -256.5948791503906, + "loss": 0.4737, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -5.248676776885986, + "rewards/margins": 27.513641357421875, + "rewards/real": 22.264965057373047, + "step": 7620 + }, + { + "epoch": 0.9175084175084175, + "grad_norm": 159.8466242647042, + "learning_rate": 4.583110636023517e-08, + "logits/generated": -1.8012142181396484, + "logits/real": -1.998228669166565, + "logps/generated": -684.0794677734375, + "logps/real": -250.2084197998047, + "loss": 0.3076, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.82398509979248, + "rewards/margins": 35.56842803955078, + "rewards/real": 21.744447708129883, + "step": 7630 + }, + { + "epoch": 0.9187109187109187, + "grad_norm": 133.1085974341585, + "learning_rate": 4.5163014430785674e-08, + "logits/generated": -1.816612958908081, + "logits/real": -1.9270000457763672, + "logps/generated": -477.7117614746094, + "logps/real": -221.5418701171875, + "loss": 0.3852, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -6.828951835632324, + "rewards/margins": 27.315725326538086, + "rewards/real": 20.486770629882812, + "step": 7640 + }, + { + "epoch": 0.9199134199134199, + "grad_norm": 121.06992542134766, + "learning_rate": 4.449492250133619e-08, + "logits/generated": -1.9036048650741577, + "logits/real": -1.869901418685913, + "logps/generated": -550.589111328125, + "logps/real": -232.84207153320312, + "loss": 0.6221, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -5.372431755065918, + "rewards/margins": 27.593318939208984, + "rewards/real": 22.220890045166016, + "step": 7650 + }, + { + "epoch": 0.9211159211159211, + "grad_norm": 116.92142166137279, + "learning_rate": 4.3826830571886693e-08, + "logits/generated": -1.9088249206542969, + "logits/real": -1.9617713689804077, + "logps/generated": -629.1837768554688, + "logps/real": -232.0994415283203, + "loss": 0.6329, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -11.240312576293945, + "rewards/margins": 29.797321319580078, + "rewards/real": 18.557008743286133, + "step": 7660 + }, + { + "epoch": 0.9223184223184223, + "grad_norm": 591.6891703932573, + "learning_rate": 4.31587386424372e-08, + "logits/generated": -1.738337516784668, + "logits/real": -1.9371941089630127, + "logps/generated": -666.4129638671875, + "logps/real": -223.22177124023438, + "loss": 0.3358, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -16.73686981201172, + "rewards/margins": 35.94083023071289, + "rewards/real": 19.203960418701172, + "step": 7670 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 7.646952696003572, + "learning_rate": 4.2490646712987706e-08, + "logits/generated": -1.9599723815917969, + "logits/real": -1.9543098211288452, + "logps/generated": -584.2188720703125, + "logps/real": -254.26943969726562, + "loss": 0.3055, + "rewards/accuracies": 1.0, + "rewards/generated": -4.110929489135742, + "rewards/margins": 26.450088500976562, + "rewards/real": 22.339157104492188, + "step": 7680 + }, + { + "epoch": 0.9247234247234247, + "grad_norm": 153.4210062718791, + "learning_rate": 4.182255478353822e-08, + "logits/generated": -1.7582743167877197, + "logits/real": -1.9109766483306885, + "logps/generated": -707.858154296875, + "logps/real": -196.25381469726562, + "loss": 0.4507, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -20.299272537231445, + "rewards/margins": 39.022865295410156, + "rewards/real": 18.723594665527344, + "step": 7690 + }, + { + "epoch": 0.9259259259259259, + "grad_norm": 246.81391093438802, + "learning_rate": 4.115446285408872e-08, + "logits/generated": -1.8733514547348022, + "logits/real": -1.9827098846435547, + "logps/generated": -631.9744262695312, + "logps/real": -274.9596252441406, + "loss": 0.5578, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.418924331665039, + "rewards/margins": 37.10205841064453, + "rewards/real": 26.683135986328125, + "step": 7700 + }, + { + "epoch": 0.9271284271284271, + "grad_norm": 5.8986953364246135, + "learning_rate": 4.0486370924639226e-08, + "logits/generated": -1.851654291152954, + "logits/real": -1.9533793926239014, + "logps/generated": -510.91668701171875, + "logps/real": -226.32955932617188, + "loss": 0.5021, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.667630195617676, + "rewards/margins": 28.51548194885254, + "rewards/real": 20.847850799560547, + "step": 7710 + }, + { + "epoch": 0.9283309283309283, + "grad_norm": 620.9371507413915, + "learning_rate": 3.981827899518973e-08, + "logits/generated": -1.8319000005722046, + "logits/real": -1.921460509300232, + "logps/generated": -625.7000122070312, + "logps/real": -281.3758544921875, + "loss": 0.3173, + "rewards/accuracies": 1.0, + "rewards/generated": -9.334914207458496, + "rewards/margins": 33.2500114440918, + "rewards/real": 23.91509246826172, + "step": 7720 + }, + { + "epoch": 0.9295334295334295, + "grad_norm": 4.111441403423276, + "learning_rate": 3.915018706574024e-08, + "logits/generated": -1.820752739906311, + "logits/real": -1.9629709720611572, + "logps/generated": -457.3055725097656, + "logps/real": -180.87144470214844, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/generated": -13.024332046508789, + "rewards/margins": 28.568084716796875, + "rewards/real": 15.543754577636719, + "step": 7730 + }, + { + "epoch": 0.9307359307359307, + "grad_norm": 24.938895839700333, + "learning_rate": 3.848209513629075e-08, + "logits/generated": -1.8357629776000977, + "logits/real": -1.8417953252792358, + "logps/generated": -658.307373046875, + "logps/real": -215.5572509765625, + "loss": 0.2889, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -16.049457550048828, + "rewards/margins": 34.50643539428711, + "rewards/real": 18.456981658935547, + "step": 7740 + }, + { + "epoch": 0.931938431938432, + "grad_norm": 482.8381043273678, + "learning_rate": 3.781400320684126e-08, + "logits/generated": -1.8643558025360107, + "logits/real": -1.970956802368164, + "logps/generated": -572.5831298828125, + "logps/real": -224.4773406982422, + "loss": 0.4011, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -12.906872749328613, + "rewards/margins": 34.622318267822266, + "rewards/real": 21.715444564819336, + "step": 7750 + }, + { + "epoch": 0.9331409331409332, + "grad_norm": 7.937441822014419, + "learning_rate": 3.7145911277391765e-08, + "logits/generated": -1.7950923442840576, + "logits/real": -1.9632022380828857, + "logps/generated": -577.6339721679688, + "logps/real": -219.2605743408203, + "loss": 0.3931, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -13.48230266571045, + "rewards/margins": 34.0289192199707, + "rewards/real": 20.546619415283203, + "step": 7760 + }, + { + "epoch": 0.9343434343434344, + "grad_norm": 50.0230999113016, + "learning_rate": 3.647781934794227e-08, + "logits/generated": -1.9379491806030273, + "logits/real": -1.9466701745986938, + "logps/generated": -499.440673828125, + "logps/real": -211.3698272705078, + "loss": 0.6335, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.978289604187012, + "rewards/margins": 26.517309188842773, + "rewards/real": 18.539020538330078, + "step": 7770 + }, + { + "epoch": 0.9355459355459356, + "grad_norm": 1179.5098864731124, + "learning_rate": 3.580972741849278e-08, + "logits/generated": -1.8757632970809937, + "logits/real": -1.9674293994903564, + "logps/generated": -588.0816650390625, + "logps/real": -160.77267456054688, + "loss": 0.3409, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -13.619443893432617, + "rewards/margins": 29.020843505859375, + "rewards/real": 15.401400566101074, + "step": 7780 + }, + { + "epoch": 0.9367484367484368, + "grad_norm": 59.70100347128945, + "learning_rate": 3.514163548904329e-08, + "logits/generated": -1.8476343154907227, + "logits/real": -1.9887018203735352, + "logps/generated": -655.5538330078125, + "logps/real": -200.73013305664062, + "loss": 0.4802, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -15.746861457824707, + "rewards/margins": 33.513301849365234, + "rewards/real": 17.76644515991211, + "step": 7790 + }, + { + "epoch": 0.937950937950938, + "grad_norm": 471.2838623781861, + "learning_rate": 3.44735435595938e-08, + "logits/generated": -1.8844900131225586, + "logits/real": -1.8739242553710938, + "logps/generated": -547.0294799804688, + "logps/real": -203.09417724609375, + "loss": 0.5798, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.673928260803223, + "rewards/margins": 26.7031307220459, + "rewards/real": 19.02920150756836, + "step": 7800 + }, + { + "epoch": 0.9391534391534392, + "grad_norm": 562.4521467479547, + "learning_rate": 3.3805451630144303e-08, + "logits/generated": -1.9220783710479736, + "logits/real": -2.022700548171997, + "logps/generated": -571.9171752929688, + "logps/real": -246.066162109375, + "loss": 0.7754, + "rewards/accuracies": 0.875, + "rewards/generated": -4.068185329437256, + "rewards/margins": 24.824108123779297, + "rewards/real": 20.75592041015625, + "step": 7810 + }, + { + "epoch": 0.9403559403559404, + "grad_norm": 13.578179882188723, + "learning_rate": 3.313735970069481e-08, + "logits/generated": -1.7413349151611328, + "logits/real": -1.9711958169937134, + "logps/generated": -750.6766967773438, + "logps/real": -277.9291076660156, + "loss": 0.4597, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -19.52254295349121, + "rewards/margins": 42.818477630615234, + "rewards/real": 23.29593276977539, + "step": 7820 + }, + { + "epoch": 0.9415584415584416, + "grad_norm": 42.735858968267095, + "learning_rate": 3.2469267771245316e-08, + "logits/generated": -1.8657029867172241, + "logits/real": -1.9247665405273438, + "logps/generated": -535.1868286132812, + "logps/real": -262.0387878417969, + "loss": 0.5369, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -6.889112949371338, + "rewards/margins": 27.30415916442871, + "rewards/real": 20.415042877197266, + "step": 7830 + }, + { + "epoch": 0.9427609427609428, + "grad_norm": 6.034575029065627, + "learning_rate": 3.180117584179583e-08, + "logits/generated": -1.8643524646759033, + "logits/real": -1.9991130828857422, + "logps/generated": -705.0940551757812, + "logps/real": -234.36495971679688, + "loss": 0.4216, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -12.718157768249512, + "rewards/margins": 35.234375, + "rewards/real": 22.516218185424805, + "step": 7840 + }, + { + "epoch": 0.943963443963444, + "grad_norm": 6.797818967782297, + "learning_rate": 3.1133083912346336e-08, + "logits/generated": -1.8352415561676025, + "logits/real": -1.9554344415664673, + "logps/generated": -634.4024658203125, + "logps/real": -219.031005859375, + "loss": 0.6585, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -16.06719970703125, + "rewards/margins": 35.9827766418457, + "rewards/real": 19.915573120117188, + "step": 7850 + }, + { + "epoch": 0.9451659451659452, + "grad_norm": 117.8575004614085, + "learning_rate": 3.046499198289685e-08, + "logits/generated": -1.776283621788025, + "logits/real": -2.061842203140259, + "logps/generated": -661.2219848632812, + "logps/real": -201.66799926757812, + "loss": 0.3905, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -21.539260864257812, + "rewards/margins": 38.302513122558594, + "rewards/real": 16.763261795043945, + "step": 7860 + }, + { + "epoch": 0.9463684463684464, + "grad_norm": 47.35907395759234, + "learning_rate": 2.9796900053447355e-08, + "logits/generated": -1.8681166172027588, + "logits/real": -2.0195064544677734, + "logps/generated": -605.47509765625, + "logps/real": -249.17617797851562, + "loss": 0.4233, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.160433769226074, + "rewards/margins": 33.22639846801758, + "rewards/real": 24.065961837768555, + "step": 7870 + }, + { + "epoch": 0.9475709475709476, + "grad_norm": 484.60036455117483, + "learning_rate": 2.9128808123997862e-08, + "logits/generated": -1.8744513988494873, + "logits/real": -1.9709593057632446, + "logps/generated": -558.8253173828125, + "logps/real": -219.21353149414062, + "loss": 0.3166, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -9.859278678894043, + "rewards/margins": 30.427806854248047, + "rewards/real": 20.568531036376953, + "step": 7880 + }, + { + "epoch": 0.9487734487734488, + "grad_norm": 1459.3056570761464, + "learning_rate": 2.8460716194548368e-08, + "logits/generated": -1.9523365497589111, + "logits/real": -1.997300386428833, + "logps/generated": -650.0694580078125, + "logps/real": -231.16244506835938, + "loss": 0.3053, + "rewards/accuracies": 1.0, + "rewards/generated": -16.40847396850586, + "rewards/margins": 35.732765197753906, + "rewards/real": 19.324291229248047, + "step": 7890 + }, + { + "epoch": 0.94997594997595, + "grad_norm": 17.23400580291654, + "learning_rate": 2.7792624265098878e-08, + "logits/generated": -1.8673444986343384, + "logits/real": -2.0318591594696045, + "logps/generated": -631.63916015625, + "logps/real": -232.9110870361328, + "loss": 0.3388, + "rewards/accuracies": 1.0, + "rewards/generated": -13.709556579589844, + "rewards/margins": 36.20250701904297, + "rewards/real": 22.492950439453125, + "step": 7900 + }, + { + "epoch": 0.9511784511784511, + "grad_norm": 237.08902331170333, + "learning_rate": 2.7124532335649385e-08, + "logits/generated": -1.8248322010040283, + "logits/real": -1.9297415018081665, + "logps/generated": -616.5086059570312, + "logps/real": -268.40985107421875, + "loss": 0.6605, + "rewards/accuracies": 1.0, + "rewards/generated": -13.933581352233887, + "rewards/margins": 35.22268295288086, + "rewards/real": 21.289104461669922, + "step": 7910 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 131.26288463473537, + "learning_rate": 2.6456440406199894e-08, + "logits/generated": -1.9274238348007202, + "logits/real": -2.008357524871826, + "logps/generated": -648.6353759765625, + "logps/real": -253.4458770751953, + "loss": 0.5342, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.945533752441406, + "rewards/margins": 32.83995819091797, + "rewards/real": 22.894426345825195, + "step": 7920 + }, + { + "epoch": 0.9535834535834535, + "grad_norm": 96.54038153175982, + "learning_rate": 2.57883484767504e-08, + "logits/generated": -1.8905365467071533, + "logits/real": -1.9958155155181885, + "logps/generated": -624.479736328125, + "logps/real": -270.9004821777344, + "loss": 0.4429, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -10.002233505249023, + "rewards/margins": 35.480838775634766, + "rewards/real": 25.47860336303711, + "step": 7930 + }, + { + "epoch": 0.9547859547859547, + "grad_norm": 44.17128416741591, + "learning_rate": 2.512025654730091e-08, + "logits/generated": -1.9015429019927979, + "logits/real": -2.0088698863983154, + "logps/generated": -656.9166259765625, + "logps/real": -232.25119018554688, + "loss": 0.3935, + "rewards/accuracies": 1.0, + "rewards/generated": -9.713232040405273, + "rewards/margins": 33.862403869628906, + "rewards/real": 24.149173736572266, + "step": 7940 + }, + { + "epoch": 0.9559884559884559, + "grad_norm": 3.537761969196036, + "learning_rate": 2.4452164617851417e-08, + "logits/generated": -1.862546682357788, + "logits/real": -1.9948028326034546, + "logps/generated": -537.6753540039062, + "logps/real": -192.64080810546875, + "loss": 0.3979, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -11.830598831176758, + "rewards/margins": 29.180246353149414, + "rewards/real": 17.349647521972656, + "step": 7950 + }, + { + "epoch": 0.9571909571909571, + "grad_norm": 1562.5980622278403, + "learning_rate": 2.3784072688401923e-08, + "logits/generated": -1.9070017337799072, + "logits/real": -1.9483369588851929, + "logps/generated": -572.1051025390625, + "logps/real": -230.68423461914062, + "loss": 0.6042, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -9.48015022277832, + "rewards/margins": 28.934789657592773, + "rewards/real": 19.454639434814453, + "step": 7960 + }, + { + "epoch": 0.9583934583934584, + "grad_norm": 204.34010807600555, + "learning_rate": 2.3115980758952433e-08, + "logits/generated": -1.9353723526000977, + "logits/real": -2.026029109954834, + "logps/generated": -581.7493286132812, + "logps/real": -226.6553955078125, + "loss": 0.5636, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -9.081565856933594, + "rewards/margins": 29.828304290771484, + "rewards/real": 20.746736526489258, + "step": 7970 + }, + { + "epoch": 0.9595959595959596, + "grad_norm": 37.015019863286604, + "learning_rate": 2.244788882950294e-08, + "logits/generated": -1.8648122549057007, + "logits/real": -1.9446719884872437, + "logps/generated": -450.04541015625, + "logps/real": -186.21939086914062, + "loss": 0.473, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -7.291723728179932, + "rewards/margins": 24.546016693115234, + "rewards/real": 17.25429344177246, + "step": 7980 + }, + { + "epoch": 0.9607984607984608, + "grad_norm": 9.915964783029532, + "learning_rate": 2.177979690005345e-08, + "logits/generated": -1.8747150897979736, + "logits/real": -1.9883369207382202, + "logps/generated": -540.0999755859375, + "logps/real": -238.61148071289062, + "loss": 0.4058, + "rewards/accuracies": 1.0, + "rewards/generated": -6.2435173988342285, + "rewards/margins": 29.89438247680664, + "rewards/real": 23.650863647460938, + "step": 7990 + }, + { + "epoch": 0.962000962000962, + "grad_norm": 284.84969341031046, + "learning_rate": 2.1111704970603956e-08, + "logits/generated": -1.9531828165054321, + "logits/real": -1.9930452108383179, + "logps/generated": -644.8855590820312, + "logps/real": -250.3666229248047, + "loss": 0.5954, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -11.983048439025879, + "rewards/margins": 34.054222106933594, + "rewards/real": 22.071170806884766, + "step": 8000 + }, + { + "epoch": 0.962000962000962, + "eval_logits/generated": -1.948014259338379, + "eval_logits/real": -2.000520706176758, + "eval_logps/generated": -562.68896484375, + "eval_logps/real": -250.2119903564453, + "eval_loss": 0.39723077416419983, + "eval_rewards/accuracies": 0.9642857313156128, + "eval_rewards/generated": -7.290745258331299, + "eval_rewards/margins": 29.396650314331055, + "eval_rewards/real": 22.105905532836914, + "eval_runtime": 158.3174, + "eval_samples_per_second": 6.316, + "eval_steps_per_second": 0.531, + "step": 8000 + }, + { + "epoch": 0.9632034632034632, + "grad_norm": 6.080219689125808, + "learning_rate": 2.044361304115446e-08, + "logits/generated": -1.8047806024551392, + "logits/real": -1.9242351055145264, + "logps/generated": -474.2330017089844, + "logps/real": -185.81861877441406, + "loss": 0.5476, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -10.688553810119629, + "rewards/margins": 27.152202606201172, + "rewards/real": 16.46364974975586, + "step": 8010 + }, + { + "epoch": 0.9644059644059644, + "grad_norm": 4.506819732896348, + "learning_rate": 1.977552111170497e-08, + "logits/generated": -1.803545355796814, + "logits/real": -1.9313043355941772, + "logps/generated": -510.97833251953125, + "logps/real": -190.4078826904297, + "loss": 0.2868, + "rewards/accuracies": 1.0, + "rewards/generated": -14.387975692749023, + "rewards/margins": 33.485694885253906, + "rewards/real": 19.097719192504883, + "step": 8020 + }, + { + "epoch": 0.9656084656084656, + "grad_norm": 175.28657069353605, + "learning_rate": 1.9107429182255475e-08, + "logits/generated": -1.907098412513733, + "logits/real": -2.042231798171997, + "logps/generated": -617.7903442382812, + "logps/real": -253.0595703125, + "loss": 0.4869, + "rewards/accuracies": 1.0, + "rewards/generated": -14.982889175415039, + "rewards/margins": 37.402259826660156, + "rewards/real": 22.419368743896484, + "step": 8030 + }, + { + "epoch": 0.9668109668109668, + "grad_norm": 4.54301889959483, + "learning_rate": 1.8439337252805985e-08, + "logits/generated": -1.8733489513397217, + "logits/real": -1.9778435230255127, + "logps/generated": -534.8236083984375, + "logps/real": -225.927490234375, + "loss": 0.3097, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -7.314043998718262, + "rewards/margins": 27.533193588256836, + "rewards/real": 20.219152450561523, + "step": 8040 + }, + { + "epoch": 0.968013468013468, + "grad_norm": 3.303525626665763, + "learning_rate": 1.777124532335649e-08, + "logits/generated": -1.9220225811004639, + "logits/real": -2.0492451190948486, + "logps/generated": -711.3641357421875, + "logps/real": -279.6134948730469, + "loss": 0.8231, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -12.18855094909668, + "rewards/margins": 38.05695343017578, + "rewards/real": 25.8684024810791, + "step": 8050 + }, + { + "epoch": 0.9692159692159692, + "grad_norm": 122.83276724360225, + "learning_rate": 1.7103153393906998e-08, + "logits/generated": -1.8907839059829712, + "logits/real": -1.9493697881698608, + "logps/generated": -463.2969665527344, + "logps/real": -166.5428924560547, + "loss": 0.3734, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -8.571676254272461, + "rewards/margins": 24.737638473510742, + "rewards/real": 16.16596221923828, + "step": 8060 + }, + { + "epoch": 0.9704184704184704, + "grad_norm": 15.856534381934436, + "learning_rate": 1.6435061464457508e-08, + "logits/generated": -1.8526527881622314, + "logits/real": -1.9400050640106201, + "logps/generated": -560.2443237304688, + "logps/real": -285.6490173339844, + "loss": 0.4117, + "rewards/accuracies": 1.0, + "rewards/generated": -7.392428398132324, + "rewards/margins": 32.72213363647461, + "rewards/real": 25.3297061920166, + "step": 8070 + }, + { + "epoch": 0.9716209716209716, + "grad_norm": 925.17430026782, + "learning_rate": 1.5766969535008014e-08, + "logits/generated": -1.8764747381210327, + "logits/real": -1.849029541015625, + "logps/generated": -541.9317626953125, + "logps/real": -256.1459655761719, + "loss": 0.4275, + "rewards/accuracies": 1.0, + "rewards/generated": -8.22850513458252, + "rewards/margins": 30.15728759765625, + "rewards/real": 21.928783416748047, + "step": 8080 + }, + { + "epoch": 0.9728234728234728, + "grad_norm": 132.89207443382926, + "learning_rate": 1.5098877605558524e-08, + "logits/generated": -1.89582097530365, + "logits/real": -1.959554672241211, + "logps/generated": -586.2793579101562, + "logps/real": -245.7548370361328, + "loss": 0.7542, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -10.676665306091309, + "rewards/margins": 32.07061004638672, + "rewards/real": 21.39394760131836, + "step": 8090 + }, + { + "epoch": 0.974025974025974, + "grad_norm": 27.941911449310695, + "learning_rate": 1.4430785676109032e-08, + "logits/generated": -1.8876546621322632, + "logits/real": -1.995438814163208, + "logps/generated": -578.7598876953125, + "logps/real": -333.9859619140625, + "loss": 0.5529, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -1.251502275466919, + "rewards/margins": 31.09954261779785, + "rewards/real": 29.84804344177246, + "step": 8100 + }, + { + "epoch": 0.9752284752284752, + "grad_norm": 235.98287723233014, + "learning_rate": 1.376269374665954e-08, + "logits/generated": -1.9337660074234009, + "logits/real": -2.0056357383728027, + "logps/generated": -513.9544677734375, + "logps/real": -266.8468322753906, + "loss": 0.4692, + "rewards/accuracies": 1.0, + "rewards/generated": -6.389153480529785, + "rewards/margins": 30.21062660217285, + "rewards/real": 23.821468353271484, + "step": 8110 + }, + { + "epoch": 0.9764309764309764, + "grad_norm": 4.208306008543635, + "learning_rate": 1.3094601817210048e-08, + "logits/generated": -1.8793519735336304, + "logits/real": -1.9217697381973267, + "logps/generated": -601.8344116210938, + "logps/real": -319.68206787109375, + "loss": 0.5761, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -3.4111621379852295, + "rewards/margins": 29.1977481842041, + "rewards/real": 25.786584854125977, + "step": 8120 + }, + { + "epoch": 0.9776334776334776, + "grad_norm": 231.02284426195752, + "learning_rate": 1.2426509887760556e-08, + "logits/generated": -1.8332197666168213, + "logits/real": -1.977024793624878, + "logps/generated": -442.82916259765625, + "logps/real": -210.1693115234375, + "loss": 0.5707, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.085574150085449, + "rewards/margins": 27.40066146850586, + "rewards/real": 20.315088272094727, + "step": 8130 + }, + { + "epoch": 0.9788359788359788, + "grad_norm": 153.1950357875449, + "learning_rate": 1.1758417958311064e-08, + "logits/generated": -1.8746535778045654, + "logits/real": -1.960744857788086, + "logps/generated": -535.58642578125, + "logps/real": -210.9016876220703, + "loss": 0.5227, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -5.577829837799072, + "rewards/margins": 25.187856674194336, + "rewards/real": 19.610027313232422, + "step": 8140 + }, + { + "epoch": 0.98003848003848, + "grad_norm": 189.62900666295315, + "learning_rate": 1.1090326028861572e-08, + "logits/generated": -1.899243950843811, + "logits/real": -1.9381084442138672, + "logps/generated": -654.8983154296875, + "logps/real": -235.5189971923828, + "loss": 0.6281, + "rewards/accuracies": 1.0, + "rewards/generated": -11.314410209655762, + "rewards/margins": 32.45915985107422, + "rewards/real": 21.144752502441406, + "step": 8150 + }, + { + "epoch": 0.9812409812409812, + "grad_norm": 19.043560298229426, + "learning_rate": 1.0422234099412079e-08, + "logits/generated": -1.8975727558135986, + "logits/real": -2.0573973655700684, + "logps/generated": -684.2650146484375, + "logps/real": -272.1838684082031, + "loss": 0.4934, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.558279991149902, + "rewards/margins": 32.49468994140625, + "rewards/real": 25.936412811279297, + "step": 8160 + }, + { + "epoch": 0.9824434824434825, + "grad_norm": 255.80157344973264, + "learning_rate": 9.754142169962585e-09, + "logits/generated": -1.868058204650879, + "logits/real": -1.9154685735702515, + "logps/generated": -542.9749755859375, + "logps/real": -217.7202911376953, + "loss": 0.7706, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -6.893022060394287, + "rewards/margins": 25.462129592895508, + "rewards/real": 18.569110870361328, + "step": 8170 + }, + { + "epoch": 0.9836459836459837, + "grad_norm": 729.4401918580012, + "learning_rate": 9.086050240513093e-09, + "logits/generated": -1.9020189046859741, + "logits/real": -1.948012351989746, + "logps/generated": -636.56298828125, + "logps/real": -270.43939208984375, + "loss": 0.4966, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -10.260741233825684, + "rewards/margins": 33.59025192260742, + "rewards/real": 23.329511642456055, + "step": 8180 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 494.7696205202936, + "learning_rate": 8.417958311063602e-09, + "logits/generated": -1.8801816701889038, + "logits/real": -1.9777199029922485, + "logps/generated": -511.88519287109375, + "logps/real": -207.6827850341797, + "loss": 0.5856, + "rewards/accuracies": 0.8999999761581421, + "rewards/generated": -6.680776119232178, + "rewards/margins": 26.44734764099121, + "rewards/real": 19.766572952270508, + "step": 8190 + }, + { + "epoch": 0.9860509860509861, + "grad_norm": 524.9149081458041, + "learning_rate": 7.74986638161411e-09, + "logits/generated": -1.695943832397461, + "logits/real": -1.8408949375152588, + "logps/generated": -603.311767578125, + "logps/real": -169.5996856689453, + "loss": 0.375, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -17.913097381591797, + "rewards/margins": 33.64246368408203, + "rewards/real": 15.729368209838867, + "step": 8200 + }, + { + "epoch": 0.9872534872534873, + "grad_norm": 777.969282884708, + "learning_rate": 7.081774452164618e-09, + "logits/generated": -1.8220351934432983, + "logits/real": -1.9063689708709717, + "logps/generated": -680.7005004882812, + "logps/real": -282.27911376953125, + "loss": 0.4907, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -12.420315742492676, + "rewards/margins": 33.469173431396484, + "rewards/real": 21.04886245727539, + "step": 8210 + }, + { + "epoch": 0.9884559884559885, + "grad_norm": 150.28547000576296, + "learning_rate": 6.413682522715126e-09, + "logits/generated": -1.8631532192230225, + "logits/real": -2.0302681922912598, + "logps/generated": -725.7526245117188, + "logps/real": -301.81536865234375, + "loss": 0.5798, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -12.97132396697998, + "rewards/margins": 36.01592254638672, + "rewards/real": 23.04459571838379, + "step": 8220 + }, + { + "epoch": 0.9896584896584897, + "grad_norm": 7.4342139328404055, + "learning_rate": 5.745590593265633e-09, + "logits/generated": -1.8681774139404297, + "logits/real": -1.9475829601287842, + "logps/generated": -545.0968017578125, + "logps/real": -263.8797912597656, + "loss": 0.4305, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -4.504101753234863, + "rewards/margins": 27.534839630126953, + "rewards/real": 23.030738830566406, + "step": 8230 + }, + { + "epoch": 0.9908609908609909, + "grad_norm": 5.857174381469028, + "learning_rate": 5.07749866381614e-09, + "logits/generated": -1.9394757747650146, + "logits/real": -2.0266969203948975, + "logps/generated": -588.8060302734375, + "logps/real": -285.57464599609375, + "loss": 0.5886, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -3.924767255783081, + "rewards/margins": 28.511524200439453, + "rewards/real": 24.58675193786621, + "step": 8240 + }, + { + "epoch": 0.9920634920634921, + "grad_norm": 998.4137066643937, + "learning_rate": 4.4094067343666485e-09, + "logits/generated": -1.8733768463134766, + "logits/real": -1.9439998865127563, + "logps/generated": -653.6117553710938, + "logps/real": -255.6890106201172, + "loss": 0.5714, + "rewards/accuracies": 0.9750000238418579, + "rewards/generated": -8.273148536682129, + "rewards/margins": 30.489307403564453, + "rewards/real": 22.216154098510742, + "step": 8250 + }, + { + "epoch": 0.9932659932659933, + "grad_norm": 92.09120705704146, + "learning_rate": 3.741314804917157e-09, + "logits/generated": -1.8796441555023193, + "logits/real": -1.9806550741195679, + "logps/generated": -520.3734130859375, + "logps/real": -201.2113037109375, + "loss": 0.5388, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -7.409041404724121, + "rewards/margins": 25.22804832458496, + "rewards/real": 17.819007873535156, + "step": 8260 + }, + { + "epoch": 0.9944684944684945, + "grad_norm": 16.67388472375853, + "learning_rate": 3.0732228754676643e-09, + "logits/generated": -1.8960243463516235, + "logits/real": -1.9444904327392578, + "logps/generated": -660.3632202148438, + "logps/real": -218.44381713867188, + "loss": 0.2777, + "rewards/accuracies": 1.0, + "rewards/generated": -15.783859252929688, + "rewards/margins": 37.282047271728516, + "rewards/real": 21.498188018798828, + "step": 8270 + }, + { + "epoch": 0.9956709956709957, + "grad_norm": 26.501938148729327, + "learning_rate": 2.405130946018172e-09, + "logits/generated": -1.8098771572113037, + "logits/real": -1.9966379404067993, + "logps/generated": -601.8561401367188, + "logps/real": -264.85504150390625, + "loss": 0.4386, + "rewards/accuracies": 0.925000011920929, + "rewards/generated": -9.823989868164062, + "rewards/margins": 34.13655471801758, + "rewards/real": 24.31256103515625, + "step": 8280 + }, + { + "epoch": 0.9968734968734969, + "grad_norm": 41.47023906797856, + "learning_rate": 1.7370390165686799e-09, + "logits/generated": -1.7665832042694092, + "logits/real": -1.8990328311920166, + "logps/generated": -603.8845825195312, + "logps/real": -192.68128967285156, + "loss": 0.4049, + "rewards/accuracies": 1.0, + "rewards/generated": -14.44365406036377, + "rewards/margins": 31.124195098876953, + "rewards/real": 16.680543899536133, + "step": 8290 + }, + { + "epoch": 0.9980759980759981, + "grad_norm": 16.051955841539332, + "learning_rate": 1.0689470871191874e-09, + "logits/generated": -1.9049867391586304, + "logits/real": -1.9545881748199463, + "logps/generated": -469.04022216796875, + "logps/real": -204.06179809570312, + "loss": 0.5172, + "rewards/accuracies": 0.8500000238418579, + "rewards/generated": -0.7458006739616394, + "rewards/margins": 19.346948623657227, + "rewards/real": 18.601146697998047, + "step": 8300 + }, + { + "epoch": 0.9992784992784993, + "grad_norm": 366.90768126900497, + "learning_rate": 4.0085515766969536e-10, + "logits/generated": -1.8730134963989258, + "logits/real": -1.986374855041504, + "logps/generated": -415.9329528808594, + "logps/real": -208.17019653320312, + "loss": 0.5692, + "rewards/accuracies": 0.949999988079071, + "rewards/generated": -2.6481776237487793, + "rewards/margins": 19.210948944091797, + "rewards/real": 16.562774658203125, + "step": 8310 + }, + { + "epoch": 1.0, + "step": 8316, + "total_flos": 0.0, + "train_loss": 0.5495538560314325, + "train_runtime": 35404.0549, + "train_samples_per_second": 2.819, + "train_steps_per_second": 0.235 + } + ], + "logging_steps": 10, + "max_steps": 8316, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}