diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,20202 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 7.964444444444444, + "eval_steps": 100, + "global_step": 1344, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005925925925925926, + "grad_norm": 54.73865774156835, + "learning_rate": 3.7037037037037036e-09, + "logits/chosen": -1.6551780700683594, + "logits/rejected": -1.6470587253570557, + "logps/chosen": -42.52139663696289, + "logps/rejected": -48.890506744384766, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.011851851851851851, + "grad_norm": 50.35685724767536, + "learning_rate": 7.407407407407407e-09, + "logits/chosen": -1.1584198474884033, + "logits/rejected": -1.2945518493652344, + "logps/chosen": -42.262428283691406, + "logps/rejected": -47.62751007080078, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 54.59371328811371, + "learning_rate": 1.111111111111111e-08, + "logits/chosen": -1.4449834823608398, + "logits/rejected": -1.347031831741333, + "logps/chosen": -38.551475524902344, + "logps/rejected": -56.001258850097656, + "loss": 0.7056, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04485452175140381, + "rewards/margins": -0.044418931007385254, + "rewards/rejected": -0.0004355907440185547, + "step": 3 + }, + { + "epoch": 0.023703703703703703, + "grad_norm": 51.501597470917936, + "learning_rate": 1.4814814814814814e-08, + "logits/chosen": -1.69174063205719, + "logits/rejected": -1.577598214149475, + "logps/chosen": -30.320907592773438, + "logps/rejected": -46.535797119140625, + "loss": 0.7038, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0048062801361083984, + "rewards/margins": 0.05189502239227295, + "rewards/rejected": -0.04708874225616455, + "step": 4 + }, + { + "epoch": 0.02962962962962963, + "grad_norm": 51.34455089675247, + "learning_rate": 1.8518518518518518e-08, + "logits/chosen": -1.6288294792175293, + "logits/rejected": -1.5976742506027222, + "logps/chosen": -33.35110855102539, + "logps/rejected": -48.16716766357422, + "loss": 0.7014, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00041840970516204834, + "rewards/margins": -0.032079800963401794, + "rewards/rejected": 0.031661391258239746, + "step": 5 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 47.841287211454, + "learning_rate": 2.222222222222222e-08, + "logits/chosen": -1.4972864389419556, + "logits/rejected": -1.5760784149169922, + "logps/chosen": -35.15846252441406, + "logps/rejected": -35.15956497192383, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0043003857135772705, + "rewards/margins": -0.031109660863876343, + "rewards/rejected": 0.03541004657745361, + "step": 6 + }, + { + "epoch": 0.04148148148148148, + "grad_norm": 48.17008791092014, + "learning_rate": 2.5925925925925923e-08, + "logits/chosen": -1.2874022722244263, + "logits/rejected": -1.339775562286377, + "logps/chosen": -38.65494155883789, + "logps/rejected": -44.154571533203125, + "loss": 0.6844, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02324095368385315, + "rewards/margins": 0.07515916228294373, + "rewards/rejected": -0.051918208599090576, + "step": 7 + }, + { + "epoch": 0.047407407407407405, + "grad_norm": 53.97607510739969, + "learning_rate": 2.962962962962963e-08, + "logits/chosen": -1.6390717029571533, + "logits/rejected": -1.5105955600738525, + "logps/chosen": -34.070068359375, + "logps/rejected": -48.21985626220703, + "loss": 0.7044, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.015645623207092285, + "rewards/margins": 0.03352612257003784, + "rewards/rejected": -0.017880499362945557, + "step": 8 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 50.44575738510241, + "learning_rate": 3.3333333333333334e-08, + "logits/chosen": -1.9325859546661377, + "logits/rejected": -1.77297043800354, + "logps/chosen": -54.47149658203125, + "logps/rejected": -57.60348892211914, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00028821825981140137, + "rewards/margins": 0.010739117860794067, + "rewards/rejected": -0.011027336120605469, + "step": 9 + }, + { + "epoch": 0.05925925925925926, + "grad_norm": 51.58385642006147, + "learning_rate": 3.7037037037037036e-08, + "logits/chosen": -1.6119937896728516, + "logits/rejected": -1.7191580533981323, + "logps/chosen": -41.67599105834961, + "logps/rejected": -42.40376663208008, + "loss": 0.7032, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.013362348079681396, + "rewards/margins": -0.01079791784286499, + "rewards/rejected": 0.024160265922546387, + "step": 10 + }, + { + "epoch": 0.06518518518518518, + "grad_norm": 54.37783035721775, + "learning_rate": 4.0740740740740745e-08, + "logits/chosen": -1.2692227363586426, + "logits/rejected": -1.355187177658081, + "logps/chosen": -43.08770751953125, + "logps/rejected": -46.87504577636719, + "loss": 0.6969, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.028767406940460205, + "rewards/margins": -0.06381511688232422, + "rewards/rejected": 0.035047709941864014, + "step": 11 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 56.74473973643056, + "learning_rate": 4.444444444444444e-08, + "logits/chosen": -1.8102182149887085, + "logits/rejected": -1.938857078552246, + "logps/chosen": -43.0614013671875, + "logps/rejected": -44.25722122192383, + "loss": 0.7345, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.0229855477809906, + "rewards/margins": -0.07008519768714905, + "rewards/rejected": 0.04709964990615845, + "step": 12 + }, + { + "epoch": 0.07703703703703704, + "grad_norm": 46.45521014700028, + "learning_rate": 4.814814814814814e-08, + "logits/chosen": -1.4651801586151123, + "logits/rejected": -1.4712345600128174, + "logps/chosen": -32.00542449951172, + "logps/rejected": -37.7706413269043, + "loss": 0.6979, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.019961148500442505, + "rewards/margins": 0.01969766616821289, + "rewards/rejected": -0.039658814668655396, + "step": 13 + }, + { + "epoch": 0.08296296296296296, + "grad_norm": 53.72462131673607, + "learning_rate": 5.1851851851851846e-08, + "logits/chosen": -1.7651007175445557, + "logits/rejected": -1.5879353284835815, + "logps/chosen": -37.73487091064453, + "logps/rejected": -59.109642028808594, + "loss": 0.7008, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.019394323229789734, + "rewards/margins": -0.01382051408290863, + "rewards/rejected": -0.0055738091468811035, + "step": 14 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 57.44517565738861, + "learning_rate": 5.555555555555555e-08, + "logits/chosen": -1.8541333675384521, + "logits/rejected": -1.7781238555908203, + "logps/chosen": -33.13742446899414, + "logps/rejected": -45.20591354370117, + "loss": 0.6984, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.016418397426605225, + "rewards/margins": -0.0519789457321167, + "rewards/rejected": 0.035560548305511475, + "step": 15 + }, + { + "epoch": 0.09481481481481481, + "grad_norm": 51.361031686541665, + "learning_rate": 5.925925925925926e-08, + "logits/chosen": -1.5105748176574707, + "logits/rejected": -1.3466538190841675, + "logps/chosen": -38.21760177612305, + "logps/rejected": -50.58500289916992, + "loss": 0.7051, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03620937466621399, + "rewards/margins": -0.012676209211349487, + "rewards/rejected": -0.023533165454864502, + "step": 16 + }, + { + "epoch": 0.10074074074074074, + "grad_norm": 54.99448299996414, + "learning_rate": 6.296296296296296e-08, + "logits/chosen": -1.2743877172470093, + "logits/rejected": -1.5392966270446777, + "logps/chosen": -45.76844787597656, + "logps/rejected": -44.51301574707031, + "loss": 0.7044, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02413499355316162, + "rewards/margins": 0.03405413031578064, + "rewards/rejected": -0.009919136762619019, + "step": 17 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 52.90769744924133, + "learning_rate": 6.666666666666667e-08, + "logits/chosen": -1.394054651260376, + "logits/rejected": -1.660745620727539, + "logps/chosen": -51.191001892089844, + "logps/rejected": -39.66780090332031, + "loss": 0.688, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04243624210357666, + "rewards/margins": -0.002468883991241455, + "rewards/rejected": 0.044905126094818115, + "step": 18 + }, + { + "epoch": 0.11259259259259259, + "grad_norm": 52.73976531396199, + "learning_rate": 7.037037037037038e-08, + "logits/chosen": -1.643120527267456, + "logits/rejected": -1.6542155742645264, + "logps/chosen": -42.54930114746094, + "logps/rejected": -46.6912956237793, + "loss": 0.6981, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.037486732006073, + "rewards/margins": 0.02916562557220459, + "rewards/rejected": -0.06665235757827759, + "step": 19 + }, + { + "epoch": 0.11851851851851852, + "grad_norm": 51.77338754692824, + "learning_rate": 7.407407407407407e-08, + "logits/chosen": -1.9419344663619995, + "logits/rejected": -1.7904196977615356, + "logps/chosen": -33.6644287109375, + "logps/rejected": -50.46963882446289, + "loss": 0.6912, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.003351449966430664, + "rewards/margins": -0.023046374320983887, + "rewards/rejected": 0.02639782428741455, + "step": 20 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 55.682792202937584, + "learning_rate": 7.777777777777778e-08, + "logits/chosen": -1.6269031763076782, + "logits/rejected": -1.5598448514938354, + "logps/chosen": -44.19285202026367, + "logps/rejected": -54.80866241455078, + "loss": 0.6673, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0722590982913971, + "rewards/margins": 0.0795658528804779, + "rewards/rejected": -0.0073067545890808105, + "step": 21 + }, + { + "epoch": 0.13037037037037036, + "grad_norm": 54.26723284603444, + "learning_rate": 8.148148148148149e-08, + "logits/chosen": -1.5746511220932007, + "logits/rejected": -1.4856189489364624, + "logps/chosen": -31.291728973388672, + "logps/rejected": -37.666141510009766, + "loss": 0.7061, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.011329293251037598, + "rewards/margins": -0.0033955276012420654, + "rewards/rejected": -0.007933765649795532, + "step": 22 + }, + { + "epoch": 0.1362962962962963, + "grad_norm": 49.26655807512015, + "learning_rate": 8.518518518518517e-08, + "logits/chosen": -1.3974454402923584, + "logits/rejected": -1.2355865240097046, + "logps/chosen": -33.44588851928711, + "logps/rejected": -44.378822326660156, + "loss": 0.7064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016000211238861084, + "rewards/margins": 0.05508500337600708, + "rewards/rejected": -0.039084792137145996, + "step": 23 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 53.748442869248805, + "learning_rate": 8.888888888888888e-08, + "logits/chosen": -2.1273233890533447, + "logits/rejected": -1.962372899055481, + "logps/chosen": -43.61072540283203, + "logps/rejected": -52.419864654541016, + "loss": 0.7231, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.054617494344711304, + "rewards/margins": -0.019975215196609497, + "rewards/rejected": 0.0745927095413208, + "step": 24 + }, + { + "epoch": 0.14814814814814814, + "grad_norm": 48.38445431031237, + "learning_rate": 9.259259259259258e-08, + "logits/chosen": -1.7700583934783936, + "logits/rejected": -1.822185754776001, + "logps/chosen": -38.442745208740234, + "logps/rejected": -49.97007751464844, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014503806829452515, + "rewards/margins": 0.08013251423835754, + "rewards/rejected": -0.06562870740890503, + "step": 25 + }, + { + "epoch": 0.15407407407407409, + "grad_norm": 53.61488969779955, + "learning_rate": 9.629629629629629e-08, + "logits/chosen": -1.6693568229675293, + "logits/rejected": -1.666528582572937, + "logps/chosen": -43.25126266479492, + "logps/rejected": -53.571266174316406, + "loss": 0.7053, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.022118449211120605, + "rewards/margins": 0.0469089150428772, + "rewards/rejected": -0.024790465831756592, + "step": 26 + }, + { + "epoch": 0.16, + "grad_norm": 59.47709615901772, + "learning_rate": 1e-07, + "logits/chosen": -1.3913928270339966, + "logits/rejected": -1.6422309875488281, + "logps/chosen": -48.43421936035156, + "logps/rejected": -44.25364685058594, + "loss": 0.6948, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.02788090705871582, + "rewards/margins": 0.027790188789367676, + "rewards/rejected": 9.071826934814453e-05, + "step": 27 + }, + { + "epoch": 0.16592592592592592, + "grad_norm": 51.38601767812816, + "learning_rate": 1.0370370370370369e-07, + "logits/chosen": -1.5396907329559326, + "logits/rejected": -1.4416035413742065, + "logps/chosen": -34.06419372558594, + "logps/rejected": -43.13043975830078, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027874916791915894, + "rewards/margins": -0.0011770427227020264, + "rewards/rejected": -0.026697874069213867, + "step": 28 + }, + { + "epoch": 0.17185185185185184, + "grad_norm": 49.75849658704746, + "learning_rate": 1.074074074074074e-07, + "logits/chosen": -1.2683113813400269, + "logits/rejected": -0.9843475222587585, + "logps/chosen": -25.81869888305664, + "logps/rejected": -43.70813751220703, + "loss": 0.7049, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03348975256085396, + "rewards/margins": -0.026481706649065018, + "rewards/rejected": -0.00700804591178894, + "step": 29 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 52.49502678815258, + "learning_rate": 1.111111111111111e-07, + "logits/chosen": -1.413278341293335, + "logits/rejected": -1.4029860496520996, + "logps/chosen": -37.33210754394531, + "logps/rejected": -52.769691467285156, + "loss": 0.6962, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.013176381587982178, + "rewards/margins": -0.05075275897979736, + "rewards/rejected": 0.037576377391815186, + "step": 30 + }, + { + "epoch": 0.1837037037037037, + "grad_norm": 55.82883153656641, + "learning_rate": 1.148148148148148e-07, + "logits/chosen": -1.6182873249053955, + "logits/rejected": -1.5029940605163574, + "logps/chosen": -38.19645309448242, + "logps/rejected": -54.64601135253906, + "loss": 0.6943, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04735572636127472, + "rewards/margins": 0.0014861971139907837, + "rewards/rejected": 0.045869529247283936, + "step": 31 + }, + { + "epoch": 0.18962962962962962, + "grad_norm": 51.73567756412023, + "learning_rate": 1.1851851851851851e-07, + "logits/chosen": -1.3902158737182617, + "logits/rejected": -1.4902468919754028, + "logps/chosen": -44.617671966552734, + "logps/rejected": -50.17817687988281, + "loss": 0.7021, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0382617712020874, + "rewards/margins": 0.020143568515777588, + "rewards/rejected": 0.018118202686309814, + "step": 32 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 51.46866040359476, + "learning_rate": 1.2222222222222222e-07, + "logits/chosen": -1.5765653848648071, + "logits/rejected": -1.7073959112167358, + "logps/chosen": -43.03544998168945, + "logps/rejected": -42.10237121582031, + "loss": 0.6699, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0025068819522857666, + "rewards/margins": 0.0735115110874176, + "rewards/rejected": -0.07100462913513184, + "step": 33 + }, + { + "epoch": 0.20148148148148148, + "grad_norm": 52.04875175615269, + "learning_rate": 1.2592592592592592e-07, + "logits/chosen": -0.9567450284957886, + "logits/rejected": -0.9443216919898987, + "logps/chosen": -38.146095275878906, + "logps/rejected": -36.97870635986328, + "loss": 0.686, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03759458661079407, + "rewards/margins": -0.012229889631271362, + "rewards/rejected": 0.04982447624206543, + "step": 34 + }, + { + "epoch": 0.2074074074074074, + "grad_norm": 52.37961759952867, + "learning_rate": 1.2962962962962961e-07, + "logits/chosen": -1.5861988067626953, + "logits/rejected": -1.3526108264923096, + "logps/chosen": -40.725303649902344, + "logps/rejected": -53.843467712402344, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04395633935928345, + "rewards/margins": -0.035573214292526245, + "rewards/rejected": -0.008383125066757202, + "step": 35 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 52.20405363764235, + "learning_rate": 1.3333333333333334e-07, + "logits/chosen": -2.0409719944000244, + "logits/rejected": -1.8356863260269165, + "logps/chosen": -36.4237174987793, + "logps/rejected": -54.58771514892578, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.01013416051864624, + "rewards/margins": 0.023032546043395996, + "rewards/rejected": -0.012898385524749756, + "step": 36 + }, + { + "epoch": 0.21925925925925926, + "grad_norm": 49.83353701641285, + "learning_rate": 1.3703703703703703e-07, + "logits/chosen": -1.9610185623168945, + "logits/rejected": -1.8862241506576538, + "logps/chosen": -32.727813720703125, + "logps/rejected": -46.04048156738281, + "loss": 0.68, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0113593190908432, + "rewards/margins": 0.03532211482524872, + "rewards/rejected": -0.023962795734405518, + "step": 37 + }, + { + "epoch": 0.22518518518518518, + "grad_norm": 91.63451568348084, + "learning_rate": 1.4074074074074075e-07, + "logits/chosen": -2.103301525115967, + "logits/rejected": -2.1240031719207764, + "logps/chosen": -43.680843353271484, + "logps/rejected": -43.4760627746582, + "loss": 0.6831, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.02546370029449463, + "rewards/margins": 0.09499198198318481, + "rewards/rejected": -0.06952828168869019, + "step": 38 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 50.029380495348924, + "learning_rate": 1.4444444444444442e-07, + "logits/chosen": -1.7226446866989136, + "logits/rejected": -1.6855897903442383, + "logps/chosen": -37.30487060546875, + "logps/rejected": -39.31515121459961, + "loss": 0.6942, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.011799216270446777, + "rewards/margins": 0.03693026304244995, + "rewards/rejected": -0.04872947931289673, + "step": 39 + }, + { + "epoch": 0.23703703703703705, + "grad_norm": 51.79236934572246, + "learning_rate": 1.4814814814814815e-07, + "logits/chosen": -1.6022825241088867, + "logits/rejected": -1.5007712841033936, + "logps/chosen": -43.124755859375, + "logps/rejected": -60.236854553222656, + "loss": 0.7037, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03802824020385742, + "rewards/margins": -0.013065189123153687, + "rewards/rejected": -0.024963051080703735, + "step": 40 + }, + { + "epoch": 0.24296296296296296, + "grad_norm": 51.34662690579377, + "learning_rate": 1.5185185185185184e-07, + "logits/chosen": -1.7133663892745972, + "logits/rejected": -1.7276082038879395, + "logps/chosen": -44.17156219482422, + "logps/rejected": -56.69810485839844, + "loss": 0.6964, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0171157568693161, + "rewards/margins": 0.04664464294910431, + "rewards/rejected": -0.06376039981842041, + "step": 41 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 52.11523781421188, + "learning_rate": 1.5555555555555556e-07, + "logits/chosen": -1.4053561687469482, + "logits/rejected": -1.3741445541381836, + "logps/chosen": -44.58378601074219, + "logps/rejected": -50.028602600097656, + "loss": 0.679, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002070978283882141, + "rewards/margins": 0.0352911502122879, + "rewards/rejected": -0.03322017192840576, + "step": 42 + }, + { + "epoch": 0.2548148148148148, + "grad_norm": 49.32915916600631, + "learning_rate": 1.5925925925925926e-07, + "logits/chosen": -1.3316234350204468, + "logits/rejected": -1.2330265045166016, + "logps/chosen": -27.901004791259766, + "logps/rejected": -31.064517974853516, + "loss": 0.7082, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009749919176101685, + "rewards/margins": -0.010031551122665405, + "rewards/rejected": 0.01978147029876709, + "step": 43 + }, + { + "epoch": 0.2607407407407407, + "grad_norm": 56.48104490330988, + "learning_rate": 1.6296296296296298e-07, + "logits/chosen": -1.796046495437622, + "logits/rejected": -1.5567198991775513, + "logps/chosen": -32.16002655029297, + "logps/rejected": -47.925880432128906, + "loss": 0.6863, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0326349139213562, + "rewards/margins": 0.0439448356628418, + "rewards/rejected": -0.011309921741485596, + "step": 44 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 52.28508627480772, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -1.5154943466186523, + "logits/rejected": -1.4815261363983154, + "logps/chosen": -37.8524055480957, + "logps/rejected": -54.14366149902344, + "loss": 0.6693, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0024260282516479492, + "rewards/margins": 0.07158929109573364, + "rewards/rejected": -0.07401531934738159, + "step": 45 + }, + { + "epoch": 0.2725925925925926, + "grad_norm": 49.33789452340116, + "learning_rate": 1.7037037037037035e-07, + "logits/chosen": -1.1807670593261719, + "logits/rejected": -1.257714033126831, + "logps/chosen": -47.12394714355469, + "logps/rejected": -49.35055160522461, + "loss": 0.6868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018887341022491455, + "rewards/margins": 0.025706887245178223, + "rewards/rejected": -0.006819546222686768, + "step": 46 + }, + { + "epoch": 0.2785185185185185, + "grad_norm": 49.24012566537143, + "learning_rate": 1.7407407407407407e-07, + "logits/chosen": -1.6047768592834473, + "logits/rejected": -1.6896839141845703, + "logps/chosen": -55.357322692871094, + "logps/rejected": -58.79747009277344, + "loss": 0.6856, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.016460120677947998, + "rewards/margins": 0.10610747337341309, + "rewards/rejected": -0.08964735269546509, + "step": 47 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 55.92737374580854, + "learning_rate": 1.7777777777777776e-07, + "logits/chosen": -1.314180612564087, + "logits/rejected": -1.2767497301101685, + "logps/chosen": -40.34666442871094, + "logps/rejected": -48.74937438964844, + "loss": 0.6834, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0430338978767395, + "rewards/margins": -0.010425090789794922, + "rewards/rejected": -0.03260880708694458, + "step": 48 + }, + { + "epoch": 0.2903703703703704, + "grad_norm": 49.01432924086448, + "learning_rate": 1.8148148148148149e-07, + "logits/chosen": -1.3731322288513184, + "logits/rejected": -1.4195034503936768, + "logps/chosen": -34.62800598144531, + "logps/rejected": -43.32075500488281, + "loss": 0.6605, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00801500678062439, + "rewards/margins": 0.04836231470108032, + "rewards/rejected": -0.05637732148170471, + "step": 49 + }, + { + "epoch": 0.2962962962962963, + "grad_norm": 48.181313971977815, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -1.6057178974151611, + "logits/rejected": -1.5365872383117676, + "logps/chosen": -25.938697814941406, + "logps/rejected": -43.2042350769043, + "loss": 0.6415, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03604808449745178, + "rewards/margins": 0.07843932509422302, + "rewards/rejected": -0.04239124059677124, + "step": 50 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 49.376325941861374, + "learning_rate": 1.8888888888888888e-07, + "logits/chosen": -1.6562511920928955, + "logits/rejected": -1.6392525434494019, + "logps/chosen": -47.340354919433594, + "logps/rejected": -57.06756591796875, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007433861494064331, + "rewards/margins": 0.07025310397148132, + "rewards/rejected": -0.07768696546554565, + "step": 51 + }, + { + "epoch": 0.30814814814814817, + "grad_norm": 47.893215154872685, + "learning_rate": 1.9259259259259257e-07, + "logits/chosen": -1.4644725322723389, + "logits/rejected": -1.3114018440246582, + "logps/chosen": -30.613296508789062, + "logps/rejected": -42.53533172607422, + "loss": 0.6799, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.04141402244567871, + "rewards/margins": 0.0248582661151886, + "rewards/rejected": -0.06627228856086731, + "step": 52 + }, + { + "epoch": 0.31407407407407406, + "grad_norm": 48.83713343476518, + "learning_rate": 1.962962962962963e-07, + "logits/chosen": -1.1819308996200562, + "logits/rejected": -1.2800672054290771, + "logps/chosen": -32.65019989013672, + "logps/rejected": -32.47792053222656, + "loss": 0.6795, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.05936972796916962, + "rewards/margins": 0.005213424563407898, + "rewards/rejected": -0.06458315253257751, + "step": 53 + }, + { + "epoch": 0.32, + "grad_norm": 49.333664814053236, + "learning_rate": 2e-07, + "logits/chosen": -1.4133021831512451, + "logits/rejected": -1.3248672485351562, + "logps/chosen": -39.12198257446289, + "logps/rejected": -60.79273223876953, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0566544234752655, + "rewards/margins": 0.06260296702384949, + "rewards/rejected": -0.11925739049911499, + "step": 54 + }, + { + "epoch": 0.32592592592592595, + "grad_norm": 52.153743932483735, + "learning_rate": 2.0370370370370369e-07, + "logits/chosen": -1.989043951034546, + "logits/rejected": -1.7848541736602783, + "logps/chosen": -35.482582092285156, + "logps/rejected": -52.01215362548828, + "loss": 0.6858, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03922635316848755, + "rewards/margins": 0.014412403106689453, + "rewards/rejected": -0.053638756275177, + "step": 55 + }, + { + "epoch": 0.33185185185185184, + "grad_norm": 51.7443158402984, + "learning_rate": 2.0740740740740738e-07, + "logits/chosen": -1.6254875659942627, + "logits/rejected": -1.689445972442627, + "logps/chosen": -54.14313507080078, + "logps/rejected": -59.49906921386719, + "loss": 0.682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04046362638473511, + "rewards/margins": 0.05603635311126709, + "rewards/rejected": -0.0964999794960022, + "step": 56 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 50.651540848490434, + "learning_rate": 2.111111111111111e-07, + "logits/chosen": -1.3982229232788086, + "logits/rejected": -1.4037715196609497, + "logps/chosen": -43.407264709472656, + "logps/rejected": -44.814266204833984, + "loss": 0.6784, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06774739176034927, + "rewards/margins": 0.04890603572130203, + "rewards/rejected": 0.01884135603904724, + "step": 57 + }, + { + "epoch": 0.3437037037037037, + "grad_norm": 48.03187108978971, + "learning_rate": 2.148148148148148e-07, + "logits/chosen": -1.647029161453247, + "logits/rejected": -1.6315536499023438, + "logps/chosen": -39.03837585449219, + "logps/rejected": -47.67170333862305, + "loss": 0.6386, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.009533137083053589, + "rewards/margins": 0.11656391620635986, + "rewards/rejected": -0.12609705328941345, + "step": 58 + }, + { + "epoch": 0.3496296296296296, + "grad_norm": 47.820881352326055, + "learning_rate": 2.1851851851851852e-07, + "logits/chosen": -0.7058523893356323, + "logits/rejected": -0.7873870730400085, + "logps/chosen": -40.064552307128906, + "logps/rejected": -42.425132751464844, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0005445778369903564, + "rewards/margins": 0.09737929701805115, + "rewards/rejected": -0.09683471918106079, + "step": 59 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 52.192096809034254, + "learning_rate": 2.222222222222222e-07, + "logits/chosen": -1.3768231868743896, + "logits/rejected": -1.2123544216156006, + "logps/chosen": -35.475467681884766, + "logps/rejected": -55.790775299072266, + "loss": 0.6618, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012328773736953735, + "rewards/margins": 0.1337939202785492, + "rewards/rejected": -0.14612269401550293, + "step": 60 + }, + { + "epoch": 0.36148148148148146, + "grad_norm": 47.41837940906366, + "learning_rate": 2.2592592592592591e-07, + "logits/chosen": -1.6038029193878174, + "logits/rejected": -1.5517463684082031, + "logps/chosen": -31.51889419555664, + "logps/rejected": -42.58946990966797, + "loss": 0.6697, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.08359864354133606, + "rewards/margins": 0.0298750102519989, + "rewards/rejected": -0.11347365379333496, + "step": 61 + }, + { + "epoch": 0.3674074074074074, + "grad_norm": 48.790002508106205, + "learning_rate": 2.296296296296296e-07, + "logits/chosen": -1.6775469779968262, + "logits/rejected": -1.626863956451416, + "logps/chosen": -35.4007568359375, + "logps/rejected": -45.62629699707031, + "loss": 0.6594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.033218562602996826, + "rewards/margins": 0.03804764151573181, + "rewards/rejected": -0.07126620411872864, + "step": 62 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 48.66526101519147, + "learning_rate": 2.3333333333333333e-07, + "logits/chosen": -1.521484136581421, + "logits/rejected": -1.5792737007141113, + "logps/chosen": -39.2108039855957, + "logps/rejected": -46.792152404785156, + "loss": 0.6432, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.034676894545555115, + "rewards/margins": 0.20874853432178497, + "rewards/rejected": -0.17407163977622986, + "step": 63 + }, + { + "epoch": 0.37925925925925924, + "grad_norm": 50.80353184657783, + "learning_rate": 2.3703703703703703e-07, + "logits/chosen": -1.6936514377593994, + "logits/rejected": -1.5413093566894531, + "logps/chosen": -34.11452865600586, + "logps/rejected": -52.953857421875, + "loss": 0.6335, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.016986578702926636, + "rewards/margins": 0.07408609986305237, + "rewards/rejected": -0.091072678565979, + "step": 64 + }, + { + "epoch": 0.3851851851851852, + "grad_norm": 48.49871450799722, + "learning_rate": 2.407407407407407e-07, + "logits/chosen": -1.500447392463684, + "logits/rejected": -1.4313578605651855, + "logps/chosen": -38.88074493408203, + "logps/rejected": -50.81879425048828, + "loss": 0.6497, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05385851860046387, + "rewards/margins": 0.1640552282333374, + "rewards/rejected": -0.21791374683380127, + "step": 65 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 47.16678228397842, + "learning_rate": 2.4444444444444445e-07, + "logits/chosen": -1.55772066116333, + "logits/rejected": -1.4572783708572388, + "logps/chosen": -39.041255950927734, + "logps/rejected": -43.814151763916016, + "loss": 0.631, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08624696731567383, + "rewards/margins": 0.068337082862854, + "rewards/rejected": -0.15458405017852783, + "step": 66 + }, + { + "epoch": 0.397037037037037, + "grad_norm": 52.21526548282161, + "learning_rate": 2.4814814814814814e-07, + "logits/chosen": -1.925466775894165, + "logits/rejected": -1.8819047212600708, + "logps/chosen": -34.829891204833984, + "logps/rejected": -41.185691833496094, + "loss": 0.6401, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0699830949306488, + "rewards/margins": 0.05667153000831604, + "rewards/rejected": -0.12665462493896484, + "step": 67 + }, + { + "epoch": 0.40296296296296297, + "grad_norm": 46.527865628613704, + "learning_rate": 2.5185185185185184e-07, + "logits/chosen": -1.6865217685699463, + "logits/rejected": -1.4329313039779663, + "logps/chosen": -28.436674118041992, + "logps/rejected": -50.398094177246094, + "loss": 0.6594, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.03219631314277649, + "rewards/margins": 0.21057480573654175, + "rewards/rejected": -0.24277111887931824, + "step": 68 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 52.777421185335456, + "learning_rate": 2.5555555555555553e-07, + "logits/chosen": -1.4382909536361694, + "logits/rejected": -1.316765308380127, + "logps/chosen": -33.29138946533203, + "logps/rejected": -40.392578125, + "loss": 0.6415, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00170879065990448, + "rewards/margins": 0.1503458172082901, + "rewards/rejected": -0.15205460786819458, + "step": 69 + }, + { + "epoch": 0.4148148148148148, + "grad_norm": 48.73012974262115, + "learning_rate": 2.5925925925925923e-07, + "logits/chosen": -1.265784740447998, + "logits/rejected": -1.2845590114593506, + "logps/chosen": -41.45735168457031, + "logps/rejected": -47.93992614746094, + "loss": 0.6418, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.022552520036697388, + "rewards/margins": 0.18044358491897583, + "rewards/rejected": -0.15789106488227844, + "step": 70 + }, + { + "epoch": 0.42074074074074075, + "grad_norm": 47.468267237095105, + "learning_rate": 2.629629629629629e-07, + "logits/chosen": -2.099151134490967, + "logits/rejected": -1.9303994178771973, + "logps/chosen": -35.98270034790039, + "logps/rejected": -59.536529541015625, + "loss": 0.6412, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019120603799819946, + "rewards/margins": 0.0822630226612091, + "rewards/rejected": -0.10138362646102905, + "step": 71 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 44.72180715869609, + "learning_rate": 2.6666666666666667e-07, + "logits/chosen": -1.867506980895996, + "logits/rejected": -1.9733811616897583, + "logps/chosen": -45.759056091308594, + "logps/rejected": -49.942962646484375, + "loss": 0.61, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.044030994176864624, + "rewards/margins": 0.20099851489067078, + "rewards/rejected": -0.2450295090675354, + "step": 72 + }, + { + "epoch": 0.4325925925925926, + "grad_norm": 45.529088721861704, + "learning_rate": 2.7037037037037037e-07, + "logits/chosen": -1.7743897438049316, + "logits/rejected": -1.6937413215637207, + "logps/chosen": -31.361610412597656, + "logps/rejected": -46.95793914794922, + "loss": 0.6292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07771135866641998, + "rewards/margins": 0.14026887714862823, + "rewards/rejected": -0.21798023581504822, + "step": 73 + }, + { + "epoch": 0.43851851851851853, + "grad_norm": 45.588644884343736, + "learning_rate": 2.7407407407407406e-07, + "logits/chosen": -1.773651361465454, + "logits/rejected": -1.704150915145874, + "logps/chosen": -35.21525955200195, + "logps/rejected": -47.804935455322266, + "loss": 0.6131, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.09824433922767639, + "rewards/margins": 0.1910991370677948, + "rewards/rejected": -0.2893434762954712, + "step": 74 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 45.93590071651665, + "learning_rate": 2.7777777777777776e-07, + "logits/chosen": -1.6096124649047852, + "logits/rejected": -1.6175916194915771, + "logps/chosen": -39.06000900268555, + "logps/rejected": -54.238365173339844, + "loss": 0.6101, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.115651935338974, + "rewards/margins": 0.15163597464561462, + "rewards/rejected": -0.2672879099845886, + "step": 75 + }, + { + "epoch": 0.45037037037037037, + "grad_norm": 47.95259419432817, + "learning_rate": 2.814814814814815e-07, + "logits/chosen": -1.5724971294403076, + "logits/rejected": -1.5149195194244385, + "logps/chosen": -37.35981750488281, + "logps/rejected": -48.432159423828125, + "loss": 0.6448, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11759337782859802, + "rewards/margins": 0.18581095337867737, + "rewards/rejected": -0.3034043312072754, + "step": 76 + }, + { + "epoch": 0.4562962962962963, + "grad_norm": 47.09881943830815, + "learning_rate": 2.851851851851852e-07, + "logits/chosen": -1.4347920417785645, + "logits/rejected": -1.369720458984375, + "logps/chosen": -42.203067779541016, + "logps/rejected": -49.453102111816406, + "loss": 0.5943, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07791465520858765, + "rewards/margins": 0.3344114422798157, + "rewards/rejected": -0.4123260974884033, + "step": 77 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 44.2801384081521, + "learning_rate": 2.8888888888888885e-07, + "logits/chosen": -1.570386290550232, + "logits/rejected": -1.4483294486999512, + "logps/chosen": -35.98171615600586, + "logps/rejected": -49.303775787353516, + "loss": 0.5816, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.062459707260131836, + "rewards/margins": 0.3032795786857605, + "rewards/rejected": -0.36573928594589233, + "step": 78 + }, + { + "epoch": 0.46814814814814815, + "grad_norm": 43.35068917072876, + "learning_rate": 2.9259259259259254e-07, + "logits/chosen": -1.4911473989486694, + "logits/rejected": -1.4859997034072876, + "logps/chosen": -43.731712341308594, + "logps/rejected": -51.12042236328125, + "loss": 0.6045, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20656388998031616, + "rewards/margins": 0.26457029581069946, + "rewards/rejected": -0.4711341857910156, + "step": 79 + }, + { + "epoch": 0.4740740740740741, + "grad_norm": 47.72342589223719, + "learning_rate": 2.962962962962963e-07, + "logits/chosen": -1.602846384048462, + "logits/rejected": -1.3540472984313965, + "logps/chosen": -40.95463943481445, + "logps/rejected": -56.518924713134766, + "loss": 0.6044, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07146379351615906, + "rewards/margins": 0.26500311493873596, + "rewards/rejected": -0.336466908454895, + "step": 80 + }, + { + "epoch": 0.48, + "grad_norm": 44.13792564031615, + "learning_rate": 3e-07, + "logits/chosen": -1.9351346492767334, + "logits/rejected": -1.9528157711029053, + "logps/chosen": -45.20112609863281, + "logps/rejected": -47.75102233886719, + "loss": 0.5828, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1623789370059967, + "rewards/margins": 0.123433917760849, + "rewards/rejected": -0.2858128547668457, + "step": 81 + }, + { + "epoch": 0.48592592592592593, + "grad_norm": 45.1081090387189, + "learning_rate": 3.037037037037037e-07, + "logits/chosen": -1.35515296459198, + "logits/rejected": -1.2480462789535522, + "logps/chosen": -41.036842346191406, + "logps/rejected": -52.20813751220703, + "loss": 0.5904, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.16542187333106995, + "rewards/margins": 0.3270244598388672, + "rewards/rejected": -0.49244633316993713, + "step": 82 + }, + { + "epoch": 0.4918518518518519, + "grad_norm": 43.25852049728235, + "learning_rate": 3.074074074074074e-07, + "logits/chosen": -2.026505708694458, + "logits/rejected": -1.778357744216919, + "logps/chosen": -40.66118621826172, + "logps/rejected": -53.48126983642578, + "loss": 0.5743, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12560813128948212, + "rewards/margins": 0.23484648764133453, + "rewards/rejected": -0.36045461893081665, + "step": 83 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 46.12076558091472, + "learning_rate": 3.111111111111111e-07, + "logits/chosen": -1.7639347314834595, + "logits/rejected": -1.836035966873169, + "logps/chosen": -39.40026092529297, + "logps/rejected": -47.0621223449707, + "loss": 0.5967, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1895102560520172, + "rewards/margins": 0.31577515602111816, + "rewards/rejected": -0.505285382270813, + "step": 84 + }, + { + "epoch": 0.5037037037037037, + "grad_norm": 40.673959765790045, + "learning_rate": 3.148148148148148e-07, + "logits/chosen": -1.6891648769378662, + "logits/rejected": -1.4950172901153564, + "logps/chosen": -39.709754943847656, + "logps/rejected": -51.628135681152344, + "loss": 0.5301, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19943200051784515, + "rewards/margins": 0.4749172031879425, + "rewards/rejected": -0.6743491888046265, + "step": 85 + }, + { + "epoch": 0.5096296296296297, + "grad_norm": 41.51680039866243, + "learning_rate": 3.185185185185185e-07, + "logits/chosen": -1.5121097564697266, + "logits/rejected": -1.367327094078064, + "logps/chosen": -42.37049102783203, + "logps/rejected": -57.95114517211914, + "loss": 0.5417, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09324803948402405, + "rewards/margins": 0.4947849214076996, + "rewards/rejected": -0.5880329608917236, + "step": 86 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 43.23322172437616, + "learning_rate": 3.222222222222222e-07, + "logits/chosen": -1.559330701828003, + "logits/rejected": -1.2308735847473145, + "logps/chosen": -39.38884735107422, + "logps/rejected": -53.98082733154297, + "loss": 0.5726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17123621702194214, + "rewards/margins": 0.3211599588394165, + "rewards/rejected": -0.49239617586135864, + "step": 87 + }, + { + "epoch": 0.5214814814814814, + "grad_norm": 44.62237450482671, + "learning_rate": 3.2592592592592596e-07, + "logits/chosen": -1.4969669580459595, + "logits/rejected": -1.1791250705718994, + "logps/chosen": -30.64980125427246, + "logps/rejected": -47.99778747558594, + "loss": 0.586, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13224159181118011, + "rewards/margins": 0.3085125684738159, + "rewards/rejected": -0.4407541751861572, + "step": 88 + }, + { + "epoch": 0.5274074074074074, + "grad_norm": 45.82415006543612, + "learning_rate": 3.296296296296296e-07, + "logits/chosen": -1.7968785762786865, + "logits/rejected": -1.8232977390289307, + "logps/chosen": -45.06230926513672, + "logps/rejected": -49.47294235229492, + "loss": 0.5492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.27738267183303833, + "rewards/margins": 0.3488852381706238, + "rewards/rejected": -0.6262679100036621, + "step": 89 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 45.46414191234741, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.6212067604064941, + "logits/rejected": -1.634657859802246, + "logps/chosen": -43.29548645019531, + "logps/rejected": -46.447914123535156, + "loss": 0.5794, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.20608562231063843, + "rewards/margins": 0.4118134379386902, + "rewards/rejected": -0.6178990602493286, + "step": 90 + }, + { + "epoch": 0.5392592592592592, + "grad_norm": 43.39793711865867, + "learning_rate": 3.37037037037037e-07, + "logits/chosen": -1.5805771350860596, + "logits/rejected": -1.5447726249694824, + "logps/chosen": -39.61695861816406, + "logps/rejected": -47.621971130371094, + "loss": 0.5694, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19198113679885864, + "rewards/margins": 0.45119503140449524, + "rewards/rejected": -0.6431761980056763, + "step": 91 + }, + { + "epoch": 0.5451851851851852, + "grad_norm": 41.97640066399846, + "learning_rate": 3.407407407407407e-07, + "logits/chosen": -1.6754682064056396, + "logits/rejected": -1.4996973276138306, + "logps/chosen": -35.48243713378906, + "logps/rejected": -46.71401596069336, + "loss": 0.5619, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10602587461471558, + "rewards/margins": 0.4457920789718628, + "rewards/rejected": -0.5518179535865784, + "step": 92 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 44.228366108054274, + "learning_rate": 3.4444444444444444e-07, + "logits/chosen": -1.4117846488952637, + "logits/rejected": -1.4865397214889526, + "logps/chosen": -42.30179214477539, + "logps/rejected": -43.16421890258789, + "loss": 0.6, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.27544260025024414, + "rewards/margins": 0.013427436351776123, + "rewards/rejected": -0.28887003660202026, + "step": 93 + }, + { + "epoch": 0.557037037037037, + "grad_norm": 40.85074060393982, + "learning_rate": 3.4814814814814814e-07, + "logits/chosen": -1.7078298330307007, + "logits/rejected": -1.8396275043487549, + "logps/chosen": -43.45661163330078, + "logps/rejected": -36.45011520385742, + "loss": 0.524, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18786099553108215, + "rewards/margins": 0.4306405782699585, + "rewards/rejected": -0.6185015439987183, + "step": 94 + }, + { + "epoch": 0.562962962962963, + "grad_norm": 42.13291872541957, + "learning_rate": 3.5185185185185183e-07, + "logits/chosen": -1.4014463424682617, + "logits/rejected": -1.314795970916748, + "logps/chosen": -35.93153381347656, + "logps/rejected": -47.03495788574219, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1643732786178589, + "rewards/margins": 0.5755838751792908, + "rewards/rejected": -0.7399571537971497, + "step": 95 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 42.917662210162185, + "learning_rate": 3.5555555555555553e-07, + "logits/chosen": -1.393978476524353, + "logits/rejected": -1.306774377822876, + "logps/chosen": -47.09367370605469, + "logps/rejected": -61.688514709472656, + "loss": 0.5234, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.31930363178253174, + "rewards/margins": 0.44419366121292114, + "rewards/rejected": -0.7634972929954529, + "step": 96 + }, + { + "epoch": 0.5748148148148148, + "grad_norm": 41.93635144096363, + "learning_rate": 3.592592592592593e-07, + "logits/chosen": -1.4364768266677856, + "logits/rejected": -1.138240098953247, + "logps/chosen": -43.779056549072266, + "logps/rejected": -50.88738250732422, + "loss": 0.5004, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.25555330514907837, + "rewards/margins": 0.5455037355422974, + "rewards/rejected": -0.8010570406913757, + "step": 97 + }, + { + "epoch": 0.5807407407407408, + "grad_norm": 48.44364004513561, + "learning_rate": 3.6296296296296297e-07, + "logits/chosen": -1.3962883949279785, + "logits/rejected": -1.345455527305603, + "logps/chosen": -41.076805114746094, + "logps/rejected": -50.62786865234375, + "loss": 0.567, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.18818557262420654, + "rewards/margins": 0.32878220081329346, + "rewards/rejected": -0.5169677734375, + "step": 98 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 40.709285626891656, + "learning_rate": 3.666666666666666e-07, + "logits/chosen": -1.6217023134231567, + "logits/rejected": -1.455048680305481, + "logps/chosen": -27.7904052734375, + "logps/rejected": -50.464454650878906, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12327791750431061, + "rewards/margins": 0.31492501497268677, + "rewards/rejected": -0.4382029175758362, + "step": 99 + }, + { + "epoch": 0.5925925925925926, + "grad_norm": 44.49791847791134, + "learning_rate": 3.703703703703703e-07, + "logits/chosen": -1.3246082067489624, + "logits/rejected": -1.341948390007019, + "logps/chosen": -45.43359375, + "logps/rejected": -64.56365203857422, + "loss": 0.5368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2600909173488617, + "rewards/margins": 0.6141197681427002, + "rewards/rejected": -0.8742106556892395, + "step": 100 + }, + { + "epoch": 0.5985185185185186, + "grad_norm": 42.97756774543605, + "learning_rate": 3.7407407407407406e-07, + "logits/chosen": -1.2443385124206543, + "logits/rejected": -1.0570563077926636, + "logps/chosen": -36.11662673950195, + "logps/rejected": -52.12215805053711, + "loss": 0.5246, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.32490265369415283, + "rewards/margins": 0.6514842510223389, + "rewards/rejected": -0.9763869047164917, + "step": 101 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 42.33008530948347, + "learning_rate": 3.7777777777777775e-07, + "logits/chosen": -1.480468988418579, + "logits/rejected": -1.4824107885360718, + "logps/chosen": -32.58100891113281, + "logps/rejected": -38.337615966796875, + "loss": 0.5293, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.23031967878341675, + "rewards/margins": 0.4455646872520447, + "rewards/rejected": -0.6758843660354614, + "step": 102 + }, + { + "epoch": 0.6103703703703703, + "grad_norm": 37.653216263070966, + "learning_rate": 3.8148148148148145e-07, + "logits/chosen": -1.830120325088501, + "logits/rejected": -1.9020617008209229, + "logps/chosen": -35.33047866821289, + "logps/rejected": -38.83496856689453, + "loss": 0.5366, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2223891317844391, + "rewards/margins": 0.4304327070713043, + "rewards/rejected": -0.6528218388557434, + "step": 103 + }, + { + "epoch": 0.6162962962962963, + "grad_norm": 37.93219958276944, + "learning_rate": 3.8518518518518515e-07, + "logits/chosen": -1.184201717376709, + "logits/rejected": -1.1568915843963623, + "logps/chosen": -41.511451721191406, + "logps/rejected": -51.12090301513672, + "loss": 0.4447, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16868919134140015, + "rewards/margins": 1.0149188041687012, + "rewards/rejected": -1.1836079359054565, + "step": 104 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 44.46824215729115, + "learning_rate": 3.888888888888889e-07, + "logits/chosen": -1.642662525177002, + "logits/rejected": -1.373224139213562, + "logps/chosen": -37.704383850097656, + "logps/rejected": -57.575584411621094, + "loss": 0.559, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39903172850608826, + "rewards/margins": 0.5329186916351318, + "rewards/rejected": -0.9319504499435425, + "step": 105 + }, + { + "epoch": 0.6281481481481481, + "grad_norm": 40.24407413315748, + "learning_rate": 3.925925925925926e-07, + "logits/chosen": -1.6562988758087158, + "logits/rejected": -1.6213304996490479, + "logps/chosen": -42.33479309082031, + "logps/rejected": -46.03807830810547, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3010629415512085, + "rewards/margins": 0.7453230023384094, + "rewards/rejected": -1.0463860034942627, + "step": 106 + }, + { + "epoch": 0.6340740740740741, + "grad_norm": 38.972579462755846, + "learning_rate": 3.962962962962963e-07, + "logits/chosen": -1.3949552774429321, + "logits/rejected": -1.2594126462936401, + "logps/chosen": -40.719146728515625, + "logps/rejected": -60.80188751220703, + "loss": 0.441, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32348203659057617, + "rewards/margins": 0.8667979836463928, + "rewards/rejected": -1.1902799606323242, + "step": 107 + }, + { + "epoch": 0.64, + "grad_norm": 36.39536195986827, + "learning_rate": 4e-07, + "logits/chosen": -2.0106091499328613, + "logits/rejected": -1.8416554927825928, + "logps/chosen": -39.52052307128906, + "logps/rejected": -52.07545471191406, + "loss": 0.4314, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18241432309150696, + "rewards/margins": 0.9339324235916138, + "rewards/rejected": -1.116346836090088, + "step": 108 + }, + { + "epoch": 0.6459259259259259, + "grad_norm": 44.98515264402161, + "learning_rate": 4.0370370370370373e-07, + "logits/chosen": -1.7714054584503174, + "logits/rejected": -1.7249916791915894, + "logps/chosen": -39.17259216308594, + "logps/rejected": -56.0714111328125, + "loss": 0.524, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29726728796958923, + "rewards/margins": 1.1248726844787598, + "rewards/rejected": -1.4221400022506714, + "step": 109 + }, + { + "epoch": 0.6518518518518519, + "grad_norm": 44.824508106883286, + "learning_rate": 4.0740740740740737e-07, + "logits/chosen": -1.0125787258148193, + "logits/rejected": -0.9560598134994507, + "logps/chosen": -42.5981330871582, + "logps/rejected": -50.66206359863281, + "loss": 0.5808, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5798179507255554, + "rewards/margins": 0.5451233386993408, + "rewards/rejected": -1.124941349029541, + "step": 110 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 38.90156797225312, + "learning_rate": 4.1111111111111107e-07, + "logits/chosen": -1.1894886493682861, + "logits/rejected": -1.1232939958572388, + "logps/chosen": -37.06401062011719, + "logps/rejected": -47.91145324707031, + "loss": 0.4815, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3007606565952301, + "rewards/margins": 0.7585300207138062, + "rewards/rejected": -1.0592906475067139, + "step": 111 + }, + { + "epoch": 0.6637037037037037, + "grad_norm": 49.6237498585263, + "learning_rate": 4.1481481481481476e-07, + "logits/chosen": -1.2994275093078613, + "logits/rejected": -1.5018212795257568, + "logps/chosen": -55.228172302246094, + "logps/rejected": -54.287559509277344, + "loss": 0.598, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.679509162902832, + "rewards/margins": 0.5942882299423218, + "rewards/rejected": -1.2737973928451538, + "step": 112 + }, + { + "epoch": 0.6696296296296296, + "grad_norm": 43.2646193758696, + "learning_rate": 4.185185185185185e-07, + "logits/chosen": -1.492136836051941, + "logits/rejected": -1.4424383640289307, + "logps/chosen": -35.74744415283203, + "logps/rejected": -43.59991455078125, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.33821651339530945, + "rewards/margins": 0.5227227210998535, + "rewards/rejected": -0.8609392046928406, + "step": 113 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 41.14792424525068, + "learning_rate": 4.222222222222222e-07, + "logits/chosen": -1.5096982717514038, + "logits/rejected": -1.4714512825012207, + "logps/chosen": -38.27312469482422, + "logps/rejected": -44.96509552001953, + "loss": 0.4681, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4304978847503662, + "rewards/margins": 0.5347405076026917, + "rewards/rejected": -0.9652383923530579, + "step": 114 + }, + { + "epoch": 0.6814814814814815, + "grad_norm": 40.18814061966553, + "learning_rate": 4.259259259259259e-07, + "logits/chosen": -1.5355191230773926, + "logits/rejected": -1.524069905281067, + "logps/chosen": -43.97163391113281, + "logps/rejected": -55.98404312133789, + "loss": 0.4432, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2595410943031311, + "rewards/margins": 1.4614770412445068, + "rewards/rejected": -1.7210183143615723, + "step": 115 + }, + { + "epoch": 0.6874074074074074, + "grad_norm": 34.642341791275506, + "learning_rate": 4.296296296296296e-07, + "logits/chosen": -1.4036482572555542, + "logits/rejected": -1.3584758043289185, + "logps/chosen": -39.086036682128906, + "logps/rejected": -52.415191650390625, + "loss": 0.4359, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47538018226623535, + "rewards/margins": 0.8739761710166931, + "rewards/rejected": -1.3493562936782837, + "step": 116 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 35.64480707741049, + "learning_rate": 4.3333333333333335e-07, + "logits/chosen": -1.396484613418579, + "logits/rejected": -1.3210082054138184, + "logps/chosen": -44.036865234375, + "logps/rejected": -54.3887939453125, + "loss": 0.4008, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5203036069869995, + "rewards/margins": 1.411538004875183, + "rewards/rejected": -1.9318416118621826, + "step": 117 + }, + { + "epoch": 0.6992592592592592, + "grad_norm": 38.24792362559405, + "learning_rate": 4.3703703703703704e-07, + "logits/chosen": -1.3976020812988281, + "logits/rejected": -1.3027665615081787, + "logps/chosen": -34.96048355102539, + "logps/rejected": -47.282859802246094, + "loss": 0.4459, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35238882899284363, + "rewards/margins": 1.0079915523529053, + "rewards/rejected": -1.3603804111480713, + "step": 118 + }, + { + "epoch": 0.7051851851851851, + "grad_norm": 37.84290162399573, + "learning_rate": 4.4074074074074074e-07, + "logits/chosen": -1.3033220767974854, + "logits/rejected": -1.2730566263198853, + "logps/chosen": -37.476531982421875, + "logps/rejected": -48.643516540527344, + "loss": 0.4149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3667222857475281, + "rewards/margins": 1.1727958917617798, + "rewards/rejected": -1.5395182371139526, + "step": 119 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 37.878664188795796, + "learning_rate": 4.444444444444444e-07, + "logits/chosen": -0.9400476217269897, + "logits/rejected": -0.7184149622917175, + "logps/chosen": -31.205936431884766, + "logps/rejected": -47.984012603759766, + "loss": 0.4405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4675554037094116, + "rewards/margins": 0.7965797781944275, + "rewards/rejected": -1.2641352415084839, + "step": 120 + }, + { + "epoch": 0.717037037037037, + "grad_norm": 40.88887819751045, + "learning_rate": 4.4814814814814813e-07, + "logits/chosen": -1.7406163215637207, + "logits/rejected": -1.7047609090805054, + "logps/chosen": -42.533084869384766, + "logps/rejected": -57.01613998413086, + "loss": 0.4841, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7570239305496216, + "rewards/margins": 0.736762523651123, + "rewards/rejected": -1.493786334991455, + "step": 121 + }, + { + "epoch": 0.7229629629629629, + "grad_norm": 35.35115029467827, + "learning_rate": 4.5185185185185183e-07, + "logits/chosen": -1.2737648487091064, + "logits/rejected": -1.1840261220932007, + "logps/chosen": -37.37749099731445, + "logps/rejected": -52.0399055480957, + "loss": 0.4191, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6177048683166504, + "rewards/margins": 1.4750514030456543, + "rewards/rejected": -2.0927562713623047, + "step": 122 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 38.5026450550663, + "learning_rate": 4.555555555555555e-07, + "logits/chosen": -1.5902702808380127, + "logits/rejected": -1.5567035675048828, + "logps/chosen": -44.0450553894043, + "logps/rejected": -52.94071960449219, + "loss": 0.4411, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4093371033668518, + "rewards/margins": 1.0345042943954468, + "rewards/rejected": -1.4438413381576538, + "step": 123 + }, + { + "epoch": 0.7348148148148148, + "grad_norm": 30.293538787855763, + "learning_rate": 4.592592592592592e-07, + "logits/chosen": -1.4934203624725342, + "logits/rejected": -1.3282454013824463, + "logps/chosen": -39.16450500488281, + "logps/rejected": -56.53828048706055, + "loss": 0.3579, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35691025853157043, + "rewards/margins": 1.0708496570587158, + "rewards/rejected": -1.4277598857879639, + "step": 124 + }, + { + "epoch": 0.7407407407407407, + "grad_norm": 47.64124797635846, + "learning_rate": 4.6296296296296297e-07, + "logits/chosen": -1.4489716291427612, + "logits/rejected": -1.516164779663086, + "logps/chosen": -49.20244598388672, + "logps/rejected": -52.79627990722656, + "loss": 0.5055, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8552089929580688, + "rewards/margins": 0.5824382305145264, + "rewards/rejected": -1.4376472234725952, + "step": 125 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 40.037576352632705, + "learning_rate": 4.6666666666666666e-07, + "logits/chosen": -1.3147563934326172, + "logits/rejected": -1.3571640253067017, + "logps/chosen": -44.24326705932617, + "logps/rejected": -52.24501037597656, + "loss": 0.4446, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7837151885032654, + "rewards/margins": 1.0028468370437622, + "rewards/rejected": -1.7865620851516724, + "step": 126 + }, + { + "epoch": 0.7525925925925926, + "grad_norm": 33.59225363588131, + "learning_rate": 4.7037037037037036e-07, + "logits/chosen": -1.8176665306091309, + "logits/rejected": -1.6222167015075684, + "logps/chosen": -28.441362380981445, + "logps/rejected": -46.46739959716797, + "loss": 0.3895, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18460707366466522, + "rewards/margins": 1.4571757316589355, + "rewards/rejected": -1.6417827606201172, + "step": 127 + }, + { + "epoch": 0.7585185185185185, + "grad_norm": 39.646954284486725, + "learning_rate": 4.7407407407407405e-07, + "logits/chosen": -1.2093782424926758, + "logits/rejected": -1.2084648609161377, + "logps/chosen": -38.774654388427734, + "logps/rejected": -52.121620178222656, + "loss": 0.4106, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5091949701309204, + "rewards/margins": 1.171380877494812, + "rewards/rejected": -1.6805758476257324, + "step": 128 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 38.70377586651715, + "learning_rate": 4.777777777777778e-07, + "logits/chosen": -1.777665376663208, + "logits/rejected": -1.5932849645614624, + "logps/chosen": -43.61308288574219, + "logps/rejected": -53.84217834472656, + "loss": 0.4068, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3957401514053345, + "rewards/margins": 1.026474118232727, + "rewards/rejected": -1.4222142696380615, + "step": 129 + }, + { + "epoch": 0.7703703703703704, + "grad_norm": 41.30978823426115, + "learning_rate": 4.814814814814814e-07, + "logits/chosen": -1.340857982635498, + "logits/rejected": -1.3845839500427246, + "logps/chosen": -41.08652114868164, + "logps/rejected": -50.04093551635742, + "loss": 0.4386, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6982749700546265, + "rewards/margins": 0.5795612931251526, + "rewards/rejected": -1.2778363227844238, + "step": 130 + }, + { + "epoch": 0.7762962962962963, + "grad_norm": 46.879219229092406, + "learning_rate": 4.851851851851852e-07, + "logits/chosen": -1.7075390815734863, + "logits/rejected": -1.839177131652832, + "logps/chosen": -52.101966857910156, + "logps/rejected": -49.733314514160156, + "loss": 0.4896, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.981723427772522, + "rewards/margins": 0.5867584943771362, + "rewards/rejected": -1.5684819221496582, + "step": 131 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 35.95083543821181, + "learning_rate": 4.888888888888889e-07, + "logits/chosen": -2.1524219512939453, + "logits/rejected": -2.247002601623535, + "logps/chosen": -46.40180206298828, + "logps/rejected": -46.5478515625, + "loss": 0.4148, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5037150979042053, + "rewards/margins": 1.0674867630004883, + "rewards/rejected": -1.5712018013000488, + "step": 132 + }, + { + "epoch": 0.7881481481481482, + "grad_norm": 32.260601736979446, + "learning_rate": 4.925925925925926e-07, + "logits/chosen": -1.435699701309204, + "logits/rejected": -1.4596015214920044, + "logps/chosen": -41.34964370727539, + "logps/rejected": -56.690032958984375, + "loss": 0.3289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5615026950836182, + "rewards/margins": 1.8352725505828857, + "rewards/rejected": -2.396775245666504, + "step": 133 + }, + { + "epoch": 0.794074074074074, + "grad_norm": 41.73394354567568, + "learning_rate": 4.962962962962963e-07, + "logits/chosen": -0.9744957089424133, + "logits/rejected": -0.8672415018081665, + "logps/chosen": -33.1612663269043, + "logps/rejected": -42.50749206542969, + "loss": 0.4965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4336245059967041, + "rewards/margins": 1.558875560760498, + "rewards/rejected": -1.9925000667572021, + "step": 134 + }, + { + "epoch": 0.8, + "grad_norm": 38.10147271008432, + "learning_rate": 5e-07, + "logits/chosen": -1.3705247640609741, + "logits/rejected": -1.5914721488952637, + "logps/chosen": -47.50928497314453, + "logps/rejected": -50.321815490722656, + "loss": 0.4227, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7019506096839905, + "rewards/margins": 1.1958553791046143, + "rewards/rejected": -1.89780592918396, + "step": 135 + }, + { + "epoch": 0.8059259259259259, + "grad_norm": 50.33738386164938, + "learning_rate": 4.999991559718872e-07, + "logits/chosen": -1.0675755739212036, + "logits/rejected": -0.9794799089431763, + "logps/chosen": -47.514888763427734, + "logps/rejected": -66.84017944335938, + "loss": 0.499, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6851686239242554, + "rewards/margins": 1.2223974466323853, + "rewards/rejected": -1.9075660705566406, + "step": 136 + }, + { + "epoch": 0.8118518518518518, + "grad_norm": 45.70334700227829, + "learning_rate": 4.999966238932478e-07, + "logits/chosen": -1.2879647016525269, + "logits/rejected": -1.3816440105438232, + "logps/chosen": -47.61471176147461, + "logps/rejected": -46.95405197143555, + "loss": 0.4767, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6745498180389404, + "rewards/margins": 0.8887066841125488, + "rewards/rejected": -1.5632563829421997, + "step": 137 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 39.634286154920076, + "learning_rate": 4.999924037811792e-07, + "logits/chosen": -1.9803012609481812, + "logits/rejected": -1.9392635822296143, + "logps/chosen": -45.58927917480469, + "logps/rejected": -66.4295883178711, + "loss": 0.4753, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5923961400985718, + "rewards/margins": 1.6612188816070557, + "rewards/rejected": -2.253614902496338, + "step": 138 + }, + { + "epoch": 0.8237037037037037, + "grad_norm": 37.260534789973065, + "learning_rate": 4.999864956641761e-07, + "logits/chosen": -1.2599807977676392, + "logits/rejected": -1.1629146337509155, + "logps/chosen": -32.816200256347656, + "logps/rejected": -40.76869583129883, + "loss": 0.4061, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4357667863368988, + "rewards/margins": 1.3744035959243774, + "rewards/rejected": -1.810170292854309, + "step": 139 + }, + { + "epoch": 0.8296296296296296, + "grad_norm": 40.00541881208066, + "learning_rate": 4.99978899582132e-07, + "logits/chosen": -1.776047945022583, + "logits/rejected": -1.7421143054962158, + "logps/chosen": -41.147705078125, + "logps/rejected": -52.4171142578125, + "loss": 0.4588, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5953139066696167, + "rewards/margins": 1.7757620811462402, + "rewards/rejected": -2.3710761070251465, + "step": 140 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 42.7543024702637, + "learning_rate": 4.999696155863368e-07, + "logits/chosen": -1.578580379486084, + "logits/rejected": -1.4482641220092773, + "logps/chosen": -31.513076782226562, + "logps/rejected": -43.173744201660156, + "loss": 0.5108, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.33485445380210876, + "rewards/margins": 1.1695399284362793, + "rewards/rejected": -1.5043944120407104, + "step": 141 + }, + { + "epoch": 0.8414814814814815, + "grad_norm": 38.96998757756861, + "learning_rate": 4.999586437394786e-07, + "logits/chosen": -1.481893539428711, + "logits/rejected": -1.5493358373641968, + "logps/chosen": -40.78139877319336, + "logps/rejected": -48.02816390991211, + "loss": 0.4458, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.35072338581085205, + "rewards/margins": 1.8705483675003052, + "rewards/rejected": -2.2212717533111572, + "step": 142 + }, + { + "epoch": 0.8474074074074074, + "grad_norm": 37.872628732664225, + "learning_rate": 4.999459841156414e-07, + "logits/chosen": -1.2435798645019531, + "logits/rejected": -1.1472171545028687, + "logps/chosen": -31.227603912353516, + "logps/rejected": -40.37028503417969, + "loss": 0.4717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36621564626693726, + "rewards/margins": 1.2962467670440674, + "rewards/rejected": -1.6624623537063599, + "step": 143 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 38.01721990825646, + "learning_rate": 4.999316368003061e-07, + "logits/chosen": -1.5544506311416626, + "logits/rejected": -1.362791657447815, + "logps/chosen": -51.45790100097656, + "logps/rejected": -60.38456726074219, + "loss": 0.4668, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9039173126220703, + "rewards/margins": 1.306924819946289, + "rewards/rejected": -2.2108421325683594, + "step": 144 + }, + { + "epoch": 0.8592592592592593, + "grad_norm": 47.57106677206415, + "learning_rate": 4.999156018903489e-07, + "logits/chosen": -1.6596348285675049, + "logits/rejected": -1.6843326091766357, + "logps/chosen": -47.816925048828125, + "logps/rejected": -50.309932708740234, + "loss": 0.5428, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5581059455871582, + "rewards/margins": 0.2512357831001282, + "rewards/rejected": -0.8093417286872864, + "step": 145 + }, + { + "epoch": 0.8651851851851852, + "grad_norm": 35.787352860041, + "learning_rate": 4.998978794940411e-07, + "logits/chosen": -1.1359593868255615, + "logits/rejected": -1.2077702283859253, + "logps/chosen": -47.62373733520508, + "logps/rejected": -45.07956314086914, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.30160394310951233, + "rewards/margins": 1.6038706302642822, + "rewards/rejected": -1.9054745435714722, + "step": 146 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 42.18929063216021, + "learning_rate": 4.998784697310482e-07, + "logits/chosen": -1.1696794033050537, + "logits/rejected": -1.4276272058486938, + "logps/chosen": -49.076904296875, + "logps/rejected": -47.925071716308594, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44560256600379944, + "rewards/margins": 1.376593828201294, + "rewards/rejected": -1.822196364402771, + "step": 147 + }, + { + "epoch": 0.8770370370370371, + "grad_norm": 36.04716697331485, + "learning_rate": 4.998573727324294e-07, + "logits/chosen": -1.4678583145141602, + "logits/rejected": -1.1020907163619995, + "logps/chosen": -39.27702331542969, + "logps/rejected": -70.841552734375, + "loss": 0.4088, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4344024658203125, + "rewards/margins": 2.001018524169922, + "rewards/rejected": -2.4354209899902344, + "step": 148 + }, + { + "epoch": 0.882962962962963, + "grad_norm": 34.61517859729992, + "learning_rate": 4.998345886406365e-07, + "logits/chosen": -1.5456968545913696, + "logits/rejected": -1.5202821493148804, + "logps/chosen": -36.613433837890625, + "logps/rejected": -42.71227264404297, + "loss": 0.4214, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.25451546907424927, + "rewards/margins": 0.7433053255081177, + "rewards/rejected": -0.9978208541870117, + "step": 149 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 37.1698250651998, + "learning_rate": 4.998101176095128e-07, + "logits/chosen": -1.3398644924163818, + "logits/rejected": -1.3869026899337769, + "logps/chosen": -40.0852165222168, + "logps/rejected": -55.97785186767578, + "loss": 0.4379, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5003520846366882, + "rewards/margins": 1.8009438514709473, + "rewards/rejected": -2.3012959957122803, + "step": 150 + }, + { + "epoch": 0.8948148148148148, + "grad_norm": 36.86123772415135, + "learning_rate": 4.997839598042919e-07, + "logits/chosen": -2.002145290374756, + "logits/rejected": -1.9517096281051636, + "logps/chosen": -41.4112663269043, + "logps/rejected": -51.83033752441406, + "loss": 0.3932, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4211091101169586, + "rewards/margins": 1.41811203956604, + "rewards/rejected": -1.8392211198806763, + "step": 151 + }, + { + "epoch": 0.9007407407407407, + "grad_norm": 44.396493964573885, + "learning_rate": 4.997561154015975e-07, + "logits/chosen": -1.787272334098816, + "logits/rejected": -1.758475422859192, + "logps/chosen": -36.319149017333984, + "logps/rejected": -44.939476013183594, + "loss": 0.5036, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45524048805236816, + "rewards/margins": 0.7919211387634277, + "rewards/rejected": -1.247161626815796, + "step": 152 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 36.380787063292495, + "learning_rate": 4.997265845894411e-07, + "logits/chosen": -1.5731865167617798, + "logits/rejected": -1.6401747465133667, + "logps/chosen": -48.35523986816406, + "logps/rejected": -41.2987174987793, + "loss": 0.3815, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.23987555503845215, + "rewards/margins": 1.1017296314239502, + "rewards/rejected": -1.3416051864624023, + "step": 153 + }, + { + "epoch": 0.9125925925925926, + "grad_norm": 39.813495349904564, + "learning_rate": 4.996953675672213e-07, + "logits/chosen": -1.2417954206466675, + "logits/rejected": -1.1722739934921265, + "logps/chosen": -39.34113693237305, + "logps/rejected": -47.16643524169922, + "loss": 0.4529, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45460134744644165, + "rewards/margins": 0.8995037078857422, + "rewards/rejected": -1.354104995727539, + "step": 154 + }, + { + "epoch": 0.9185185185185185, + "grad_norm": 33.59176857625763, + "learning_rate": 4.996624645457227e-07, + "logits/chosen": -1.565160870552063, + "logits/rejected": -1.8368042707443237, + "logps/chosen": -48.77574157714844, + "logps/rejected": -49.07792663574219, + "loss": 0.3968, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.27005040645599365, + "rewards/margins": 1.148226261138916, + "rewards/rejected": -1.4182766675949097, + "step": 155 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 32.73969782640533, + "learning_rate": 4.996278757471138e-07, + "logits/chosen": -1.9420874118804932, + "logits/rejected": -1.9092347621917725, + "logps/chosen": -36.45257568359375, + "logps/rejected": -46.10245895385742, + "loss": 0.3288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2543891966342926, + "rewards/margins": 2.042929172515869, + "rewards/rejected": -2.297318458557129, + "step": 156 + }, + { + "epoch": 0.9303703703703704, + "grad_norm": 37.40978956784082, + "learning_rate": 4.995916014049461e-07, + "logits/chosen": -1.2158647775650024, + "logits/rejected": -1.3647968769073486, + "logps/chosen": -55.8748779296875, + "logps/rejected": -57.6270751953125, + "loss": 0.4138, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7079750895500183, + "rewards/margins": 0.7428067922592163, + "rewards/rejected": -1.4507818222045898, + "step": 157 + }, + { + "epoch": 0.9362962962962963, + "grad_norm": 42.46766006074291, + "learning_rate": 4.995536417641517e-07, + "logits/chosen": -1.6255074739456177, + "logits/rejected": -1.534214973449707, + "logps/chosen": -37.175628662109375, + "logps/rejected": -48.801025390625, + "loss": 0.4694, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2790091633796692, + "rewards/margins": 1.2557103633880615, + "rewards/rejected": -1.534719467163086, + "step": 158 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 35.427791373582686, + "learning_rate": 4.99513997081043e-07, + "logits/chosen": -1.4098460674285889, + "logits/rejected": -1.2685893774032593, + "logps/chosen": -38.03111267089844, + "logps/rejected": -54.027000427246094, + "loss": 0.3986, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5155774354934692, + "rewards/margins": 1.0906703472137451, + "rewards/rejected": -1.606247901916504, + "step": 159 + }, + { + "epoch": 0.9481481481481482, + "grad_norm": 39.60980028499697, + "learning_rate": 4.994726676233097e-07, + "logits/chosen": -0.8466954231262207, + "logits/rejected": -0.7026901245117188, + "logps/chosen": -49.040225982666016, + "logps/rejected": -59.15562438964844, + "loss": 0.3939, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6523393392562866, + "rewards/margins": 0.9394184350967407, + "rewards/rejected": -1.5917577743530273, + "step": 160 + }, + { + "epoch": 0.9540740740740741, + "grad_norm": 42.71341893793309, + "learning_rate": 4.994296536700177e-07, + "logits/chosen": -1.6049985885620117, + "logits/rejected": -1.6918436288833618, + "logps/chosen": -47.31741714477539, + "logps/rejected": -60.05731964111328, + "loss": 0.4019, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4310283064842224, + "rewards/margins": 1.8012323379516602, + "rewards/rejected": -2.2322607040405273, + "step": 161 + }, + { + "epoch": 0.96, + "grad_norm": 35.9928869034378, + "learning_rate": 4.993849555116066e-07, + "logits/chosen": -1.5866928100585938, + "logits/rejected": -1.4496181011199951, + "logps/chosen": -29.258974075317383, + "logps/rejected": -39.82886505126953, + "loss": 0.3986, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02396818995475769, + "rewards/margins": 0.8717716932296753, + "rewards/rejected": -0.8957399129867554, + "step": 162 + }, + { + "epoch": 0.965925925925926, + "grad_norm": 25.847460352007364, + "learning_rate": 4.993385734498887e-07, + "logits/chosen": -1.755631446838379, + "logits/rejected": -1.581370234489441, + "logps/chosen": -35.742820739746094, + "logps/rejected": -57.367156982421875, + "loss": 0.2876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23074620962142944, + "rewards/margins": 2.2658233642578125, + "rewards/rejected": -2.4965696334838867, + "step": 163 + }, + { + "epoch": 0.9718518518518519, + "grad_norm": 42.879579981836365, + "learning_rate": 4.992905077980461e-07, + "logits/chosen": -1.7979357242584229, + "logits/rejected": -1.8016386032104492, + "logps/chosen": -48.4314079284668, + "logps/rejected": -55.05060577392578, + "loss": 0.4501, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.49184557795524597, + "rewards/margins": 1.798371434211731, + "rewards/rejected": -2.2902169227600098, + "step": 164 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 40.09049915958416, + "learning_rate": 4.992407588806287e-07, + "logits/chosen": -1.562727451324463, + "logits/rejected": -1.552638292312622, + "logps/chosen": -36.466468811035156, + "logps/rejected": -49.57290267944336, + "loss": 0.4016, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.43339627981185913, + "rewards/margins": 1.6709353923797607, + "rewards/rejected": -2.1043317317962646, + "step": 165 + }, + { + "epoch": 0.9837037037037037, + "grad_norm": 33.4817708291202, + "learning_rate": 4.991893270335525e-07, + "logits/chosen": -1.164905309677124, + "logits/rejected": -1.1139239072799683, + "logps/chosen": -31.748462677001953, + "logps/rejected": -54.49275588989258, + "loss": 0.3344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.49971821904182434, + "rewards/margins": 1.8249340057373047, + "rewards/rejected": -2.3246521949768066, + "step": 166 + }, + { + "epoch": 0.9896296296296296, + "grad_norm": 36.48838262716427, + "learning_rate": 4.991362126040969e-07, + "logits/chosen": -1.425585150718689, + "logits/rejected": -1.3379764556884766, + "logps/chosen": -32.85935974121094, + "logps/rejected": -50.20701217651367, + "loss": 0.4296, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.024861067533493042, + "rewards/margins": 0.9732530117034912, + "rewards/rejected": -0.9483919143676758, + "step": 167 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 34.12565710662784, + "learning_rate": 4.990814159509024e-07, + "logits/chosen": -1.4324623346328735, + "logits/rejected": -1.4954330921173096, + "logps/chosen": -42.89631652832031, + "logps/rejected": -38.344970703125, + "loss": 0.3861, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4475001394748688, + "rewards/margins": 1.3485389947891235, + "rewards/rejected": -1.7960389852523804, + "step": 168 + }, + { + "epoch": 1.0014814814814814, + "grad_norm": 38.32760108321501, + "learning_rate": 4.990249374439684e-07, + "logits/chosen": -1.5158495903015137, + "logits/rejected": -1.534022331237793, + "logps/chosen": -31.372514724731445, + "logps/rejected": -46.01219177246094, + "loss": 0.3269, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012081414461135864, + "rewards/margins": 1.969221830368042, + "rewards/rejected": -1.9813032150268555, + "step": 169 + }, + { + "epoch": 1.0074074074074073, + "grad_norm": 34.96121150467421, + "learning_rate": 4.989667774646505e-07, + "logits/chosen": -1.1385910511016846, + "logits/rejected": -1.3404728174209595, + "logps/chosen": -46.99889373779297, + "logps/rejected": -44.828121185302734, + "loss": 0.381, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.45912593603134155, + "rewards/margins": 1.4480329751968384, + "rewards/rejected": -1.9071589708328247, + "step": 170 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 38.09115794530134, + "learning_rate": 4.989069364056579e-07, + "logits/chosen": -1.1056678295135498, + "logits/rejected": -1.3960037231445312, + "logps/chosen": -42.210548400878906, + "logps/rejected": -36.17823791503906, + "loss": 0.3765, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5556241273880005, + "rewards/margins": 1.0674997568130493, + "rewards/rejected": -1.6231238842010498, + "step": 171 + }, + { + "epoch": 1.0192592592592593, + "grad_norm": 25.250747618417908, + "learning_rate": 4.98845414671051e-07, + "logits/chosen": -1.8973287343978882, + "logits/rejected": -1.8729844093322754, + "logps/chosen": -38.14885711669922, + "logps/rejected": -49.52397155761719, + "loss": 0.2812, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.33738794922828674, + "rewards/margins": 1.8423973321914673, + "rewards/rejected": -2.1797852516174316, + "step": 172 + }, + { + "epoch": 1.0251851851851852, + "grad_norm": 24.64598403479892, + "learning_rate": 4.987822126762382e-07, + "logits/chosen": -1.416272521018982, + "logits/rejected": -1.329006552696228, + "logps/chosen": -43.28013610839844, + "logps/rejected": -55.639705657958984, + "loss": 0.2011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20702561736106873, + "rewards/margins": 2.411742687225342, + "rewards/rejected": -2.6187682151794434, + "step": 173 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 28.853682463289353, + "learning_rate": 4.987173308479737e-07, + "logits/chosen": -1.2722222805023193, + "logits/rejected": -1.28517484664917, + "logps/chosen": -39.79129409790039, + "logps/rejected": -56.42878723144531, + "loss": 0.2956, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14098718762397766, + "rewards/margins": 2.1605212688446045, + "rewards/rejected": -2.3015084266662598, + "step": 174 + }, + { + "epoch": 1.037037037037037, + "grad_norm": 31.102293680811048, + "learning_rate": 4.986507696243543e-07, + "logits/chosen": -1.4579617977142334, + "logits/rejected": -1.309941291809082, + "logps/chosen": -37.566322326660156, + "logps/rejected": -54.842105865478516, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3267005681991577, + "rewards/margins": 2.0936899185180664, + "rewards/rejected": -2.4203903675079346, + "step": 175 + }, + { + "epoch": 1.0429629629629629, + "grad_norm": 33.13459931935171, + "learning_rate": 4.985825294548162e-07, + "logits/chosen": -1.4088029861450195, + "logits/rejected": -1.4467169046401978, + "logps/chosen": -47.83537292480469, + "logps/rejected": -52.668922424316406, + "loss": 0.3269, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2606503367424011, + "rewards/margins": 2.440474033355713, + "rewards/rejected": -2.7011241912841797, + "step": 176 + }, + { + "epoch": 1.048888888888889, + "grad_norm": 35.121879505889765, + "learning_rate": 4.985126108001323e-07, + "logits/chosen": -1.266465425491333, + "logits/rejected": -1.1826586723327637, + "logps/chosen": -39.725948333740234, + "logps/rejected": -57.177398681640625, + "loss": 0.3317, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37341824173927307, + "rewards/margins": 2.5008201599121094, + "rewards/rejected": -2.8742384910583496, + "step": 177 + }, + { + "epoch": 1.0548148148148149, + "grad_norm": 37.244016043977574, + "learning_rate": 4.984410141324092e-07, + "logits/chosen": -2.1158571243286133, + "logits/rejected": -1.9198758602142334, + "logps/chosen": -43.933040618896484, + "logps/rejected": -52.21183395385742, + "loss": 0.3642, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28503745794296265, + "rewards/margins": 1.6764382123947144, + "rewards/rejected": -1.9614756107330322, + "step": 178 + }, + { + "epoch": 1.0607407407407408, + "grad_norm": 27.827943454765062, + "learning_rate": 4.983677399350838e-07, + "logits/chosen": -1.3472518920898438, + "logits/rejected": -1.069368600845337, + "logps/chosen": -37.077884674072266, + "logps/rejected": -61.32925033569336, + "loss": 0.2883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5079047679901123, + "rewards/margins": 2.298504590988159, + "rewards/rejected": -2.8064093589782715, + "step": 179 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 35.482747269625385, + "learning_rate": 4.982927887029197e-07, + "logits/chosen": -1.5738056898117065, + "logits/rejected": -1.5168110132217407, + "logps/chosen": -40.75530242919922, + "logps/rejected": -53.785465240478516, + "loss": 0.3638, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.196524977684021, + "rewards/margins": 1.446781039237976, + "rewards/rejected": -1.643306016921997, + "step": 180 + }, + { + "epoch": 1.0725925925925925, + "grad_norm": 33.393710024973245, + "learning_rate": 4.982161609420047e-07, + "logits/chosen": -1.8916293382644653, + "logits/rejected": -1.541512131690979, + "logps/chosen": -39.467071533203125, + "logps/rejected": -70.18623352050781, + "loss": 0.28, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2709190845489502, + "rewards/margins": 2.419748306274414, + "rewards/rejected": -2.6906673908233643, + "step": 181 + }, + { + "epoch": 1.0785185185185184, + "grad_norm": 27.059206764761488, + "learning_rate": 4.981378571697466e-07, + "logits/chosen": -1.5622146129608154, + "logits/rejected": -1.5354870557785034, + "logps/chosen": -38.11195755004883, + "logps/rejected": -46.43415451049805, + "loss": 0.3158, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3464301526546478, + "rewards/margins": 1.3460371494293213, + "rewards/rejected": -1.692467212677002, + "step": 182 + }, + { + "epoch": 1.0844444444444445, + "grad_norm": 37.99289989125244, + "learning_rate": 4.980578779148702e-07, + "logits/chosen": -1.675439476966858, + "logits/rejected": -1.599168062210083, + "logps/chosen": -35.517608642578125, + "logps/rejected": -49.9504508972168, + "loss": 0.3525, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1147196888923645, + "rewards/margins": 2.2086353302001953, + "rewards/rejected": -2.0939157009124756, + "step": 183 + }, + { + "epoch": 1.0903703703703704, + "grad_norm": 29.913772956316752, + "learning_rate": 4.979762237174131e-07, + "logits/chosen": -2.1450929641723633, + "logits/rejected": -2.026754856109619, + "logps/chosen": -36.079044342041016, + "logps/rejected": -56.236167907714844, + "loss": 0.2973, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17820218205451965, + "rewards/margins": 1.8444724082946777, + "rewards/rejected": -2.022674560546875, + "step": 184 + }, + { + "epoch": 1.0962962962962963, + "grad_norm": 30.99605878821647, + "learning_rate": 4.978928951287232e-07, + "logits/chosen": -1.5684118270874023, + "logits/rejected": -1.4883699417114258, + "logps/chosen": -51.95133972167969, + "logps/rejected": -68.82379150390625, + "loss": 0.275, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3810669183731079, + "rewards/margins": 2.429225444793701, + "rewards/rejected": -2.8102922439575195, + "step": 185 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 27.655362320953017, + "learning_rate": 4.978078927114535e-07, + "logits/chosen": -1.1995656490325928, + "logits/rejected": -1.1525938510894775, + "logps/chosen": -31.533584594726562, + "logps/rejected": -43.609130859375, + "loss": 0.2864, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19067591428756714, + "rewards/margins": 1.482248306274414, + "rewards/rejected": -1.672924280166626, + "step": 186 + }, + { + "epoch": 1.108148148148148, + "grad_norm": 34.748921344170064, + "learning_rate": 4.977212170395597e-07, + "logits/chosen": -1.595931887626648, + "logits/rejected": -1.460189938545227, + "logps/chosen": -41.28904724121094, + "logps/rejected": -53.500274658203125, + "loss": 0.3198, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2814917266368866, + "rewards/margins": 1.909599781036377, + "rewards/rejected": -2.191091537475586, + "step": 187 + }, + { + "epoch": 1.114074074074074, + "grad_norm": 33.03290755090235, + "learning_rate": 4.976328686982954e-07, + "logits/chosen": -1.4642574787139893, + "logits/rejected": -1.4906011819839478, + "logps/chosen": -36.64849090576172, + "logps/rejected": -45.48463439941406, + "loss": 0.3067, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2857409417629242, + "rewards/margins": 1.9576942920684814, + "rewards/rejected": -2.2434353828430176, + "step": 188 + }, + { + "epoch": 1.12, + "grad_norm": 29.67228463711039, + "learning_rate": 4.975428482842082e-07, + "logits/chosen": -2.017104148864746, + "logits/rejected": -1.6186161041259766, + "logps/chosen": -37.26373291015625, + "logps/rejected": -61.593345642089844, + "loss": 0.3102, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1307264268398285, + "rewards/margins": 1.523173213005066, + "rewards/rejected": -1.6538997888565063, + "step": 189 + }, + { + "epoch": 1.125925925925926, + "grad_norm": 26.614271321646097, + "learning_rate": 4.974511564051367e-07, + "logits/chosen": -1.2072570323944092, + "logits/rejected": -1.2738533020019531, + "logps/chosen": -35.82682800292969, + "logps/rejected": -47.37385940551758, + "loss": 0.2503, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11302122473716736, + "rewards/margins": 1.6395896673202515, + "rewards/rejected": -1.7526109218597412, + "step": 190 + }, + { + "epoch": 1.1318518518518519, + "grad_norm": 35.09137385840197, + "learning_rate": 4.973577936802046e-07, + "logits/chosen": -1.2088086605072021, + "logits/rejected": -1.0792959928512573, + "logps/chosen": -40.28418731689453, + "logps/rejected": -49.54509735107422, + "loss": 0.3661, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4909687936306, + "rewards/margins": 1.4177656173706055, + "rewards/rejected": -1.9087340831756592, + "step": 191 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 30.378844905181477, + "learning_rate": 4.972627607398182e-07, + "logits/chosen": -1.9133532047271729, + "logits/rejected": -1.7833731174468994, + "logps/chosen": -48.8740348815918, + "logps/rejected": -58.93412780761719, + "loss": 0.3391, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2783634662628174, + "rewards/margins": 1.4361963272094727, + "rewards/rejected": -1.71455979347229, + "step": 192 + }, + { + "epoch": 1.1437037037037037, + "grad_norm": 32.45747667996871, + "learning_rate": 4.971660582256614e-07, + "logits/chosen": -1.4115142822265625, + "logits/rejected": -1.4700024127960205, + "logps/chosen": -36.24808120727539, + "logps/rejected": -39.952327728271484, + "loss": 0.2917, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4845938980579376, + "rewards/margins": 1.4569411277770996, + "rewards/rejected": -1.9415351152420044, + "step": 193 + }, + { + "epoch": 1.1496296296296296, + "grad_norm": 33.14067947652267, + "learning_rate": 4.970676867906911e-07, + "logits/chosen": -1.3633407354354858, + "logits/rejected": -1.257892370223999, + "logps/chosen": -42.489227294921875, + "logps/rejected": -60.33306121826172, + "loss": 0.2577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31740912795066833, + "rewards/margins": 2.823535680770874, + "rewards/rejected": -3.140944719314575, + "step": 194 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 22.44088258779626, + "learning_rate": 4.969676470991335e-07, + "logits/chosen": -1.6855206489562988, + "logits/rejected": -1.5853474140167236, + "logps/chosen": -40.9166259765625, + "logps/rejected": -58.58363342285156, + "loss": 0.2222, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3480229079723358, + "rewards/margins": 2.3396825790405273, + "rewards/rejected": -2.6877055168151855, + "step": 195 + }, + { + "epoch": 1.1614814814814816, + "grad_norm": 30.19027524330909, + "learning_rate": 4.96865939826479e-07, + "logits/chosen": -1.4181514978408813, + "logits/rejected": -1.352044701576233, + "logps/chosen": -47.38768005371094, + "logps/rejected": -50.862403869628906, + "loss": 0.2625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32805755734443665, + "rewards/margins": 2.48689341545105, + "rewards/rejected": -2.814950942993164, + "step": 196 + }, + { + "epoch": 1.1674074074074074, + "grad_norm": 33.63843769566052, + "learning_rate": 4.967625656594781e-07, + "logits/chosen": -1.059614896774292, + "logits/rejected": -1.1472609043121338, + "logps/chosen": -46.473411560058594, + "logps/rejected": -47.90194320678711, + "loss": 0.3335, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.664130449295044, + "rewards/margins": 1.3695776462554932, + "rewards/rejected": -2.033708095550537, + "step": 197 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 25.93318598058472, + "learning_rate": 4.966575252961365e-07, + "logits/chosen": -0.9099617004394531, + "logits/rejected": -1.086199402809143, + "logps/chosen": -41.337745666503906, + "logps/rejected": -47.416107177734375, + "loss": 0.2587, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.344178169965744, + "rewards/margins": 2.067307472229004, + "rewards/rejected": -2.4114856719970703, + "step": 198 + }, + { + "epoch": 1.1792592592592592, + "grad_norm": 26.412082223409303, + "learning_rate": 4.9655081944571e-07, + "logits/chosen": -1.1102447509765625, + "logits/rejected": -1.0874698162078857, + "logps/chosen": -37.87491989135742, + "logps/rejected": -44.73563003540039, + "loss": 0.259, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09453117847442627, + "rewards/margins": 1.5228742361068726, + "rewards/rejected": -1.6174055337905884, + "step": 199 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 30.713958497949275, + "learning_rate": 4.964424488287009e-07, + "logits/chosen": -1.3750660419464111, + "logits/rejected": -1.5156819820404053, + "logps/chosen": -41.877891540527344, + "logps/rejected": -48.9142951965332, + "loss": 0.3177, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21863913536071777, + "rewards/margins": 1.9450185298919678, + "rewards/rejected": -2.1636576652526855, + "step": 200 + }, + { + "epoch": 1.1911111111111112, + "grad_norm": 31.035229497524096, + "learning_rate": 4.963324141768518e-07, + "logits/chosen": -1.5514556169509888, + "logits/rejected": -1.433958649635315, + "logps/chosen": -45.017242431640625, + "logps/rejected": -59.65779113769531, + "loss": 0.2587, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7274824976921082, + "rewards/margins": 1.8945391178131104, + "rewards/rejected": -2.6220216751098633, + "step": 201 + }, + { + "epoch": 1.1970370370370371, + "grad_norm": 25.89090435294919, + "learning_rate": 4.962207162331414e-07, + "logits/chosen": -1.4618628025054932, + "logits/rejected": -1.3739463090896606, + "logps/chosen": -34.74220275878906, + "logps/rejected": -49.502899169921875, + "loss": 0.245, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1882825791835785, + "rewards/margins": 2.2607545852661133, + "rewards/rejected": -2.4490370750427246, + "step": 202 + }, + { + "epoch": 1.202962962962963, + "grad_norm": 32.42192756007002, + "learning_rate": 4.961073557517792e-07, + "logits/chosen": -1.6199530363082886, + "logits/rejected": -1.6937488317489624, + "logps/chosen": -33.535919189453125, + "logps/rejected": -38.97394561767578, + "loss": 0.3269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28588587045669556, + "rewards/margins": 1.5164083242416382, + "rewards/rejected": -1.8022942543029785, + "step": 203 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 40.287715082939165, + "learning_rate": 4.95992333498201e-07, + "logits/chosen": -1.2144708633422852, + "logits/rejected": -1.1612203121185303, + "logps/chosen": -41.554542541503906, + "logps/rejected": -54.438270568847656, + "loss": 0.4155, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6244699954986572, + "rewards/margins": 1.4721695184707642, + "rewards/rejected": -2.096639633178711, + "step": 204 + }, + { + "epoch": 1.2148148148148148, + "grad_norm": 31.12088690243942, + "learning_rate": 4.958756502490626e-07, + "logits/chosen": -1.4652817249298096, + "logits/rejected": -1.3764071464538574, + "logps/chosen": -36.79831314086914, + "logps/rejected": -59.886383056640625, + "loss": 0.2911, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.30859121680259705, + "rewards/margins": 2.757361888885498, + "rewards/rejected": -3.065952777862549, + "step": 205 + }, + { + "epoch": 1.2207407407407407, + "grad_norm": 30.460892608308736, + "learning_rate": 4.957573067922359e-07, + "logits/chosen": -1.7502586841583252, + "logits/rejected": -1.4709583520889282, + "logps/chosen": -36.0119514465332, + "logps/rejected": -51.99536895751953, + "loss": 0.3284, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36890193819999695, + "rewards/margins": 1.593143343925476, + "rewards/rejected": -1.9620450735092163, + "step": 206 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 31.368572219016052, + "learning_rate": 4.956373039268021e-07, + "logits/chosen": -0.9687676429748535, + "logits/rejected": -1.1372796297073364, + "logps/chosen": -40.27385711669922, + "logps/rejected": -53.913265228271484, + "loss": 0.3021, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39367496967315674, + "rewards/margins": 3.2817864418029785, + "rewards/rejected": -3.6754612922668457, + "step": 207 + }, + { + "epoch": 1.2325925925925927, + "grad_norm": 28.33249165288491, + "learning_rate": 4.955156424630479e-07, + "logits/chosen": -1.431084394454956, + "logits/rejected": -1.2873094081878662, + "logps/chosen": -38.09033203125, + "logps/rejected": -54.86888122558594, + "loss": 0.268, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4929051399230957, + "rewards/margins": 2.4378890991210938, + "rewards/rejected": -2.9307942390441895, + "step": 208 + }, + { + "epoch": 1.2385185185185186, + "grad_norm": 32.08230351680351, + "learning_rate": 4.953923232224586e-07, + "logits/chosen": -1.2549279928207397, + "logits/rejected": -1.2246358394622803, + "logps/chosen": -37.1239128112793, + "logps/rejected": -43.20243453979492, + "loss": 0.3344, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.21321409940719604, + "rewards/margins": 1.9452736377716064, + "rewards/rejected": -2.1584877967834473, + "step": 209 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 22.865961224797655, + "learning_rate": 4.952673470377137e-07, + "logits/chosen": -1.3631465435028076, + "logits/rejected": -1.389848232269287, + "logps/chosen": -34.06418991088867, + "logps/rejected": -60.43558120727539, + "loss": 0.208, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04666715860366821, + "rewards/margins": 2.7804617881774902, + "rewards/rejected": -2.733794689178467, + "step": 210 + }, + { + "epoch": 1.2503703703703704, + "grad_norm": 33.18444614021853, + "learning_rate": 4.951407147526803e-07, + "logits/chosen": -1.26889169216156, + "logits/rejected": -1.262638807296753, + "logps/chosen": -44.48564910888672, + "logps/rejected": -50.11359405517578, + "loss": 0.322, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3982480466365814, + "rewards/margins": 2.0993640422821045, + "rewards/rejected": -2.4976119995117188, + "step": 211 + }, + { + "epoch": 1.2562962962962962, + "grad_norm": 32.15101957433149, + "learning_rate": 4.950124272224082e-07, + "logits/chosen": -1.0472859144210815, + "logits/rejected": -0.9838506579399109, + "logps/chosen": -44.68108367919922, + "logps/rejected": -52.78573226928711, + "loss": 0.3157, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.46195846796035767, + "rewards/margins": 1.9016549587249756, + "rewards/rejected": -2.3636133670806885, + "step": 212 + }, + { + "epoch": 1.2622222222222224, + "grad_norm": 27.576004007465258, + "learning_rate": 4.948824853131236e-07, + "logits/chosen": -0.8029367923736572, + "logits/rejected": -0.7601336240768433, + "logps/chosen": -37.77496337890625, + "logps/rejected": -43.76077651977539, + "loss": 0.2837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45464155077934265, + "rewards/margins": 1.4948298931121826, + "rewards/rejected": -1.9494714736938477, + "step": 213 + }, + { + "epoch": 1.268148148148148, + "grad_norm": 31.02135986952652, + "learning_rate": 4.947508899022234e-07, + "logits/chosen": -1.3477468490600586, + "logits/rejected": -1.4308537244796753, + "logps/chosen": -32.83580017089844, + "logps/rejected": -36.56342315673828, + "loss": 0.2527, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05743288993835449, + "rewards/margins": 1.7676852941513062, + "rewards/rejected": -1.7102524042129517, + "step": 214 + }, + { + "epoch": 1.2740740740740741, + "grad_norm": 34.88556914925436, + "learning_rate": 4.946176418782698e-07, + "logits/chosen": -2.1566824913024902, + "logits/rejected": -2.090806484222412, + "logps/chosen": -46.413124084472656, + "logps/rejected": -66.00543975830078, + "loss": 0.2969, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8992799520492554, + "rewards/margins": 2.2176101207733154, + "rewards/rejected": -3.1168899536132812, + "step": 215 + }, + { + "epoch": 1.28, + "grad_norm": 35.18732477543249, + "learning_rate": 4.944827421409829e-07, + "logits/chosen": -1.181545376777649, + "logits/rejected": -1.3015544414520264, + "logps/chosen": -49.637779235839844, + "logps/rejected": -57.909244537353516, + "loss": 0.326, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8887189030647278, + "rewards/margins": 1.9327178001403809, + "rewards/rejected": -2.821436643600464, + "step": 216 + }, + { + "epoch": 1.285925925925926, + "grad_norm": 24.591401843178655, + "learning_rate": 4.943461916012363e-07, + "logits/chosen": -1.474562168121338, + "logits/rejected": -1.4195574522018433, + "logps/chosen": -43.157752990722656, + "logps/rejected": -63.577606201171875, + "loss": 0.1874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.028524503111839294, + "rewards/margins": 3.577744483947754, + "rewards/rejected": -3.606269359588623, + "step": 217 + }, + { + "epoch": 1.2918518518518518, + "grad_norm": 33.2216535916157, + "learning_rate": 4.9420799118105e-07, + "logits/chosen": -1.2040375471115112, + "logits/rejected": -1.3402186632156372, + "logps/chosen": -40.10552215576172, + "logps/rejected": -47.71405792236328, + "loss": 0.3211, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4178282916545868, + "rewards/margins": 2.3589730262756348, + "rewards/rejected": -2.776801347732544, + "step": 218 + }, + { + "epoch": 1.2977777777777777, + "grad_norm": 24.75619541002914, + "learning_rate": 4.940681418135843e-07, + "logits/chosen": -1.5321645736694336, + "logits/rejected": -1.4269739389419556, + "logps/chosen": -30.008689880371094, + "logps/rejected": -57.94854736328125, + "loss": 0.2077, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2954857349395752, + "rewards/margins": 3.1439101696014404, + "rewards/rejected": -3.4393959045410156, + "step": 219 + }, + { + "epoch": 1.3037037037037038, + "grad_norm": 32.2244205728105, + "learning_rate": 4.939266444431335e-07, + "logits/chosen": -1.1552932262420654, + "logits/rejected": -0.8443745374679565, + "logps/chosen": -37.77070617675781, + "logps/rejected": -65.06275177001953, + "loss": 0.3124, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43534815311431885, + "rewards/margins": 2.6500864028930664, + "rewards/rejected": -3.0854344367980957, + "step": 220 + }, + { + "epoch": 1.3096296296296297, + "grad_norm": 34.31075867463383, + "learning_rate": 4.937835000251197e-07, + "logits/chosen": -1.6441656351089478, + "logits/rejected": -1.4223228693008423, + "logps/chosen": -39.33687210083008, + "logps/rejected": -60.138710021972656, + "loss": 0.3128, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6635284423828125, + "rewards/margins": 2.1143033504486084, + "rewards/rejected": -2.777831792831421, + "step": 221 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 24.691079671097242, + "learning_rate": 4.936387095260863e-07, + "logits/chosen": -1.527470350265503, + "logits/rejected": -1.349807620048523, + "logps/chosen": -31.314828872680664, + "logps/rejected": -60.759063720703125, + "loss": 0.2295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4836583435535431, + "rewards/margins": 3.3463735580444336, + "rewards/rejected": -3.8300321102142334, + "step": 222 + }, + { + "epoch": 1.3214814814814815, + "grad_norm": 29.27341357309857, + "learning_rate": 4.934922739236912e-07, + "logits/chosen": -1.265206217765808, + "logits/rejected": -1.2071069478988647, + "logps/chosen": -33.34834289550781, + "logps/rejected": -54.05493927001953, + "loss": 0.2658, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.175077885389328, + "rewards/margins": 2.510103940963745, + "rewards/rejected": -2.6851820945739746, + "step": 223 + }, + { + "epoch": 1.3274074074074074, + "grad_norm": 33.68278541187267, + "learning_rate": 4.933441942067006e-07, + "logits/chosen": -1.1925909519195557, + "logits/rejected": -0.9682923555374146, + "logps/chosen": -51.606163024902344, + "logps/rejected": -62.01679611206055, + "loss": 0.2812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.20239922404289246, + "rewards/margins": 2.0104899406433105, + "rewards/rejected": -2.2128894329071045, + "step": 224 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 25.970662302058052, + "learning_rate": 4.93194471374982e-07, + "logits/chosen": -1.2032102346420288, + "logits/rejected": -1.178951621055603, + "logps/chosen": -39.75410461425781, + "logps/rejected": -51.909507751464844, + "loss": 0.2617, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3341366946697235, + "rewards/margins": 1.715659260749817, + "rewards/rejected": -2.0497961044311523, + "step": 225 + }, + { + "epoch": 1.3392592592592591, + "grad_norm": 28.630664569506944, + "learning_rate": 4.930431064394976e-07, + "logits/chosen": -1.2886648178100586, + "logits/rejected": -1.1532554626464844, + "logps/chosen": -46.30451202392578, + "logps/rejected": -48.568546295166016, + "loss": 0.2757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7058642506599426, + "rewards/margins": 1.6998100280761719, + "rewards/rejected": -2.405674457550049, + "step": 226 + }, + { + "epoch": 1.3451851851851853, + "grad_norm": 29.29936360176106, + "learning_rate": 4.928901004222977e-07, + "logits/chosen": -1.8237268924713135, + "logits/rejected": -1.9156745672225952, + "logps/chosen": -39.91328430175781, + "logps/rejected": -49.987911224365234, + "loss": 0.251, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7427995204925537, + "rewards/margins": 2.478236198425293, + "rewards/rejected": -3.221035957336426, + "step": 227 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 25.1014676322794, + "learning_rate": 4.92735454356513e-07, + "logits/chosen": -1.5877453088760376, + "logits/rejected": -1.4968990087509155, + "logps/chosen": -45.305973052978516, + "logps/rejected": -64.76545715332031, + "loss": 0.2078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6092345118522644, + "rewards/margins": 2.52042293548584, + "rewards/rejected": -3.129657506942749, + "step": 228 + }, + { + "epoch": 1.357037037037037, + "grad_norm": 30.078462760535523, + "learning_rate": 4.925791692863488e-07, + "logits/chosen": -1.226888656616211, + "logits/rejected": -1.1548047065734863, + "logps/chosen": -32.65156173706055, + "logps/rejected": -45.28959655761719, + "loss": 0.2925, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.459149032831192, + "rewards/margins": 2.1847825050354004, + "rewards/rejected": -2.6439313888549805, + "step": 229 + }, + { + "epoch": 1.362962962962963, + "grad_norm": 39.86584108808501, + "learning_rate": 4.924212462670768e-07, + "logits/chosen": -1.27255380153656, + "logits/rejected": -1.3597698211669922, + "logps/chosen": -45.05531311035156, + "logps/rejected": -54.719696044921875, + "loss": 0.3644, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3324463665485382, + "rewards/margins": 2.581584930419922, + "rewards/rejected": -2.914031505584717, + "step": 230 + }, + { + "epoch": 1.3688888888888888, + "grad_norm": 22.696681021450235, + "learning_rate": 4.922616863650289e-07, + "logits/chosen": -1.656007170677185, + "logits/rejected": -1.5683889389038086, + "logps/chosen": -41.425514221191406, + "logps/rejected": -66.38223266601562, + "loss": 0.2261, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3960660398006439, + "rewards/margins": 3.071995735168457, + "rewards/rejected": -3.468061685562134, + "step": 231 + }, + { + "epoch": 1.374814814814815, + "grad_norm": 29.486281634858543, + "learning_rate": 4.921004906575896e-07, + "logits/chosen": -1.4675967693328857, + "logits/rejected": -1.393191933631897, + "logps/chosen": -43.084381103515625, + "logps/rejected": -52.339630126953125, + "loss": 0.2716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27437475323677063, + "rewards/margins": 2.7827978134155273, + "rewards/rejected": -3.0571727752685547, + "step": 232 + }, + { + "epoch": 1.3807407407407408, + "grad_norm": 31.144179117519332, + "learning_rate": 4.919376602331883e-07, + "logits/chosen": -1.2110486030578613, + "logits/rejected": -1.4307780265808105, + "logps/chosen": -47.4427604675293, + "logps/rejected": -56.62528991699219, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2791571617126465, + "rewards/margins": 4.268290996551514, + "rewards/rejected": -4.54744815826416, + "step": 233 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 29.17828921084767, + "learning_rate": 4.917731961912926e-07, + "logits/chosen": -0.8598974347114563, + "logits/rejected": -0.7685071229934692, + "logps/chosen": -38.05088806152344, + "logps/rejected": -50.185630798339844, + "loss": 0.2472, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3008832037448883, + "rewards/margins": 2.4341988563537598, + "rewards/rejected": -2.7350821495056152, + "step": 234 + }, + { + "epoch": 1.3925925925925926, + "grad_norm": 29.70725398562355, + "learning_rate": 4.91607099642401e-07, + "logits/chosen": -1.569394826889038, + "logits/rejected": -1.5405265092849731, + "logps/chosen": -40.34491729736328, + "logps/rejected": -49.547454833984375, + "loss": 0.302, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.32715070247650146, + "rewards/margins": 2.5119659900665283, + "rewards/rejected": -2.8391168117523193, + "step": 235 + }, + { + "epoch": 1.3985185185185185, + "grad_norm": 25.496602616107623, + "learning_rate": 4.914393717080346e-07, + "logits/chosen": -1.1362640857696533, + "logits/rejected": -1.0989569425582886, + "logps/chosen": -31.34090232849121, + "logps/rejected": -44.264793395996094, + "loss": 0.2358, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08524125814437866, + "rewards/margins": 2.406437873840332, + "rewards/rejected": -2.3211965560913086, + "step": 236 + }, + { + "epoch": 1.4044444444444444, + "grad_norm": 28.575948207640465, + "learning_rate": 4.9127001352073e-07, + "logits/chosen": -1.339735746383667, + "logits/rejected": -1.2221708297729492, + "logps/chosen": -39.16889190673828, + "logps/rejected": -60.40363311767578, + "loss": 0.21, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5930743217468262, + "rewards/margins": 2.652825355529785, + "rewards/rejected": -3.2458996772766113, + "step": 237 + }, + { + "epoch": 1.4103703703703703, + "grad_norm": 24.369471042771703, + "learning_rate": 4.910990262240321e-07, + "logits/chosen": -1.8492693901062012, + "logits/rejected": -1.8319913148880005, + "logps/chosen": -36.13838195800781, + "logps/rejected": -47.36172103881836, + "loss": 0.1971, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.33304405212402344, + "rewards/margins": 2.4353842735290527, + "rewards/rejected": -2.768428325653076, + "step": 238 + }, + { + "epoch": 1.4162962962962964, + "grad_norm": 25.91992533520253, + "learning_rate": 4.909264109724852e-07, + "logits/chosen": -1.5908434391021729, + "logits/rejected": -1.4340860843658447, + "logps/chosen": -30.060483932495117, + "logps/rejected": -50.15409469604492, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035857439041137695, + "rewards/margins": 2.761453628540039, + "rewards/rejected": -2.7255964279174805, + "step": 239 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 28.00265276561218, + "learning_rate": 4.907521689316265e-07, + "logits/chosen": -1.2963168621063232, + "logits/rejected": -0.9083112478256226, + "logps/chosen": -31.37090301513672, + "logps/rejected": -67.65018463134766, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04646921157836914, + "rewards/margins": 2.4311680793762207, + "rewards/rejected": -2.477637529373169, + "step": 240 + }, + { + "epoch": 1.4281481481481482, + "grad_norm": 22.845123828242635, + "learning_rate": 4.905763012779775e-07, + "logits/chosen": -2.0449070930480957, + "logits/rejected": -1.8327888250350952, + "logps/chosen": -47.79774475097656, + "logps/rejected": -66.09587097167969, + "loss": 0.2008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43650582432746887, + "rewards/margins": 4.076416492462158, + "rewards/rejected": -4.512922286987305, + "step": 241 + }, + { + "epoch": 1.434074074074074, + "grad_norm": 20.347183374436156, + "learning_rate": 4.90398809199036e-07, + "logits/chosen": -1.5851411819458008, + "logits/rejected": -1.7745072841644287, + "logps/chosen": -46.445831298828125, + "logps/rejected": -54.314292907714844, + "loss": 0.1838, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21882963180541992, + "rewards/margins": 3.393106698989868, + "rewards/rejected": -3.611936569213867, + "step": 242 + }, + { + "epoch": 1.44, + "grad_norm": 32.344226506103965, + "learning_rate": 4.902196938932685e-07, + "logits/chosen": -1.8756588697433472, + "logits/rejected": -1.8360813856124878, + "logps/chosen": -36.15525436401367, + "logps/rejected": -44.08958053588867, + "loss": 0.2719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1032625138759613, + "rewards/margins": 2.351353645324707, + "rewards/rejected": -2.454616069793701, + "step": 243 + }, + { + "epoch": 1.445925925925926, + "grad_norm": 28.242539134271052, + "learning_rate": 4.90038956570102e-07, + "logits/chosen": -1.1855772733688354, + "logits/rejected": -1.3019590377807617, + "logps/chosen": -54.570953369140625, + "logps/rejected": -50.146095275878906, + "loss": 0.2221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9502605199813843, + "rewards/margins": 1.6242974996566772, + "rewards/rejected": -2.5745580196380615, + "step": 244 + }, + { + "epoch": 1.4518518518518517, + "grad_norm": 24.65166740001487, + "learning_rate": 4.898565984499153e-07, + "logits/chosen": -1.5563501119613647, + "logits/rejected": -1.2845768928527832, + "logps/chosen": -32.572696685791016, + "logps/rejected": -62.32068634033203, + "loss": 0.2219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4941897392272949, + "rewards/margins": 3.1560323238372803, + "rewards/rejected": -3.650221824645996, + "step": 245 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 26.176561390267167, + "learning_rate": 4.896726207640314e-07, + "logits/chosen": -1.284088134765625, + "logits/rejected": -1.530550241470337, + "logps/chosen": -49.36870574951172, + "logps/rejected": -45.64020538330078, + "loss": 0.2343, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45776939392089844, + "rewards/margins": 2.273721694946289, + "rewards/rejected": -2.7314910888671875, + "step": 246 + }, + { + "epoch": 1.4637037037037037, + "grad_norm": 32.31060258996922, + "learning_rate": 4.894870247547093e-07, + "logits/chosen": -1.6879163980484009, + "logits/rejected": -1.5221493244171143, + "logps/chosen": -30.046092987060547, + "logps/rejected": -55.25277328491211, + "loss": 0.3011, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42848262190818787, + "rewards/margins": 2.895785093307495, + "rewards/rejected": -3.324267864227295, + "step": 247 + }, + { + "epoch": 1.4696296296296296, + "grad_norm": 24.438337032328114, + "learning_rate": 4.892998116751348e-07, + "logits/chosen": -1.3243341445922852, + "logits/rejected": -1.3782624006271362, + "logps/chosen": -32.396087646484375, + "logps/rejected": -41.859649658203125, + "loss": 0.2315, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05424916744232178, + "rewards/margins": 1.7480883598327637, + "rewards/rejected": -1.693839192390442, + "step": 248 + }, + { + "epoch": 1.4755555555555555, + "grad_norm": 25.80505112713608, + "learning_rate": 4.891109827894127e-07, + "logits/chosen": -1.5409799814224243, + "logits/rejected": -1.6533753871917725, + "logps/chosen": -60.424072265625, + "logps/rejected": -66.90180206298828, + "loss": 0.2631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5530689358711243, + "rewards/margins": 3.336688280105591, + "rewards/rejected": -3.8897571563720703, + "step": 249 + }, + { + "epoch": 1.4814814814814814, + "grad_norm": 26.021296278280122, + "learning_rate": 4.889205393725583e-07, + "logits/chosen": -1.2817881107330322, + "logits/rejected": -1.060404658317566, + "logps/chosen": -32.70816421508789, + "logps/rejected": -52.11760711669922, + "loss": 0.2183, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21378082036972046, + "rewards/margins": 3.265263080596924, + "rewards/rejected": -3.0514822006225586, + "step": 250 + }, + { + "epoch": 1.4874074074074075, + "grad_norm": 26.001434100122033, + "learning_rate": 4.887284827104881e-07, + "logits/chosen": -2.0695719718933105, + "logits/rejected": -1.9940532445907593, + "logps/chosen": -28.80967140197754, + "logps/rejected": -78.68373107910156, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.015644848346710205, + "rewards/margins": 4.127260684967041, + "rewards/rejected": -4.111616134643555, + "step": 251 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 31.73693517564944, + "learning_rate": 4.885348141000122e-07, + "logits/chosen": -1.0371400117874146, + "logits/rejected": -1.0298850536346436, + "logps/chosen": -34.962913513183594, + "logps/rejected": -53.1561279296875, + "loss": 0.2894, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4044491946697235, + "rewards/margins": 3.485771417617798, + "rewards/rejected": -3.8902206420898438, + "step": 252 + }, + { + "epoch": 1.4992592592592593, + "grad_norm": 27.137864591631498, + "learning_rate": 4.883395348488243e-07, + "logits/chosen": -1.3027697801589966, + "logits/rejected": -1.3290810585021973, + "logps/chosen": -47.35820007324219, + "logps/rejected": -53.30695343017578, + "loss": 0.2341, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4950598478317261, + "rewards/margins": 3.308154344558716, + "rewards/rejected": -3.8032140731811523, + "step": 253 + }, + { + "epoch": 1.5051851851851852, + "grad_norm": 27.529883497460066, + "learning_rate": 4.88142646275494e-07, + "logits/chosen": -1.4786378145217896, + "logits/rejected": -1.4781079292297363, + "logps/chosen": -35.87552261352539, + "logps/rejected": -50.91257095336914, + "loss": 0.2294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06936764717102051, + "rewards/margins": 2.9450628757476807, + "rewards/rejected": -3.014430522918701, + "step": 254 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 27.659117607442656, + "learning_rate": 4.879441497094572e-07, + "logits/chosen": -1.3110840320587158, + "logits/rejected": -1.2133020162582397, + "logps/chosen": -35.17852783203125, + "logps/rejected": -45.25739288330078, + "loss": 0.242, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07239916920661926, + "rewards/margins": 2.228761672973633, + "rewards/rejected": -2.3011608123779297, + "step": 255 + }, + { + "epoch": 1.5170370370370372, + "grad_norm": 31.289246246516008, + "learning_rate": 4.877440464910073e-07, + "logits/chosen": -1.564308762550354, + "logits/rejected": -1.2593793869018555, + "logps/chosen": -36.79103469848633, + "logps/rejected": -52.30302047729492, + "loss": 0.2747, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7653313279151917, + "rewards/margins": 2.011465311050415, + "rewards/rejected": -2.776796340942383, + "step": 256 + }, + { + "epoch": 1.5229629629629629, + "grad_norm": 29.211097710250886, + "learning_rate": 4.875423379712864e-07, + "logits/chosen": -1.4866360425949097, + "logits/rejected": -1.2372266054153442, + "logps/chosen": -37.493309020996094, + "logps/rejected": -67.01297760009766, + "loss": 0.2303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5150955319404602, + "rewards/margins": 3.707171678543091, + "rewards/rejected": -4.222267150878906, + "step": 257 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 26.21514812235197, + "learning_rate": 4.873390255122756e-07, + "logits/chosen": -1.276237964630127, + "logits/rejected": -1.4946424961090088, + "logps/chosen": -38.75402069091797, + "logps/rejected": -52.27590560913086, + "loss": 0.1998, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3489813804626465, + "rewards/margins": 3.6085846424102783, + "rewards/rejected": -3.957566022872925, + "step": 258 + }, + { + "epoch": 1.5348148148148149, + "grad_norm": 29.630098087292826, + "learning_rate": 4.871341104867864e-07, + "logits/chosen": -1.614687442779541, + "logits/rejected": -1.4839171171188354, + "logps/chosen": -43.47966003417969, + "logps/rejected": -53.94084167480469, + "loss": 0.2513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4233679175376892, + "rewards/margins": 2.405550241470337, + "rewards/rejected": -2.828917980194092, + "step": 259 + }, + { + "epoch": 1.5407407407407407, + "grad_norm": 28.18645966963501, + "learning_rate": 4.869275942784511e-07, + "logits/chosen": -1.4025224447250366, + "logits/rejected": -1.199758768081665, + "logps/chosen": -30.40345001220703, + "logps/rejected": -47.12421417236328, + "loss": 0.2201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3111732304096222, + "rewards/margins": 2.2125093936920166, + "rewards/rejected": -2.5236825942993164, + "step": 260 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 33.643647259370184, + "learning_rate": 4.867194782817137e-07, + "logits/chosen": -1.0420281887054443, + "logits/rejected": -1.054417610168457, + "logps/chosen": -40.79283142089844, + "logps/rejected": -43.291046142578125, + "loss": 0.3138, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.007962435483932495, + "rewards/margins": 2.214482307434082, + "rewards/rejected": -2.222445011138916, + "step": 261 + }, + { + "epoch": 1.5525925925925925, + "grad_norm": 29.30111226931869, + "learning_rate": 4.865097639018202e-07, + "logits/chosen": -1.2452741861343384, + "logits/rejected": -1.1993070840835571, + "logps/chosen": -42.200904846191406, + "logps/rejected": -56.703468322753906, + "loss": 0.2458, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7176186442375183, + "rewards/margins": 2.2573556900024414, + "rewards/rejected": -2.9749743938446045, + "step": 262 + }, + { + "epoch": 1.5585185185185186, + "grad_norm": 25.912554895976097, + "learning_rate": 4.862984525548091e-07, + "logits/chosen": -1.1260875463485718, + "logits/rejected": -1.1260864734649658, + "logps/chosen": -33.189659118652344, + "logps/rejected": -43.486534118652344, + "loss": 0.2525, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.013395726680755615, + "rewards/margins": 2.880253791809082, + "rewards/rejected": -2.8936493396759033, + "step": 263 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 25.587208623838446, + "learning_rate": 4.860855456675024e-07, + "logits/chosen": -1.5094867944717407, + "logits/rejected": -1.4383167028427124, + "logps/chosen": -34.93250274658203, + "logps/rejected": -62.74409484863281, + "loss": 0.1984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5001842379570007, + "rewards/margins": 3.726599931716919, + "rewards/rejected": -4.226784706115723, + "step": 264 + }, + { + "epoch": 1.5703703703703704, + "grad_norm": 27.07883301946322, + "learning_rate": 4.85871044677495e-07, + "logits/chosen": -1.2766215801239014, + "logits/rejected": -1.2287697792053223, + "logps/chosen": -43.3912239074707, + "logps/rejected": -58.06782150268555, + "loss": 0.2165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05057030916213989, + "rewards/margins": 3.6173009872436523, + "rewards/rejected": -3.6678709983825684, + "step": 265 + }, + { + "epoch": 1.5762962962962963, + "grad_norm": 28.18716416057205, + "learning_rate": 4.856549510331461e-07, + "logits/chosen": -1.9891289472579956, + "logits/rejected": -1.8930578231811523, + "logps/chosen": -39.6988639831543, + "logps/rejected": -50.377994537353516, + "loss": 0.247, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3518238663673401, + "rewards/margins": 2.627807140350342, + "rewards/rejected": -2.979630947113037, + "step": 266 + }, + { + "epoch": 1.5822222222222222, + "grad_norm": 25.21634175596078, + "learning_rate": 4.854372661935684e-07, + "logits/chosen": -1.3575857877731323, + "logits/rejected": -1.2936060428619385, + "logps/chosen": -39.070228576660156, + "logps/rejected": -49.42529296875, + "loss": 0.253, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28435996174812317, + "rewards/margins": 2.0522220134735107, + "rewards/rejected": -2.3365821838378906, + "step": 267 + }, + { + "epoch": 1.5881481481481483, + "grad_norm": 25.067139870537627, + "learning_rate": 4.852179916286189e-07, + "logits/chosen": -1.6648176908493042, + "logits/rejected": -1.3902709484100342, + "logps/chosen": -37.70637893676758, + "logps/rejected": -55.506195068359375, + "loss": 0.236, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2536647319793701, + "rewards/margins": 2.993668556213379, + "rewards/rejected": -3.247333288192749, + "step": 268 + }, + { + "epoch": 1.594074074074074, + "grad_norm": 27.770666704811592, + "learning_rate": 4.849971288188889e-07, + "logits/chosen": -1.361600637435913, + "logits/rejected": -1.1852885484695435, + "logps/chosen": -33.70117950439453, + "logps/rejected": -57.27318572998047, + "loss": 0.2575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07503369450569153, + "rewards/margins": 2.6005759239196777, + "rewards/rejected": -2.5255422592163086, + "step": 269 + }, + { + "epoch": 1.6, + "grad_norm": 34.25142945752442, + "learning_rate": 4.847746792556936e-07, + "logits/chosen": -1.7486881017684937, + "logits/rejected": -1.783259630203247, + "logps/chosen": -45.490604400634766, + "logps/rejected": -50.73610305786133, + "loss": 0.3175, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.502980649471283, + "rewards/margins": 3.7213191986083984, + "rewards/rejected": -4.224299907684326, + "step": 270 + }, + { + "epoch": 1.605925925925926, + "grad_norm": 20.275807912086314, + "learning_rate": 4.845506444410626e-07, + "logits/chosen": -1.2842497825622559, + "logits/rejected": -1.4023914337158203, + "logps/chosen": -36.809024810791016, + "logps/rejected": -41.20471954345703, + "loss": 0.2032, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5635099411010742, + "rewards/margins": 2.691178560256958, + "rewards/rejected": -3.254688262939453, + "step": 271 + }, + { + "epoch": 1.6118518518518519, + "grad_norm": 24.468859454275407, + "learning_rate": 4.843250258877294e-07, + "logits/chosen": -1.632100224494934, + "logits/rejected": -1.7789125442504883, + "logps/chosen": -40.27824401855469, + "logps/rejected": -42.3206672668457, + "loss": 0.2082, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21913456916809082, + "rewards/margins": 2.017493486404419, + "rewards/rejected": -2.2366278171539307, + "step": 272 + }, + { + "epoch": 1.6177777777777778, + "grad_norm": 35.329644939940636, + "learning_rate": 4.840978251191211e-07, + "logits/chosen": -1.4649416208267212, + "logits/rejected": -1.5622587203979492, + "logps/chosen": -44.524383544921875, + "logps/rejected": -47.37981414794922, + "loss": 0.3464, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.726753294467926, + "rewards/margins": 2.2740654945373535, + "rewards/rejected": -3.0008187294006348, + "step": 273 + }, + { + "epoch": 1.6237037037037036, + "grad_norm": 41.38773432746113, + "learning_rate": 4.838690436693483e-07, + "logits/chosen": -1.4717864990234375, + "logits/rejected": -1.5824425220489502, + "logps/chosen": -62.04584884643555, + "logps/rejected": -64.39595031738281, + "loss": 0.3574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29022544622421265, + "rewards/margins": 3.477273941040039, + "rewards/rejected": -3.7674996852874756, + "step": 274 + }, + { + "epoch": 1.6296296296296298, + "grad_norm": 28.53036915990997, + "learning_rate": 4.836386830831951e-07, + "logits/chosen": -1.4235268831253052, + "logits/rejected": -1.3757199048995972, + "logps/chosen": -33.49037170410156, + "logps/rejected": -52.49199676513672, + "loss": 0.2304, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2958151698112488, + "rewards/margins": 2.777085304260254, + "rewards/rejected": -3.0729002952575684, + "step": 275 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 18.64876805274949, + "learning_rate": 4.834067449161077e-07, + "logits/chosen": -1.503778100013733, + "logits/rejected": -1.242067575454712, + "logps/chosen": -41.87370300292969, + "logps/rejected": -67.22463989257812, + "loss": 0.1729, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38197654485702515, + "rewards/margins": 3.5509443283081055, + "rewards/rejected": -3.9329206943511963, + "step": 276 + }, + { + "epoch": 1.6414814814814815, + "grad_norm": 18.696944468106953, + "learning_rate": 4.83173230734185e-07, + "logits/chosen": -1.8584768772125244, + "logits/rejected": -1.5482829809188843, + "logps/chosen": -37.62712097167969, + "logps/rejected": -68.63243103027344, + "loss": 0.1572, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19617757201194763, + "rewards/margins": 2.8529129028320312, + "rewards/rejected": -3.0490903854370117, + "step": 277 + }, + { + "epoch": 1.6474074074074074, + "grad_norm": 26.003574415992446, + "learning_rate": 4.829381421141671e-07, + "logits/chosen": -1.407435417175293, + "logits/rejected": -1.323309302330017, + "logps/chosen": -35.42864227294922, + "logps/rejected": -50.165164947509766, + "loss": 0.2221, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4789162874221802, + "rewards/margins": 2.547053813934326, + "rewards/rejected": -3.025969982147217, + "step": 278 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 26.39467431573978, + "learning_rate": 4.827014806434253e-07, + "logits/chosen": -1.535237431526184, + "logits/rejected": -1.4902827739715576, + "logps/chosen": -57.465370178222656, + "logps/rejected": -78.98246765136719, + "loss": 0.1981, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.563177227973938, + "rewards/margins": 3.7371535301208496, + "rewards/rejected": -4.300330638885498, + "step": 279 + }, + { + "epoch": 1.6592592592592592, + "grad_norm": 32.49048736438936, + "learning_rate": 4.824632479199511e-07, + "logits/chosen": -1.4649295806884766, + "logits/rejected": -1.4040541648864746, + "logps/chosen": -44.324127197265625, + "logps/rejected": -55.108619689941406, + "loss": 0.2692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22248147428035736, + "rewards/margins": 2.6846671104431152, + "rewards/rejected": -2.907148838043213, + "step": 280 + }, + { + "epoch": 1.665185185185185, + "grad_norm": 29.020989526396118, + "learning_rate": 4.822234455523453e-07, + "logits/chosen": -1.7030525207519531, + "logits/rejected": -1.6422662734985352, + "logps/chosen": -34.391170501708984, + "logps/rejected": -58.28746795654297, + "loss": 0.2213, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.007947564125061, + "rewards/margins": 2.7358577251434326, + "rewards/rejected": -3.743805170059204, + "step": 281 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 20.797914036041025, + "learning_rate": 4.819820751598076e-07, + "logits/chosen": -1.2752454280853271, + "logits/rejected": -1.3115514516830444, + "logps/chosen": -34.01559829711914, + "logps/rejected": -46.77418899536133, + "loss": 0.1823, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3145977854728699, + "rewards/margins": 2.5336179733276367, + "rewards/rejected": -2.8482160568237305, + "step": 282 + }, + { + "epoch": 1.6770370370370369, + "grad_norm": 26.774632112271266, + "learning_rate": 4.817391383721249e-07, + "logits/chosen": -1.5755228996276855, + "logits/rejected": -1.2880446910858154, + "logps/chosen": -40.601654052734375, + "logps/rejected": -66.74862670898438, + "loss": 0.1948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3186114430427551, + "rewards/margins": 3.0807223320007324, + "rewards/rejected": -3.399333953857422, + "step": 283 + }, + { + "epoch": 1.682962962962963, + "grad_norm": 31.762544002402134, + "learning_rate": 4.814946368296616e-07, + "logits/chosen": -1.195683240890503, + "logits/rejected": -1.124356746673584, + "logps/chosen": -34.632972717285156, + "logps/rejected": -49.069664001464844, + "loss": 0.2904, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5742506384849548, + "rewards/margins": 2.327057361602783, + "rewards/rejected": -2.901308059692383, + "step": 284 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 33.560122086582936, + "learning_rate": 4.812485721833464e-07, + "logits/chosen": -1.8307454586029053, + "logits/rejected": -1.5758129358291626, + "logps/chosen": -48.31635665893555, + "logps/rejected": -76.99293518066406, + "loss": 0.242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42657211422920227, + "rewards/margins": 3.085447072982788, + "rewards/rejected": -3.512019157409668, + "step": 285 + }, + { + "epoch": 1.6948148148148148, + "grad_norm": 31.833892095022023, + "learning_rate": 4.810009460946635e-07, + "logits/chosen": -1.3257019519805908, + "logits/rejected": -1.190606713294983, + "logps/chosen": -38.6641845703125, + "logps/rejected": -52.16994094848633, + "loss": 0.2515, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23393523693084717, + "rewards/margins": 2.993025541305542, + "rewards/rejected": -3.226961135864258, + "step": 286 + }, + { + "epoch": 1.7007407407407409, + "grad_norm": 24.602658233645577, + "learning_rate": 4.8075176023564e-07, + "logits/chosen": -1.226719617843628, + "logits/rejected": -1.0526630878448486, + "logps/chosen": -37.76171112060547, + "logps/rejected": -55.442710876464844, + "loss": 0.2353, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.017082542181015015, + "rewards/margins": 2.848991632461548, + "rewards/rejected": -2.8319091796875, + "step": 287 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 27.23670345528548, + "learning_rate": 4.805010162888346e-07, + "logits/chosen": -1.474751353263855, + "logits/rejected": -1.519294023513794, + "logps/chosen": -44.07060623168945, + "logps/rejected": -49.262245178222656, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1540902554988861, + "rewards/margins": 2.2316653728485107, + "rewards/rejected": -2.3857555389404297, + "step": 288 + }, + { + "epoch": 1.7125925925925927, + "grad_norm": 30.319908739216153, + "learning_rate": 4.802487159473271e-07, + "logits/chosen": -1.283259630203247, + "logits/rejected": -1.2373545169830322, + "logps/chosen": -41.3947868347168, + "logps/rejected": -57.63949203491211, + "loss": 0.2539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4889850616455078, + "rewards/margins": 3.9944164752960205, + "rewards/rejected": -4.483401298522949, + "step": 289 + }, + { + "epoch": 1.7185185185185186, + "grad_norm": 23.991939953835697, + "learning_rate": 4.799948609147061e-07, + "logits/chosen": -1.344802737236023, + "logits/rejected": -1.218867301940918, + "logps/chosen": -39.203948974609375, + "logps/rejected": -55.3688850402832, + "loss": 0.2034, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4728401303291321, + "rewards/margins": 2.995742082595825, + "rewards/rejected": -3.4685819149017334, + "step": 290 + }, + { + "epoch": 1.7244444444444444, + "grad_norm": 18.676355499326434, + "learning_rate": 4.797394529050577e-07, + "logits/chosen": -1.393309235572815, + "logits/rejected": -1.3101699352264404, + "logps/chosen": -46.854278564453125, + "logps/rejected": -56.5775146484375, + "loss": 0.1496, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3527151048183441, + "rewards/margins": 3.0831854343414307, + "rewards/rejected": -3.4359004497528076, + "step": 291 + }, + { + "epoch": 1.7303703703703703, + "grad_norm": 24.923664319626276, + "learning_rate": 4.794824936429543e-07, + "logits/chosen": -1.4183282852172852, + "logits/rejected": -1.319956660270691, + "logps/chosen": -32.210166931152344, + "logps/rejected": -40.73335647583008, + "loss": 0.192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2939870357513428, + "rewards/margins": 3.1537084579467773, + "rewards/rejected": -2.8597211837768555, + "step": 292 + }, + { + "epoch": 1.7362962962962962, + "grad_norm": 22.354248418795603, + "learning_rate": 4.792239848634426e-07, + "logits/chosen": -1.1553823947906494, + "logits/rejected": -1.1823606491088867, + "logps/chosen": -44.60945129394531, + "logps/rejected": -52.90985870361328, + "loss": 0.2075, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29689791798591614, + "rewards/margins": 3.1230454444885254, + "rewards/rejected": -3.4199435710906982, + "step": 293 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 20.407211978062318, + "learning_rate": 4.789639283120322e-07, + "logits/chosen": -1.777173638343811, + "logits/rejected": -1.6720659732818604, + "logps/chosen": -30.502058029174805, + "logps/rejected": -55.912635803222656, + "loss": 0.194, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023857399821281433, + "rewards/margins": 4.332409858703613, + "rewards/rejected": -4.3085527420043945, + "step": 294 + }, + { + "epoch": 1.748148148148148, + "grad_norm": 22.108322034964687, + "learning_rate": 4.787023257446832e-07, + "logits/chosen": -1.170305609703064, + "logits/rejected": -1.1386812925338745, + "logps/chosen": -46.53575134277344, + "logps/rejected": -56.994422912597656, + "loss": 0.1593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07963424921035767, + "rewards/margins": 3.4485249519348145, + "rewards/rejected": -3.3688905239105225, + "step": 295 + }, + { + "epoch": 1.7540740740740741, + "grad_norm": 25.145388894157914, + "learning_rate": 4.784391789277952e-07, + "logits/chosen": -1.4293248653411865, + "logits/rejected": -1.3704808950424194, + "logps/chosen": -33.885250091552734, + "logps/rejected": -45.85121154785156, + "loss": 0.2342, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2255295366048813, + "rewards/margins": 2.5380513668060303, + "rewards/rejected": -2.763580799102783, + "step": 296 + }, + { + "epoch": 1.76, + "grad_norm": 25.641234909948803, + "learning_rate": 4.781744896381944e-07, + "logits/chosen": -1.3408482074737549, + "logits/rejected": -1.2661770582199097, + "logps/chosen": -50.95426940917969, + "logps/rejected": -62.9360466003418, + "loss": 0.2135, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.606664776802063, + "rewards/margins": 2.8116254806518555, + "rewards/rejected": -3.418290376663208, + "step": 297 + }, + { + "epoch": 1.765925925925926, + "grad_norm": 17.698843100494624, + "learning_rate": 4.779082596631226e-07, + "logits/chosen": -1.2342387437820435, + "logits/rejected": -0.7951022386550903, + "logps/chosen": -36.144840240478516, + "logps/rejected": -62.39626693725586, + "loss": 0.1044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.04224780201911926, + "rewards/margins": 4.0136003494262695, + "rewards/rejected": -4.055848121643066, + "step": 298 + }, + { + "epoch": 1.771851851851852, + "grad_norm": 24.352998739418076, + "learning_rate": 4.776404908002245e-07, + "logits/chosen": -1.4755496978759766, + "logits/rejected": -1.259360909461975, + "logps/chosen": -35.81382751464844, + "logps/rejected": -53.95246124267578, + "loss": 0.2259, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.28839975595474243, + "rewards/margins": 2.211383819580078, + "rewards/rejected": -2.499783515930176, + "step": 299 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 33.803527524668894, + "learning_rate": 4.773711848575356e-07, + "logits/chosen": -1.5609314441680908, + "logits/rejected": -1.7283426523208618, + "logps/chosen": -49.594703674316406, + "logps/rejected": -48.518550872802734, + "loss": 0.257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7066521644592285, + "rewards/margins": 2.6102638244628906, + "rewards/rejected": -3.316915988922119, + "step": 300 + }, + { + "epoch": 1.7837037037037038, + "grad_norm": 22.088311553708106, + "learning_rate": 4.771003436534702e-07, + "logits/chosen": -1.8934781551361084, + "logits/rejected": -1.519129753112793, + "logps/chosen": -35.62154006958008, + "logps/rejected": -61.455875396728516, + "loss": 0.1709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06477588415145874, + "rewards/margins": 3.86716628074646, + "rewards/rejected": -3.8023905754089355, + "step": 301 + }, + { + "epoch": 1.7896296296296297, + "grad_norm": 31.474291026302936, + "learning_rate": 4.7682796901680906e-07, + "logits/chosen": -0.7919188141822815, + "logits/rejected": -0.667506754398346, + "logps/chosen": -42.68754196166992, + "logps/rejected": -61.21702194213867, + "loss": 0.263, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5648635625839233, + "rewards/margins": 3.7736103534698486, + "rewards/rejected": -4.338474273681641, + "step": 302 + }, + { + "epoch": 1.7955555555555556, + "grad_norm": 25.861532042981946, + "learning_rate": 4.765540627866869e-07, + "logits/chosen": -1.824802041053772, + "logits/rejected": -1.9499473571777344, + "logps/chosen": -51.77365493774414, + "logps/rejected": -50.19049072265625, + "loss": 0.2147, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6279614567756653, + "rewards/margins": 2.7504382133483887, + "rewards/rejected": -3.37839937210083, + "step": 303 + }, + { + "epoch": 1.8014814814814815, + "grad_norm": 27.50126926128136, + "learning_rate": 4.7627862681258027e-07, + "logits/chosen": -1.2844971418380737, + "logits/rejected": -1.365068793296814, + "logps/chosen": -37.15514373779297, + "logps/rejected": -42.242286682128906, + "loss": 0.251, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05502760410308838, + "rewards/margins": 2.6144795417785645, + "rewards/rejected": -2.6695070266723633, + "step": 304 + }, + { + "epoch": 1.8074074074074074, + "grad_norm": 29.03711415892969, + "learning_rate": 4.7600166295429476e-07, + "logits/chosen": -0.9044826030731201, + "logits/rejected": -0.8295137882232666, + "logps/chosen": -34.650047302246094, + "logps/rejected": -51.922096252441406, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9077940583229065, + "rewards/margins": 2.484612464904785, + "rewards/rejected": -3.3924062252044678, + "step": 305 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 24.14727003294785, + "learning_rate": 4.7572317308195276e-07, + "logits/chosen": -1.8832111358642578, + "logits/rejected": -1.620338797569275, + "logps/chosen": -38.12425231933594, + "logps/rejected": -56.99446105957031, + "loss": 0.1977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22710034251213074, + "rewards/margins": 2.965156078338623, + "rewards/rejected": -3.192256450653076, + "step": 306 + }, + { + "epoch": 1.8192592592592591, + "grad_norm": 28.260474836345267, + "learning_rate": 4.7544315907598034e-07, + "logits/chosen": -1.2670873403549194, + "logits/rejected": -1.1744232177734375, + "logps/chosen": -36.84714126586914, + "logps/rejected": -44.011810302734375, + "loss": 0.2284, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.37556207180023193, + "rewards/margins": 2.0585269927978516, + "rewards/rejected": -2.434089183807373, + "step": 307 + }, + { + "epoch": 1.8251851851851852, + "grad_norm": 30.677307239625325, + "learning_rate": 4.7516162282709515e-07, + "logits/chosen": -2.0330681800842285, + "logits/rejected": -1.9144965410232544, + "logps/chosen": -38.90164566040039, + "logps/rejected": -51.84284210205078, + "loss": 0.2654, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3513489365577698, + "rewards/margins": 2.5029258728027344, + "rewards/rejected": -2.8542749881744385, + "step": 308 + }, + { + "epoch": 1.8311111111111111, + "grad_norm": 25.485483260448667, + "learning_rate": 4.748785662362932e-07, + "logits/chosen": -1.3988568782806396, + "logits/rejected": -1.1845300197601318, + "logps/chosen": -47.51979064941406, + "logps/rejected": -69.07670593261719, + "loss": 0.2036, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1303098201751709, + "rewards/margins": 3.500807285308838, + "rewards/rejected": -3.370497226715088, + "step": 309 + }, + { + "epoch": 1.837037037037037, + "grad_norm": 25.256216032781886, + "learning_rate": 4.7459399121483634e-07, + "logits/chosen": -1.3516114950180054, + "logits/rejected": -1.4268112182617188, + "logps/chosen": -48.036685943603516, + "logps/rejected": -57.18891906738281, + "loss": 0.2178, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3574332594871521, + "rewards/margins": 3.844609260559082, + "rewards/rejected": -4.202042579650879, + "step": 310 + }, + { + "epoch": 1.842962962962963, + "grad_norm": 24.369536690397222, + "learning_rate": 4.74307899684239e-07, + "logits/chosen": -1.069244623184204, + "logits/rejected": -1.054039478302002, + "logps/chosen": -45.125831604003906, + "logps/rejected": -57.4384651184082, + "loss": 0.2188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5561109185218811, + "rewards/margins": 3.588425636291504, + "rewards/rejected": -4.144536972045898, + "step": 311 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 31.092403944743825, + "learning_rate": 4.7402029357625563e-07, + "logits/chosen": -1.3228671550750732, + "logits/rejected": -1.3702974319458008, + "logps/chosen": -45.36236572265625, + "logps/rejected": -53.258853912353516, + "loss": 0.2624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0017190277576446533, + "rewards/margins": 3.654135227203369, + "rewards/rejected": -3.652416229248047, + "step": 312 + }, + { + "epoch": 1.854814814814815, + "grad_norm": 36.08547561166675, + "learning_rate": 4.737311748328673e-07, + "logits/chosen": -1.4084701538085938, + "logits/rejected": -1.2140371799468994, + "logps/chosen": -40.802616119384766, + "logps/rejected": -60.195823669433594, + "loss": 0.2686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7584589719772339, + "rewards/margins": 3.5438449382781982, + "rewards/rejected": -4.302303791046143, + "step": 313 + }, + { + "epoch": 1.8607407407407406, + "grad_norm": 23.310186468345425, + "learning_rate": 4.7344054540626887e-07, + "logits/chosen": -1.0628677606582642, + "logits/rejected": -1.2128937244415283, + "logps/chosen": -29.777320861816406, + "logps/rejected": -42.853614807128906, + "loss": 0.193, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09455633163452148, + "rewards/margins": 3.388399124145508, + "rewards/rejected": -3.2938427925109863, + "step": 314 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 23.41430379058325, + "learning_rate": 4.731484072588555e-07, + "logits/chosen": -1.1477389335632324, + "logits/rejected": -1.1705048084259033, + "logps/chosen": -39.31995391845703, + "logps/rejected": -56.09278869628906, + "loss": 0.1586, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.46085304021835327, + "rewards/margins": 3.9288570880889893, + "rewards/rejected": -4.389710426330566, + "step": 315 + }, + { + "epoch": 1.8725925925925926, + "grad_norm": 32.46581149348618, + "learning_rate": 4.7285476236320976e-07, + "logits/chosen": -1.323777198791504, + "logits/rejected": -1.2052544355392456, + "logps/chosen": -36.882781982421875, + "logps/rejected": -58.216888427734375, + "loss": 0.2763, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.299294114112854, + "rewards/margins": 2.8449900150299072, + "rewards/rejected": -3.144284248352051, + "step": 316 + }, + { + "epoch": 1.8785185185185185, + "grad_norm": 26.73276893926149, + "learning_rate": 4.725596127020879e-07, + "logits/chosen": -1.9721529483795166, + "logits/rejected": -1.777452826499939, + "logps/chosen": -44.33803176879883, + "logps/rejected": -61.64508056640625, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06346356868743896, + "rewards/margins": 2.4611425399780273, + "rewards/rejected": -2.397678852081299, + "step": 317 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 32.452202726241005, + "learning_rate": 4.7226296026840686e-07, + "logits/chosen": -1.0877735614776611, + "logits/rejected": -1.0793946981430054, + "logps/chosen": -39.05781936645508, + "logps/rejected": -49.42589569091797, + "loss": 0.2663, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8096023797988892, + "rewards/margins": 1.485464096069336, + "rewards/rejected": -2.2950665950775146, + "step": 318 + }, + { + "epoch": 1.8903703703703703, + "grad_norm": 34.12614874092215, + "learning_rate": 4.7196480706523066e-07, + "logits/chosen": -1.5061382055282593, + "logits/rejected": -1.2875308990478516, + "logps/chosen": -39.065887451171875, + "logps/rejected": -52.43286895751953, + "loss": 0.2586, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11397549510002136, + "rewards/margins": 2.9681153297424316, + "rewards/rejected": -3.0820908546447754, + "step": 319 + }, + { + "epoch": 1.8962962962962964, + "grad_norm": 21.572252361390387, + "learning_rate": 4.716651551057567e-07, + "logits/chosen": -1.606571912765503, + "logits/rejected": -1.3566153049468994, + "logps/chosen": -42.1258659362793, + "logps/rejected": -60.3515510559082, + "loss": 0.1804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23937255144119263, + "rewards/margins": 3.642030715942383, + "rewards/rejected": -3.8814032077789307, + "step": 320 + }, + { + "epoch": 1.9022222222222223, + "grad_norm": 23.09776497168057, + "learning_rate": 4.7136400641330245e-07, + "logits/chosen": -1.7957206964492798, + "logits/rejected": -1.8467280864715576, + "logps/chosen": -34.64049530029297, + "logps/rejected": -55.07294464111328, + "loss": 0.2012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3333526849746704, + "rewards/margins": 3.44081711769104, + "rewards/rejected": -3.774169921875, + "step": 321 + }, + { + "epoch": 1.9081481481481481, + "grad_norm": 22.774321177776685, + "learning_rate": 4.710613630212916e-07, + "logits/chosen": -1.482938289642334, + "logits/rejected": -1.5811318159103394, + "logps/chosen": -45.22616195678711, + "logps/rejected": -60.64653015136719, + "loss": 0.1673, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5251885652542114, + "rewards/margins": 4.717442512512207, + "rewards/rejected": -5.242630958557129, + "step": 322 + }, + { + "epoch": 1.914074074074074, + "grad_norm": 27.731235068985775, + "learning_rate": 4.707572269732404e-07, + "logits/chosen": -1.5442708730697632, + "logits/rejected": -1.4787030220031738, + "logps/chosen": -37.137718200683594, + "logps/rejected": -54.83098602294922, + "loss": 0.2146, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15929245948791504, + "rewards/margins": 3.0805439949035645, + "rewards/rejected": -2.9212515354156494, + "step": 323 + }, + { + "epoch": 1.92, + "grad_norm": 24.64035908249009, + "learning_rate": 4.7045160032274376e-07, + "logits/chosen": -1.3611880540847778, + "logits/rejected": -1.240279197692871, + "logps/chosen": -46.88545608520508, + "logps/rejected": -69.9762191772461, + "loss": 0.1878, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9051852226257324, + "rewards/margins": 3.8201780319213867, + "rewards/rejected": -4.725363254547119, + "step": 324 + }, + { + "epoch": 1.925925925925926, + "grad_norm": 25.51618342591243, + "learning_rate": 4.701444851334617e-07, + "logits/chosen": -1.103247880935669, + "logits/rejected": -1.1200182437896729, + "logps/chosen": -38.013099670410156, + "logps/rejected": -40.656707763671875, + "loss": 0.1805, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09585386514663696, + "rewards/margins": 2.9236769676208496, + "rewards/rejected": -3.019530773162842, + "step": 325 + }, + { + "epoch": 1.9318518518518517, + "grad_norm": 23.951817662174715, + "learning_rate": 4.698358834791051e-07, + "logits/chosen": -1.2133781909942627, + "logits/rejected": -1.1840780973434448, + "logps/chosen": -37.65591812133789, + "logps/rejected": -55.01872253417969, + "loss": 0.1766, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5142883062362671, + "rewards/margins": 3.075425386428833, + "rewards/rejected": -3.5897135734558105, + "step": 326 + }, + { + "epoch": 1.9377777777777778, + "grad_norm": 25.13824963787546, + "learning_rate": 4.695257974434215e-07, + "logits/chosen": -1.3680187463760376, + "logits/rejected": -1.4396944046020508, + "logps/chosen": -46.585350036621094, + "logps/rejected": -49.38848114013672, + "loss": 0.2593, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.672561526298523, + "rewards/margins": 2.856922149658203, + "rewards/rejected": -3.5294833183288574, + "step": 327 + }, + { + "epoch": 1.9437037037037037, + "grad_norm": 20.347354135425682, + "learning_rate": 4.6921422912018174e-07, + "logits/chosen": -1.5289157629013062, + "logits/rejected": -1.3484441041946411, + "logps/chosen": -33.157310485839844, + "logps/rejected": -60.96978759765625, + "loss": 0.1445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4831308424472809, + "rewards/margins": 3.873037099838257, + "rewards/rejected": -4.356168270111084, + "step": 328 + }, + { + "epoch": 1.9496296296296296, + "grad_norm": 23.106778896782263, + "learning_rate": 4.689011806131651e-07, + "logits/chosen": -1.5771872997283936, + "logits/rejected": -1.5938518047332764, + "logps/chosen": -44.27512741088867, + "logps/rejected": -46.58651351928711, + "loss": 0.1959, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38986605405807495, + "rewards/margins": 2.192875623703003, + "rewards/rejected": -2.5827417373657227, + "step": 329 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 30.558215857635854, + "learning_rate": 4.685866540361455e-07, + "logits/chosen": -1.098953127861023, + "logits/rejected": -0.836959958076477, + "logps/chosen": -35.18102264404297, + "logps/rejected": -60.627655029296875, + "loss": 0.2649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03618580102920532, + "rewards/margins": 4.475597381591797, + "rewards/rejected": -4.439411640167236, + "step": 330 + }, + { + "epoch": 1.9614814814814814, + "grad_norm": 26.406377528441052, + "learning_rate": 4.6827065151287726e-07, + "logits/chosen": -1.6159999370574951, + "logits/rejected": -1.5996906757354736, + "logps/chosen": -42.592185974121094, + "logps/rejected": -61.75004196166992, + "loss": 0.2316, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3190871477127075, + "rewards/margins": 2.9828944206237793, + "rewards/rejected": -3.3019816875457764, + "step": 331 + }, + { + "epoch": 1.9674074074074075, + "grad_norm": 29.63360426993791, + "learning_rate": 4.6795317517708037e-07, + "logits/chosen": -1.3993651866912842, + "logits/rejected": -1.431746244430542, + "logps/chosen": -40.97235107421875, + "logps/rejected": -51.440406799316406, + "loss": 0.2577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15230146050453186, + "rewards/margins": 3.651387929916382, + "rewards/rejected": -3.499086380004883, + "step": 332 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 23.887681678446096, + "learning_rate": 4.676342271724265e-07, + "logits/chosen": -2.0206501483917236, + "logits/rejected": -1.9321112632751465, + "logps/chosen": -33.036163330078125, + "logps/rejected": -52.85546112060547, + "loss": 0.2217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3907713294029236, + "rewards/margins": 3.7629597187042236, + "rewards/rejected": -4.153730869293213, + "step": 333 + }, + { + "epoch": 1.9792592592592593, + "grad_norm": 31.05342628187582, + "learning_rate": 4.673138096525243e-07, + "logits/chosen": -0.9585044384002686, + "logits/rejected": -0.7881700992584229, + "logps/chosen": -41.761714935302734, + "logps/rejected": -57.95026779174805, + "loss": 0.2294, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3382699191570282, + "rewards/margins": 2.7629165649414062, + "rewards/rejected": -3.1011862754821777, + "step": 334 + }, + { + "epoch": 1.9851851851851852, + "grad_norm": 23.832470475974606, + "learning_rate": 4.6699192478090495e-07, + "logits/chosen": -1.7739732265472412, + "logits/rejected": -1.647121787071228, + "logps/chosen": -36.00649642944336, + "logps/rejected": -56.30244445800781, + "loss": 0.1638, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5453657507896423, + "rewards/margins": 3.5081372261047363, + "rewards/rejected": -4.053503036499023, + "step": 335 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 23.724299564441143, + "learning_rate": 4.666685747310074e-07, + "logits/chosen": -1.0021092891693115, + "logits/rejected": -0.9383841753005981, + "logps/chosen": -48.95524597167969, + "logps/rejected": -65.27790832519531, + "loss": 0.1646, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.2525467276573181, + "rewards/margins": 4.3653106689453125, + "rewards/rejected": -4.617857456207275, + "step": 336 + }, + { + "epoch": 1.9970370370370372, + "grad_norm": 26.482303767274114, + "learning_rate": 4.663437616861641e-07, + "logits/chosen": -1.426429271697998, + "logits/rejected": -1.4977319240570068, + "logps/chosen": -38.05354309082031, + "logps/rejected": -45.32928466796875, + "loss": 0.1916, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18445011973381042, + "rewards/margins": 3.021121025085449, + "rewards/rejected": -3.205570697784424, + "step": 337 + }, + { + "epoch": 2.002962962962963, + "grad_norm": 16.43992788407079, + "learning_rate": 4.660174878395855e-07, + "logits/chosen": -1.5523961782455444, + "logits/rejected": -1.3887999057769775, + "logps/chosen": -42.240272521972656, + "logps/rejected": -56.8974723815918, + "loss": 0.1446, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.46149742603302, + "rewards/margins": 4.195261001586914, + "rewards/rejected": -4.656758785247803, + "step": 338 + }, + { + "epoch": 2.008888888888889, + "grad_norm": 11.567648071422163, + "learning_rate": 4.6568975539434624e-07, + "logits/chosen": -1.6070256233215332, + "logits/rejected": -1.5450011491775513, + "logps/chosen": -30.87481689453125, + "logps/rejected": -46.38202667236328, + "loss": 0.1257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.001979619264602661, + "rewards/margins": 2.454421281814575, + "rewards/rejected": -2.452441692352295, + "step": 339 + }, + { + "epoch": 2.0148148148148146, + "grad_norm": 10.578338909286192, + "learning_rate": 4.653605665633694e-07, + "logits/chosen": -1.3266496658325195, + "logits/rejected": -1.1827898025512695, + "logps/chosen": -46.79537582397461, + "logps/rejected": -70.38484191894531, + "loss": 0.1019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8711462020874023, + "rewards/margins": 3.143320083618164, + "rewards/rejected": -4.014466285705566, + "step": 340 + }, + { + "epoch": 2.0207407407407407, + "grad_norm": 12.217181224347149, + "learning_rate": 4.6502992356941193e-07, + "logits/chosen": -1.832109808921814, + "logits/rejected": -1.8823586702346802, + "logps/chosen": -41.478126525878906, + "logps/rejected": -60.84067916870117, + "loss": 0.0908, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2918078899383545, + "rewards/margins": 3.8748106956481934, + "rewards/rejected": -4.166618824005127, + "step": 341 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 12.075752437338522, + "learning_rate": 4.6469782864504993e-07, + "logits/chosen": -1.3877170085906982, + "logits/rejected": -1.3782953023910522, + "logps/chosen": -44.92713165283203, + "logps/rejected": -60.12208557128906, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3753489553928375, + "rewards/margins": 3.521347999572754, + "rewards/rejected": -3.8966970443725586, + "step": 342 + }, + { + "epoch": 2.0325925925925925, + "grad_norm": 10.663089210825552, + "learning_rate": 4.643642840326627e-07, + "logits/chosen": -1.2021465301513672, + "logits/rejected": -0.9962760210037231, + "logps/chosen": -27.163143157958984, + "logps/rejected": -56.33610534667969, + "loss": 0.087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.251925528049469, + "rewards/margins": 3.977407455444336, + "rewards/rejected": -4.22933292388916, + "step": 343 + }, + { + "epoch": 2.0385185185185186, + "grad_norm": 13.245834881134732, + "learning_rate": 4.6402929198441876e-07, + "logits/chosen": -1.2417113780975342, + "logits/rejected": -1.3178651332855225, + "logps/chosen": -40.25837326049805, + "logps/rejected": -54.30005645751953, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05435517430305481, + "rewards/margins": 4.211629867553711, + "rewards/rejected": -4.1572747230529785, + "step": 344 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 10.571718543433656, + "learning_rate": 4.6369285476225953e-07, + "logits/chosen": -1.4555777311325073, + "logits/rejected": -1.3790762424468994, + "logps/chosen": -28.030969619750977, + "logps/rejected": -49.054386138916016, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06195509433746338, + "rewards/margins": 3.6887340545654297, + "rewards/rejected": -3.7506890296936035, + "step": 345 + }, + { + "epoch": 2.0503703703703704, + "grad_norm": 12.624900961599034, + "learning_rate": 4.6335497463788497e-07, + "logits/chosen": -1.6352788209915161, + "logits/rejected": -1.643273115158081, + "logps/chosen": -49.78592300415039, + "logps/rejected": -65.03792572021484, + "loss": 0.0986, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6293268799781799, + "rewards/margins": 4.604469299316406, + "rewards/rejected": -5.2337965965271, + "step": 346 + }, + { + "epoch": 2.0562962962962965, + "grad_norm": 8.46422535980816, + "learning_rate": 4.6301565389273755e-07, + "logits/chosen": -1.5873762369155884, + "logits/rejected": -1.4620087146759033, + "logps/chosen": -37.836265563964844, + "logps/rejected": -47.346160888671875, + "loss": 0.0689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030138731002807617, + "rewards/margins": 3.4804584980010986, + "rewards/rejected": -3.450319766998291, + "step": 347 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 12.580939295603663, + "learning_rate": 4.6267489481798736e-07, + "logits/chosen": -1.265205979347229, + "logits/rejected": -1.074588656425476, + "logps/chosen": -42.781517028808594, + "logps/rejected": -64.60198974609375, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2199668288230896, + "rewards/margins": 5.077633380889893, + "rewards/rejected": -5.297599792480469, + "step": 348 + }, + { + "epoch": 2.0681481481481483, + "grad_norm": 17.749133036624073, + "learning_rate": 4.6233269971451627e-07, + "logits/chosen": -1.1558200120925903, + "logits/rejected": -1.1175287961959839, + "logps/chosen": -47.773460388183594, + "logps/rejected": -61.29571533203125, + "loss": 0.1078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3092385530471802, + "rewards/margins": 3.6212716102600098, + "rewards/rejected": -3.9305102825164795, + "step": 349 + }, + { + "epoch": 2.074074074074074, + "grad_norm": 14.385543975965255, + "learning_rate": 4.619890708929025e-07, + "logits/chosen": -1.5604002475738525, + "logits/rejected": -1.399170160293579, + "logps/chosen": -40.24705505371094, + "logps/rejected": -54.07878875732422, + "loss": 0.1248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07989335060119629, + "rewards/margins": 2.8104934692382812, + "rewards/rejected": -2.8903868198394775, + "step": 350 + }, + { + "epoch": 2.08, + "grad_norm": 8.82921299232262, + "learning_rate": 4.6164401067340526e-07, + "logits/chosen": -1.6723850965499878, + "logits/rejected": -1.534740686416626, + "logps/chosen": -40.84113311767578, + "logps/rejected": -51.969539642333984, + "loss": 0.0727, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15863922238349915, + "rewards/margins": 3.5295114517211914, + "rewards/rejected": -3.688150405883789, + "step": 351 + }, + { + "epoch": 2.0859259259259257, + "grad_norm": 11.642298858178583, + "learning_rate": 4.612975213859487e-07, + "logits/chosen": -1.572222113609314, + "logits/rejected": -1.4436020851135254, + "logps/chosen": -43.44025802612305, + "logps/rejected": -70.96936798095703, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4972277283668518, + "rewards/margins": 4.664485931396484, + "rewards/rejected": -5.161713600158691, + "step": 352 + }, + { + "epoch": 2.091851851851852, + "grad_norm": 9.739329509065628, + "learning_rate": 4.609496053701064e-07, + "logits/chosen": -1.2559152841567993, + "logits/rejected": -1.027353048324585, + "logps/chosen": -38.480262756347656, + "logps/rejected": -63.803558349609375, + "loss": 0.0687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8633000254631042, + "rewards/margins": 5.508317470550537, + "rewards/rejected": -6.371617317199707, + "step": 353 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 11.068841998119069, + "learning_rate": 4.606002649750855e-07, + "logits/chosen": -1.7230266332626343, + "logits/rejected": -1.7567236423492432, + "logps/chosen": -41.075870513916016, + "logps/rejected": -57.1865234375, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2410399317741394, + "rewards/margins": 4.128354549407959, + "rewards/rejected": -4.369394302368164, + "step": 354 + }, + { + "epoch": 2.1037037037037036, + "grad_norm": 10.99841662995637, + "learning_rate": 4.6024950255971106e-07, + "logits/chosen": -1.9418741464614868, + "logits/rejected": -1.7188175916671753, + "logps/chosen": -41.123714447021484, + "logps/rejected": -60.509952545166016, + "loss": 0.0736, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25673192739486694, + "rewards/margins": 3.7385413646698, + "rewards/rejected": -3.9952731132507324, + "step": 355 + }, + { + "epoch": 2.1096296296296297, + "grad_norm": 12.220073717808429, + "learning_rate": 4.598973204924097e-07, + "logits/chosen": -1.5152227878570557, + "logits/rejected": -1.2851159572601318, + "logps/chosen": -34.16219711303711, + "logps/rejected": -58.27785873413086, + "loss": 0.0936, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058980196714401245, + "rewards/margins": 4.199907302856445, + "rewards/rejected": -4.140926837921143, + "step": 356 + }, + { + "epoch": 2.1155555555555554, + "grad_norm": 11.959716813475435, + "learning_rate": 4.5954372115119395e-07, + "logits/chosen": -1.0060768127441406, + "logits/rejected": -0.8774399161338806, + "logps/chosen": -38.00749206542969, + "logps/rejected": -55.84514617919922, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07694879174232483, + "rewards/margins": 4.28627347946167, + "rewards/rejected": -4.209324836730957, + "step": 357 + }, + { + "epoch": 2.1214814814814815, + "grad_norm": 13.030300625065772, + "learning_rate": 4.5918870692364606e-07, + "logits/chosen": -1.5811065435409546, + "logits/rejected": -1.4525682926177979, + "logps/chosen": -42.92179489135742, + "logps/rejected": -65.98393249511719, + "loss": 0.1022, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11530932784080505, + "rewards/margins": 4.738978385925293, + "rewards/rejected": -4.854287624359131, + "step": 358 + }, + { + "epoch": 2.127407407407407, + "grad_norm": 11.182842417441053, + "learning_rate": 4.5883228020690204e-07, + "logits/chosen": -1.7302151918411255, + "logits/rejected": -1.6962320804595947, + "logps/chosen": -46.37171173095703, + "logps/rejected": -77.80236053466797, + "loss": 0.0852, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49540960788726807, + "rewards/margins": 5.423673152923584, + "rewards/rejected": -5.9190826416015625, + "step": 359 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 12.012153556810627, + "learning_rate": 4.5847444340763516e-07, + "logits/chosen": -2.043246269226074, + "logits/rejected": -1.7466188669204712, + "logps/chosen": -36.21939468383789, + "logps/rejected": -70.36598205566406, + "loss": 0.1002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5813055038452148, + "rewards/margins": 4.620772361755371, + "rewards/rejected": -5.202077865600586, + "step": 360 + }, + { + "epoch": 2.1392592592592594, + "grad_norm": 10.866062973581458, + "learning_rate": 4.5811519894204e-07, + "logits/chosen": -1.3014001846313477, + "logits/rejected": -1.369365930557251, + "logps/chosen": -37.454872131347656, + "logps/rejected": -48.353370666503906, + "loss": 0.0904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7284749746322632, + "rewards/margins": 3.3361270427703857, + "rewards/rejected": -4.064601898193359, + "step": 361 + }, + { + "epoch": 2.145185185185185, + "grad_norm": 10.984013926118568, + "learning_rate": 4.577545492358159e-07, + "logits/chosen": -1.3029181957244873, + "logits/rejected": -1.2786986827850342, + "logps/chosen": -34.83509826660156, + "logps/rejected": -43.100284576416016, + "loss": 0.0911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24407219886779785, + "rewards/margins": 3.1360340118408203, + "rewards/rejected": -3.3801064491271973, + "step": 362 + }, + { + "epoch": 2.151111111111111, + "grad_norm": 15.321427963115688, + "learning_rate": 4.573924967241509e-07, + "logits/chosen": -1.6620972156524658, + "logits/rejected": -1.6337506771087646, + "logps/chosen": -41.56214904785156, + "logps/rejected": -55.2154655456543, + "loss": 0.0901, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0873856246471405, + "rewards/margins": 3.668139696121216, + "rewards/rejected": -3.580754041671753, + "step": 363 + }, + { + "epoch": 2.157037037037037, + "grad_norm": 13.335824438416493, + "learning_rate": 4.5702904385170495e-07, + "logits/chosen": -1.4828283786773682, + "logits/rejected": -1.3530676364898682, + "logps/chosen": -37.79193878173828, + "logps/rejected": -55.125858306884766, + "loss": 0.0921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4327038824558258, + "rewards/margins": 3.8411686420440674, + "rewards/rejected": -4.273872375488281, + "step": 364 + }, + { + "epoch": 2.162962962962963, + "grad_norm": 10.513080158547776, + "learning_rate": 4.566641930725935e-07, + "logits/chosen": -0.9346736669540405, + "logits/rejected": -0.8324167728424072, + "logps/chosen": -38.22724533081055, + "logps/rejected": -60.57867431640625, + "loss": 0.0794, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40388479828834534, + "rewards/margins": 3.9846858978271484, + "rewards/rejected": -4.388570785522461, + "step": 365 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 17.76026800342366, + "learning_rate": 4.5629794685037125e-07, + "logits/chosen": -1.5361857414245605, + "logits/rejected": -1.4024276733398438, + "logps/chosen": -41.96015548706055, + "logps/rejected": -64.81401824951172, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6312123537063599, + "rewards/margins": 4.2823896408081055, + "rewards/rejected": -4.913601875305176, + "step": 366 + }, + { + "epoch": 2.1748148148148148, + "grad_norm": 14.5609677400579, + "learning_rate": 4.5593030765801493e-07, + "logits/chosen": -1.497239589691162, + "logits/rejected": -1.434057354927063, + "logps/chosen": -37.64501953125, + "logps/rejected": -54.797393798828125, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6892575025558472, + "rewards/margins": 4.99005651473999, + "rewards/rejected": -5.679313659667969, + "step": 367 + }, + { + "epoch": 2.180740740740741, + "grad_norm": 8.065156807391954, + "learning_rate": 4.555612779779071e-07, + "logits/chosen": -1.3586572408676147, + "logits/rejected": -1.0452971458435059, + "logps/chosen": -43.58069610595703, + "logps/rejected": -63.918155670166016, + "loss": 0.0665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6702809929847717, + "rewards/margins": 4.295268535614014, + "rewards/rejected": -4.965549468994141, + "step": 368 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 9.937243299148612, + "learning_rate": 4.551908603018191e-07, + "logits/chosen": -1.537664771080017, + "logits/rejected": -1.4797343015670776, + "logps/chosen": -40.95267868041992, + "logps/rejected": -59.62392807006836, + "loss": 0.0734, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8830336928367615, + "rewards/margins": 4.300657272338867, + "rewards/rejected": -5.183691024780273, + "step": 369 + }, + { + "epoch": 2.1925925925925926, + "grad_norm": 12.149045017374526, + "learning_rate": 4.548190571308944e-07, + "logits/chosen": -1.8814219236373901, + "logits/rejected": -1.5982637405395508, + "logps/chosen": -40.04777145385742, + "logps/rejected": -68.85643768310547, + "loss": 0.0906, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5396410226821899, + "rewards/margins": 4.6829514503479, + "rewards/rejected": -5.222592353820801, + "step": 370 + }, + { + "epoch": 2.1985185185185183, + "grad_norm": 16.77656741404385, + "learning_rate": 4.5444587097563166e-07, + "logits/chosen": -1.4021185636520386, + "logits/rejected": -1.355046033859253, + "logps/chosen": -41.689239501953125, + "logps/rejected": -54.89845657348633, + "loss": 0.1307, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5340372323989868, + "rewards/margins": 3.982806444168091, + "rewards/rejected": -4.516843795776367, + "step": 371 + }, + { + "epoch": 2.2044444444444444, + "grad_norm": 11.063737609204612, + "learning_rate": 4.540713043558677e-07, + "logits/chosen": -1.2235300540924072, + "logits/rejected": -1.3220264911651611, + "logps/chosen": -47.1357421875, + "logps/rejected": -69.21366882324219, + "loss": 0.0872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04345780611038208, + "rewards/margins": 5.431947708129883, + "rewards/rejected": -5.388490676879883, + "step": 372 + }, + { + "epoch": 2.2103703703703705, + "grad_norm": 11.275739717839482, + "learning_rate": 4.536953598007607e-07, + "logits/chosen": -1.437028169631958, + "logits/rejected": -1.5387985706329346, + "logps/chosen": -53.654788970947266, + "logps/rejected": -53.855712890625, + "loss": 0.0746, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4302152693271637, + "rewards/margins": 4.380431175231934, + "rewards/rejected": -4.8106465339660645, + "step": 373 + }, + { + "epoch": 2.216296296296296, + "grad_norm": 10.838669499719408, + "learning_rate": 4.533180398487726e-07, + "logits/chosen": -1.4005894660949707, + "logits/rejected": -1.527348279953003, + "logps/chosen": -53.5523681640625, + "logps/rejected": -60.16189956665039, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6702936887741089, + "rewards/margins": 5.403100967407227, + "rewards/rejected": -6.073394775390625, + "step": 374 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 11.602629913479493, + "learning_rate": 4.529393470476528e-07, + "logits/chosen": -1.31447172164917, + "logits/rejected": -1.3682016134262085, + "logps/chosen": -38.680519104003906, + "logps/rejected": -41.53880310058594, + "loss": 0.0792, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2111378014087677, + "rewards/margins": 3.4360599517822266, + "rewards/rejected": -3.6471974849700928, + "step": 375 + }, + { + "epoch": 2.228148148148148, + "grad_norm": 13.311358584472384, + "learning_rate": 4.525592839544202e-07, + "logits/chosen": -1.437548041343689, + "logits/rejected": -1.2081105709075928, + "logps/chosen": -34.22154235839844, + "logps/rejected": -64.00574493408203, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.184605598449707, + "rewards/margins": 4.785362720489502, + "rewards/rejected": -5.969968795776367, + "step": 376 + }, + { + "epoch": 2.234074074074074, + "grad_norm": 10.754664912428062, + "learning_rate": 4.521778531353462e-07, + "logits/chosen": -1.645902156829834, + "logits/rejected": -1.533874750137329, + "logps/chosen": -41.23374938964844, + "logps/rejected": -60.57324981689453, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9864581227302551, + "rewards/margins": 4.8595967292785645, + "rewards/rejected": -5.846055030822754, + "step": 377 + }, + { + "epoch": 2.24, + "grad_norm": 10.277846832273466, + "learning_rate": 4.517950571659376e-07, + "logits/chosen": -1.3001539707183838, + "logits/rejected": -1.2223761081695557, + "logps/chosen": -31.6280574798584, + "logps/rejected": -57.09972381591797, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4209780693054199, + "rewards/margins": 4.4373779296875, + "rewards/rejected": -4.858355522155762, + "step": 378 + }, + { + "epoch": 2.245925925925926, + "grad_norm": 11.884820901078847, + "learning_rate": 4.5141089863091876e-07, + "logits/chosen": -1.5289418697357178, + "logits/rejected": -1.4325823783874512, + "logps/chosen": -38.5244140625, + "logps/rejected": -60.34238052368164, + "loss": 0.0692, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4513339400291443, + "rewards/margins": 4.745321273803711, + "rewards/rejected": -5.1966552734375, + "step": 379 + }, + { + "epoch": 2.251851851851852, + "grad_norm": 11.294048882773792, + "learning_rate": 4.5102538012421463e-07, + "logits/chosen": -1.715264081954956, + "logits/rejected": -1.6054749488830566, + "logps/chosen": -31.455442428588867, + "logps/rejected": -51.70906066894531, + "loss": 0.0782, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01587429642677307, + "rewards/margins": 4.770445823669434, + "rewards/rejected": -4.754571914672852, + "step": 380 + }, + { + "epoch": 2.2577777777777777, + "grad_norm": 11.72778067934763, + "learning_rate": 4.506385042489328e-07, + "logits/chosen": -1.2105664014816284, + "logits/rejected": -1.2829630374908447, + "logps/chosen": -41.66542434692383, + "logps/rejected": -58.2675895690918, + "loss": 0.094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.11428321897983551, + "rewards/margins": 4.219393730163574, + "rewards/rejected": -4.333676815032959, + "step": 381 + }, + { + "epoch": 2.2637037037037038, + "grad_norm": 11.88382896794105, + "learning_rate": 4.5025027361734613e-07, + "logits/chosen": -1.5602684020996094, + "logits/rejected": -1.506664514541626, + "logps/chosen": -32.05914306640625, + "logps/rejected": -60.42582702636719, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41543838381767273, + "rewards/margins": 5.331958293914795, + "rewards/rejected": -5.747396469116211, + "step": 382 + }, + { + "epoch": 2.2696296296296294, + "grad_norm": 12.79526415048381, + "learning_rate": 4.498606908508753e-07, + "logits/chosen": -1.3425393104553223, + "logits/rejected": -1.3030786514282227, + "logps/chosen": -34.17608642578125, + "logps/rejected": -55.656944274902344, + "loss": 0.0939, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24564027786254883, + "rewards/margins": 5.182816505432129, + "rewards/rejected": -5.428457260131836, + "step": 383 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 8.607081975884558, + "learning_rate": 4.4946975858007064e-07, + "logits/chosen": -1.390102505683899, + "logits/rejected": -1.2515252828598022, + "logps/chosen": -30.730792999267578, + "logps/rejected": -53.722434997558594, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39248955249786377, + "rewards/margins": 5.531357765197754, + "rewards/rejected": -5.923847675323486, + "step": 384 + }, + { + "epoch": 2.2814814814814817, + "grad_norm": 16.021871038921365, + "learning_rate": 4.4907747944459484e-07, + "logits/chosen": -1.4583913087844849, + "logits/rejected": -1.394193172454834, + "logps/chosen": -48.586524963378906, + "logps/rejected": -57.825904846191406, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9691762924194336, + "rewards/margins": 3.857024669647217, + "rewards/rejected": -4.82620096206665, + "step": 385 + }, + { + "epoch": 2.2874074074074073, + "grad_norm": 12.513747923528404, + "learning_rate": 4.486838560932048e-07, + "logits/chosen": -1.4857195615768433, + "logits/rejected": -1.4608405828475952, + "logps/chosen": -39.87062072753906, + "logps/rejected": -54.67584228515625, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5097848176956177, + "rewards/margins": 3.999152183532715, + "rewards/rejected": -4.508936882019043, + "step": 386 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 9.060948338933432, + "learning_rate": 4.4828889118373394e-07, + "logits/chosen": -1.5972692966461182, + "logits/rejected": -1.5197217464447021, + "logps/chosen": -46.44121551513672, + "logps/rejected": -63.70100402832031, + "loss": 0.0532, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3537145256996155, + "rewards/margins": 5.512208938598633, + "rewards/rejected": -5.865923881530762, + "step": 387 + }, + { + "epoch": 2.299259259259259, + "grad_norm": 12.21618451799351, + "learning_rate": 4.4789258738307413e-07, + "logits/chosen": -1.712825059890747, + "logits/rejected": -1.6520627737045288, + "logps/chosen": -37.02870178222656, + "logps/rejected": -57.479644775390625, + "loss": 0.1091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015223681926727295, + "rewards/margins": 4.26918888092041, + "rewards/rejected": -4.284412860870361, + "step": 388 + }, + { + "epoch": 2.3051851851851852, + "grad_norm": 11.212588488728503, + "learning_rate": 4.474949473671578e-07, + "logits/chosen": -1.6660000085830688, + "logits/rejected": -1.5088659524917603, + "logps/chosen": -34.143104553222656, + "logps/rejected": -53.65770721435547, + "loss": 0.0697, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29158514738082886, + "rewards/margins": 5.010606288909912, + "rewards/rejected": -4.719020843505859, + "step": 389 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 8.774018598386073, + "learning_rate": 4.4709597382093976e-07, + "logits/chosen": -1.3193117380142212, + "logits/rejected": -1.144613265991211, + "logps/chosen": -36.79685592651367, + "logps/rejected": -60.21685028076172, + "loss": 0.0582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01098334789276123, + "rewards/margins": 4.089751720428467, + "rewards/rejected": -4.100735187530518, + "step": 390 + }, + { + "epoch": 2.317037037037037, + "grad_norm": 12.868024033007977, + "learning_rate": 4.4669566943837916e-07, + "logits/chosen": -1.4795200824737549, + "logits/rejected": -1.2550170421600342, + "logps/chosen": -38.5240478515625, + "logps/rejected": -57.04312515258789, + "loss": 0.0741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6837342381477356, + "rewards/margins": 4.087418079376221, + "rewards/rejected": -4.771152496337891, + "step": 391 + }, + { + "epoch": 2.322962962962963, + "grad_norm": 9.209291618824878, + "learning_rate": 4.462940369224212e-07, + "logits/chosen": -1.875885009765625, + "logits/rejected": -1.897498607635498, + "logps/chosen": -39.49676513671875, + "logps/rejected": -59.790672302246094, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7308411598205566, + "rewards/margins": 5.174574375152588, + "rewards/rejected": -5.9054155349731445, + "step": 392 + }, + { + "epoch": 2.328888888888889, + "grad_norm": 9.72538897552064, + "learning_rate": 4.4589107898497885e-07, + "logits/chosen": -1.1584303379058838, + "logits/rejected": -1.0515251159667969, + "logps/chosen": -41.24639129638672, + "logps/rejected": -58.28783416748047, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7245338559150696, + "rewards/margins": 5.01975154876709, + "rewards/rejected": -5.744285583496094, + "step": 393 + }, + { + "epoch": 2.334814814814815, + "grad_norm": 10.129377936987572, + "learning_rate": 4.454867983469148e-07, + "logits/chosen": -1.7846736907958984, + "logits/rejected": -1.6749004125595093, + "logps/chosen": -38.921051025390625, + "logps/rejected": -52.41502380371094, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06997081637382507, + "rewards/margins": 3.646679639816284, + "rewards/rejected": -3.7166504859924316, + "step": 394 + }, + { + "epoch": 2.3407407407407406, + "grad_norm": 10.256827451169878, + "learning_rate": 4.4508119773802294e-07, + "logits/chosen": -1.655611515045166, + "logits/rejected": -1.486838936805725, + "logps/chosen": -31.4853515625, + "logps/rejected": -52.45492172241211, + "loss": 0.0781, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4950629770755768, + "rewards/margins": 4.72844123840332, + "rewards/rejected": -5.223504066467285, + "step": 395 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 14.641010278912901, + "learning_rate": 4.4467427989700967e-07, + "logits/chosen": -1.8286457061767578, + "logits/rejected": -1.8515902757644653, + "logps/chosen": -52.1026496887207, + "logps/rejected": -67.81857299804688, + "loss": 0.104, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.699891209602356, + "rewards/margins": 5.736815452575684, + "rewards/rejected": -6.43670654296875, + "step": 396 + }, + { + "epoch": 2.3525925925925923, + "grad_norm": 13.932979253909581, + "learning_rate": 4.442660475714758e-07, + "logits/chosen": -1.407354474067688, + "logits/rejected": -1.4280579090118408, + "logps/chosen": -48.2438850402832, + "logps/rejected": -62.31871032714844, + "loss": 0.0757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9111906290054321, + "rewards/margins": 4.790341377258301, + "rewards/rejected": -5.701531887054443, + "step": 397 + }, + { + "epoch": 2.3585185185185185, + "grad_norm": 6.215058288543995, + "learning_rate": 4.438565035178979e-07, + "logits/chosen": -1.617882490158081, + "logits/rejected": -1.507187008857727, + "logps/chosen": -39.37698745727539, + "logps/rejected": -54.16233825683594, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2572103440761566, + "rewards/margins": 4.302726745605469, + "rewards/rejected": -4.559937000274658, + "step": 398 + }, + { + "epoch": 2.3644444444444446, + "grad_norm": 9.899082388032094, + "learning_rate": 4.434456505016094e-07, + "logits/chosen": -1.5732052326202393, + "logits/rejected": -1.5536649227142334, + "logps/chosen": -32.2484130859375, + "logps/rejected": -49.28650665283203, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1696593165397644, + "rewards/margins": 3.8268446922302246, + "rewards/rejected": -3.996504306793213, + "step": 399 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 10.019608057983106, + "learning_rate": 4.430334912967823e-07, + "logits/chosen": -1.7139465808868408, + "logits/rejected": -1.4648141860961914, + "logps/chosen": -38.92392349243164, + "logps/rejected": -59.63561248779297, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45068129897117615, + "rewards/margins": 4.821527004241943, + "rewards/rejected": -5.2722086906433105, + "step": 400 + }, + { + "epoch": 2.3762962962962964, + "grad_norm": 6.146789861935184, + "learning_rate": 4.4262002868640826e-07, + "logits/chosen": -1.6599886417388916, + "logits/rejected": -1.843379259109497, + "logps/chosen": -53.537452697753906, + "logps/rejected": -63.20209503173828, + "loss": 0.0358, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8360182046890259, + "rewards/margins": 4.479837894439697, + "rewards/rejected": -5.315855979919434, + "step": 401 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 8.120322997880242, + "learning_rate": 4.422052654622799e-07, + "logits/chosen": -1.338700532913208, + "logits/rejected": -1.5274639129638672, + "logps/chosen": -46.13654327392578, + "logps/rejected": -60.73787307739258, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.356442928314209, + "rewards/margins": 5.699449062347412, + "rewards/rejected": -7.055891990661621, + "step": 402 + }, + { + "epoch": 2.388148148148148, + "grad_norm": 17.358175977582217, + "learning_rate": 4.417892044249716e-07, + "logits/chosen": -1.484145998954773, + "logits/rejected": -1.342088222503662, + "logps/chosen": -39.93052673339844, + "logps/rejected": -59.5439453125, + "loss": 0.116, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8260771036148071, + "rewards/margins": 4.9727325439453125, + "rewards/rejected": -5.79880952835083, + "step": 403 + }, + { + "epoch": 2.3940740740740742, + "grad_norm": 8.567071041959487, + "learning_rate": 4.4137184838382125e-07, + "logits/chosen": -1.9984219074249268, + "logits/rejected": -1.9365501403808594, + "logps/chosen": -46.297367095947266, + "logps/rejected": -62.80537414550781, + "loss": 0.0529, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19484835863113403, + "rewards/margins": 5.696597099304199, + "rewards/rejected": -5.891445636749268, + "step": 404 + }, + { + "epoch": 2.4, + "grad_norm": 9.265199389793064, + "learning_rate": 4.409532001569105e-07, + "logits/chosen": -1.2973332405090332, + "logits/rejected": -1.4657697677612305, + "logps/chosen": -37.72175598144531, + "logps/rejected": -57.419185638427734, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0432603359222412, + "rewards/margins": 5.144648551940918, + "rewards/rejected": -6.187908172607422, + "step": 405 + }, + { + "epoch": 2.405925925925926, + "grad_norm": 17.31160313347824, + "learning_rate": 4.405332625710465e-07, + "logits/chosen": -1.5048401355743408, + "logits/rejected": -1.5686776638031006, + "logps/chosen": -41.25992202758789, + "logps/rejected": -51.92019271850586, + "loss": 0.0932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7663825750350952, + "rewards/margins": 3.883291006088257, + "rewards/rejected": -4.6496734619140625, + "step": 406 + }, + { + "epoch": 2.4118518518518517, + "grad_norm": 13.886785554067659, + "learning_rate": 4.401120384617423e-07, + "logits/chosen": -1.3854293823242188, + "logits/rejected": -1.417116641998291, + "logps/chosen": -49.49497604370117, + "logps/rejected": -62.962364196777344, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0801547765731812, + "rewards/margins": 5.861600875854492, + "rewards/rejected": -6.941755294799805, + "step": 407 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 13.366585938571683, + "learning_rate": 4.396895306731977e-07, + "logits/chosen": -1.0745258331298828, + "logits/rejected": -1.209951400756836, + "logps/chosen": -42.42633819580078, + "logps/rejected": -50.970062255859375, + "loss": 0.0778, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09873640537261963, + "rewards/margins": 4.979592800140381, + "rewards/rejected": -5.078329086303711, + "step": 408 + }, + { + "epoch": 2.423703703703704, + "grad_norm": 10.608096407712143, + "learning_rate": 4.3926574205828037e-07, + "logits/chosen": -1.454784631729126, + "logits/rejected": -1.3318755626678467, + "logps/chosen": -28.561241149902344, + "logps/rejected": -51.40495300292969, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3801489770412445, + "rewards/margins": 4.299707412719727, + "rewards/rejected": -4.679856300354004, + "step": 409 + }, + { + "epoch": 2.4296296296296296, + "grad_norm": 10.042327426623421, + "learning_rate": 4.388406754785063e-07, + "logits/chosen": -1.4973795413970947, + "logits/rejected": -1.3662190437316895, + "logps/chosen": -34.50919723510742, + "logps/rejected": -56.294063568115234, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7284948825836182, + "rewards/margins": 4.622704029083252, + "rewards/rejected": -5.351198673248291, + "step": 410 + }, + { + "epoch": 2.4355555555555557, + "grad_norm": 12.535369572053135, + "learning_rate": 4.3841433380402073e-07, + "logits/chosen": -1.4382065534591675, + "logits/rejected": -1.2739676237106323, + "logps/chosen": -40.13460922241211, + "logps/rejected": -68.07673645019531, + "loss": 0.0749, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0914524495601654, + "rewards/margins": 5.701619625091553, + "rewards/rejected": -5.79307222366333, + "step": 411 + }, + { + "epoch": 2.4414814814814814, + "grad_norm": 9.584609693394986, + "learning_rate": 4.379867199135785e-07, + "logits/chosen": -1.039376974105835, + "logits/rejected": -0.6864200234413147, + "logps/chosen": -36.3159294128418, + "logps/rejected": -68.66885375976562, + "loss": 0.0641, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8507220149040222, + "rewards/margins": 6.0718865394592285, + "rewards/rejected": -6.922608375549316, + "step": 412 + }, + { + "epoch": 2.4474074074074075, + "grad_norm": 10.158194545720395, + "learning_rate": 4.375578366945246e-07, + "logits/chosen": -1.2954965829849243, + "logits/rejected": -1.2651395797729492, + "logps/chosen": -41.173309326171875, + "logps/rejected": -54.60121154785156, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.059409737586975, + "rewards/margins": 4.310122966766357, + "rewards/rejected": -5.369532585144043, + "step": 413 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 15.06897747431, + "learning_rate": 4.3712768704277524e-07, + "logits/chosen": -1.5561192035675049, + "logits/rejected": -1.5117237567901611, + "logps/chosen": -35.58319091796875, + "logps/rejected": -50.53232192993164, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16081687808036804, + "rewards/margins": 4.436723709106445, + "rewards/rejected": -4.597539901733398, + "step": 414 + }, + { + "epoch": 2.4592592592592593, + "grad_norm": 16.494448887111883, + "learning_rate": 4.366962738627975e-07, + "logits/chosen": -1.8110413551330566, + "logits/rejected": -1.7763352394104004, + "logps/chosen": -27.89055633544922, + "logps/rejected": -52.024383544921875, + "loss": 0.098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5048554539680481, + "rewards/margins": 4.789825439453125, + "rewards/rejected": -5.294680595397949, + "step": 415 + }, + { + "epoch": 2.4651851851851854, + "grad_norm": 10.39957564348003, + "learning_rate": 4.3626360006759016e-07, + "logits/chosen": -1.3889925479888916, + "logits/rejected": -1.4343739748001099, + "logps/chosen": -44.50629806518555, + "logps/rejected": -59.46120071411133, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5878319144248962, + "rewards/margins": 5.08259391784668, + "rewards/rejected": -4.494762420654297, + "step": 416 + }, + { + "epoch": 2.471111111111111, + "grad_norm": 7.962756765530989, + "learning_rate": 4.3582966857866397e-07, + "logits/chosen": -1.805254340171814, + "logits/rejected": -1.5298397541046143, + "logps/chosen": -35.40469741821289, + "logps/rejected": -59.09782409667969, + "loss": 0.0498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.034172892570495605, + "rewards/margins": 5.298932075500488, + "rewards/rejected": -5.264759063720703, + "step": 417 + }, + { + "epoch": 2.477037037037037, + "grad_norm": 9.729500240098128, + "learning_rate": 4.353944823260221e-07, + "logits/chosen": -1.2123661041259766, + "logits/rejected": -1.0601003170013428, + "logps/chosen": -35.60749053955078, + "logps/rejected": -62.845333099365234, + "loss": 0.0706, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5825104713439941, + "rewards/margins": 4.653536796569824, + "rewards/rejected": -5.23604679107666, + "step": 418 + }, + { + "epoch": 2.482962962962963, + "grad_norm": 13.078064168370302, + "learning_rate": 4.3495804424813986e-07, + "logits/chosen": -1.7815532684326172, + "logits/rejected": -1.5699641704559326, + "logps/chosen": -36.4069938659668, + "logps/rejected": -56.897377014160156, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07872982323169708, + "rewards/margins": 4.7484540939331055, + "rewards/rejected": -4.669724464416504, + "step": 419 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 13.298324800536891, + "learning_rate": 4.3452035729194534e-07, + "logits/chosen": -1.675660490989685, + "logits/rejected": -1.5707687139511108, + "logps/chosen": -36.95539093017578, + "logps/rejected": -63.85231018066406, + "loss": 0.0895, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03855517506599426, + "rewards/margins": 6.0861358642578125, + "rewards/rejected": -6.047580718994141, + "step": 420 + }, + { + "epoch": 2.4948148148148146, + "grad_norm": 10.091782491568125, + "learning_rate": 4.340814244127993e-07, + "logits/chosen": -1.6767897605895996, + "logits/rejected": -1.5757229328155518, + "logps/chosen": -40.57682800292969, + "logps/rejected": -56.93830871582031, + "loss": 0.0705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6066794395446777, + "rewards/margins": 3.8079683780670166, + "rewards/rejected": -4.414648056030273, + "step": 421 + }, + { + "epoch": 2.5007407407407407, + "grad_norm": 7.613026313203136, + "learning_rate": 4.3364124857447525e-07, + "logits/chosen": -1.4799020290374756, + "logits/rejected": -1.1713473796844482, + "logps/chosen": -44.87807083129883, + "logps/rejected": -65.74991607666016, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8942358493804932, + "rewards/margins": 5.720493316650391, + "rewards/rejected": -6.614729404449463, + "step": 422 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 7.9345951349843435, + "learning_rate": 4.331998327491395e-07, + "logits/chosen": -1.092266321182251, + "logits/rejected": -1.1247848272323608, + "logps/chosen": -42.70750045776367, + "logps/rejected": -59.23059844970703, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8865154385566711, + "rewards/margins": 5.208459854125977, + "rewards/rejected": -6.094975471496582, + "step": 423 + }, + { + "epoch": 2.5125925925925925, + "grad_norm": 13.270734370115317, + "learning_rate": 4.3275717991733097e-07, + "logits/chosen": -2.118969440460205, + "logits/rejected": -1.9935420751571655, + "logps/chosen": -35.17277908325195, + "logps/rejected": -52.8382568359375, + "loss": 0.0795, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34134382009506226, + "rewards/margins": 3.8893234729766846, + "rewards/rejected": -4.2306671142578125, + "step": 424 + }, + { + "epoch": 2.5185185185185186, + "grad_norm": 10.153066540239442, + "learning_rate": 4.3231329306794106e-07, + "logits/chosen": -1.8174488544464111, + "logits/rejected": -1.8481026887893677, + "logps/chosen": -40.095062255859375, + "logps/rejected": -60.66204071044922, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.58811354637146, + "rewards/margins": 5.898916244506836, + "rewards/rejected": -6.487029552459717, + "step": 425 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 12.327594132627251, + "learning_rate": 4.3186817519819365e-07, + "logits/chosen": -1.3380463123321533, + "logits/rejected": -1.233199119567871, + "logps/chosen": -43.25425720214844, + "logps/rejected": -63.275611877441406, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8061537742614746, + "rewards/margins": 5.740635871887207, + "rewards/rejected": -6.54679012298584, + "step": 426 + }, + { + "epoch": 2.5303703703703704, + "grad_norm": 9.443105985819049, + "learning_rate": 4.314218293136247e-07, + "logits/chosen": -1.3254871368408203, + "logits/rejected": -1.3607250452041626, + "logps/chosen": -30.99878692626953, + "logps/rejected": -45.84544372558594, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09832948446273804, + "rewards/margins": 4.39568567276001, + "rewards/rejected": -4.494015216827393, + "step": 427 + }, + { + "epoch": 2.536296296296296, + "grad_norm": 15.015344786901764, + "learning_rate": 4.30974258428062e-07, + "logits/chosen": -1.9797377586364746, + "logits/rejected": -2.0408339500427246, + "logps/chosen": -47.02696990966797, + "logps/rejected": -50.319114685058594, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1645985841751099, + "rewards/margins": 3.7737340927124023, + "rewards/rejected": -4.938333034515381, + "step": 428 + }, + { + "epoch": 2.542222222222222, + "grad_norm": 9.349974145456377, + "learning_rate": 4.3052546556360486e-07, + "logits/chosen": -1.687772512435913, + "logits/rejected": -1.6783831119537354, + "logps/chosen": -32.642303466796875, + "logps/rejected": -47.195770263671875, + "loss": 0.0678, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3790569603443146, + "rewards/margins": 3.810107469558716, + "rewards/rejected": -4.189164638519287, + "step": 429 + }, + { + "epoch": 2.5481481481481483, + "grad_norm": 7.215406846905479, + "learning_rate": 4.300754537506036e-07, + "logits/chosen": -1.4609354734420776, + "logits/rejected": -1.5548583269119263, + "logps/chosen": -41.346309661865234, + "logps/rejected": -52.50380325317383, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40993720293045044, + "rewards/margins": 5.032467842102051, + "rewards/rejected": -5.442404747009277, + "step": 430 + }, + { + "epoch": 2.554074074074074, + "grad_norm": 10.222814355312524, + "learning_rate": 4.2962422602763925e-07, + "logits/chosen": -1.1078249216079712, + "logits/rejected": -0.9215734004974365, + "logps/chosen": -33.152610778808594, + "logps/rejected": -57.324745178222656, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7816075086593628, + "rewards/margins": 4.221255302429199, + "rewards/rejected": -5.002862453460693, + "step": 431 + }, + { + "epoch": 2.56, + "grad_norm": 18.265248426638163, + "learning_rate": 4.2917178544150284e-07, + "logits/chosen": -1.9573190212249756, + "logits/rejected": -1.8439496755599976, + "logps/chosen": -36.08943176269531, + "logps/rejected": -57.182159423828125, + "loss": 0.0873, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.192732572555542, + "rewards/margins": 4.874904632568359, + "rewards/rejected": -6.0676374435424805, + "step": 432 + }, + { + "epoch": 2.565925925925926, + "grad_norm": 10.235583078032512, + "learning_rate": 4.2871813504717497e-07, + "logits/chosen": -1.5011374950408936, + "logits/rejected": -1.2769317626953125, + "logps/chosen": -39.9639892578125, + "logps/rejected": -61.80725860595703, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6877691745758057, + "rewards/margins": 5.699342727661133, + "rewards/rejected": -6.387112140655518, + "step": 433 + }, + { + "epoch": 2.571851851851852, + "grad_norm": 9.62877392648817, + "learning_rate": 4.2826327790780505e-07, + "logits/chosen": -1.1425065994262695, + "logits/rejected": -1.1360677480697632, + "logps/chosen": -43.025978088378906, + "logps/rejected": -61.36090087890625, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4637770354747772, + "rewards/margins": 5.920599460601807, + "rewards/rejected": -6.384376525878906, + "step": 434 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 9.890311360978767, + "learning_rate": 4.278072170946909e-07, + "logits/chosen": -1.4360768795013428, + "logits/rejected": -1.4959397315979004, + "logps/chosen": -46.09064483642578, + "logps/rejected": -56.41389465332031, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.707612931728363, + "rewards/margins": 5.068720817565918, + "rewards/rejected": -5.776333332061768, + "step": 435 + }, + { + "epoch": 2.5837037037037036, + "grad_norm": 10.392170104491372, + "learning_rate": 4.273499556872576e-07, + "logits/chosen": -2.2950620651245117, + "logits/rejected": -2.092170238494873, + "logps/chosen": -36.24480438232422, + "logps/rejected": -64.73004150390625, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4665120542049408, + "rewards/margins": 5.585385322570801, + "rewards/rejected": -6.051897048950195, + "step": 436 + }, + { + "epoch": 2.5896296296296297, + "grad_norm": 10.12765358633514, + "learning_rate": 4.2689149677303716e-07, + "logits/chosen": -1.0707876682281494, + "logits/rejected": -1.1080553531646729, + "logps/chosen": -43.415000915527344, + "logps/rejected": -51.25993347167969, + "loss": 0.0572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4414452314376831, + "rewards/margins": 4.333120346069336, + "rewards/rejected": -4.774565696716309, + "step": 437 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 11.403233126394678, + "learning_rate": 4.264318434476472e-07, + "logits/chosen": -0.9671777486801147, + "logits/rejected": -1.0228941440582275, + "logps/chosen": -43.18418884277344, + "logps/rejected": -61.22423553466797, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.750145435333252, + "rewards/margins": 5.196172714233398, + "rewards/rejected": -5.94631814956665, + "step": 438 + }, + { + "epoch": 2.6014814814814815, + "grad_norm": 23.740794064578566, + "learning_rate": 4.2597099881477017e-07, + "logits/chosen": -1.4594378471374512, + "logits/rejected": -1.2035342454910278, + "logps/chosen": -35.04090118408203, + "logps/rejected": -48.212501525878906, + "loss": 0.1412, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5435713529586792, + "rewards/margins": 2.9230170249938965, + "rewards/rejected": -3.4665884971618652, + "step": 439 + }, + { + "epoch": 2.6074074074074076, + "grad_norm": 7.22571057177582, + "learning_rate": 4.2550896598613297e-07, + "logits/chosen": -1.6149311065673828, + "logits/rejected": -1.366624355316162, + "logps/chosen": -26.589153289794922, + "logps/rejected": -60.616371154785156, + "loss": 0.0451, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10370810329914093, + "rewards/margins": 5.520005226135254, + "rewards/rejected": -5.41629695892334, + "step": 440 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 8.984715683706641, + "learning_rate": 4.25045748081485e-07, + "logits/chosen": -1.4854509830474854, + "logits/rejected": -1.4200009107589722, + "logps/chosen": -34.8680305480957, + "logps/rejected": -62.0745735168457, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.705161452293396, + "rewards/margins": 5.211274147033691, + "rewards/rejected": -5.916436195373535, + "step": 441 + }, + { + "epoch": 2.6192592592592594, + "grad_norm": 9.126681254113208, + "learning_rate": 4.2458134822857774e-07, + "logits/chosen": -1.644370198249817, + "logits/rejected": -1.582041621208191, + "logps/chosen": -36.514915466308594, + "logps/rejected": -67.11909484863281, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7975553870201111, + "rewards/margins": 6.119021892547607, + "rewards/rejected": -6.916577339172363, + "step": 442 + }, + { + "epoch": 2.625185185185185, + "grad_norm": 13.885847569564467, + "learning_rate": 4.241157695631435e-07, + "logits/chosen": -1.5854181051254272, + "logits/rejected": -1.6152215003967285, + "logps/chosen": -38.029273986816406, + "logps/rejected": -62.7104377746582, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16970673203468323, + "rewards/margins": 6.10245943069458, + "rewards/rejected": -6.2721662521362305, + "step": 443 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 10.006195702980698, + "learning_rate": 4.2364901522887416e-07, + "logits/chosen": -1.1892244815826416, + "logits/rejected": -1.2867357730865479, + "logps/chosen": -36.74848937988281, + "logps/rejected": -57.911075592041016, + "loss": 0.0579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27545565366744995, + "rewards/margins": 5.799798488616943, + "rewards/rejected": -6.075253963470459, + "step": 444 + }, + { + "epoch": 2.637037037037037, + "grad_norm": 17.203886073678117, + "learning_rate": 4.2318108837739986e-07, + "logits/chosen": -1.2790842056274414, + "logits/rejected": -1.226332664489746, + "logps/chosen": -36.43606948852539, + "logps/rejected": -49.779937744140625, + "loss": 0.0892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7447434663772583, + "rewards/margins": 4.716348648071289, + "rewards/rejected": -5.461092948913574, + "step": 445 + }, + { + "epoch": 2.642962962962963, + "grad_norm": 10.069009996513447, + "learning_rate": 4.22711992168268e-07, + "logits/chosen": -1.6787967681884766, + "logits/rejected": -1.6690788269042969, + "logps/chosen": -44.772674560546875, + "logps/rejected": -56.2176399230957, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09006957709789276, + "rewards/margins": 4.575125694274902, + "rewards/rejected": -4.665195465087891, + "step": 446 + }, + { + "epoch": 2.648888888888889, + "grad_norm": 11.658795202407136, + "learning_rate": 4.2224172976892166e-07, + "logits/chosen": -1.3170833587646484, + "logits/rejected": -1.157299518585205, + "logps/chosen": -47.767242431640625, + "logps/rejected": -72.54170227050781, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17752492427825928, + "rewards/margins": 6.256413459777832, + "rewards/rejected": -6.433938980102539, + "step": 447 + }, + { + "epoch": 2.6548148148148147, + "grad_norm": 5.940086524497116, + "learning_rate": 4.217703043546783e-07, + "logits/chosen": -1.2264058589935303, + "logits/rejected": -1.195594310760498, + "logps/chosen": -43.642913818359375, + "logps/rejected": -60.73713684082031, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.053247094154358, + "rewards/margins": 6.064220428466797, + "rewards/rejected": -7.117467880249023, + "step": 448 + }, + { + "epoch": 2.660740740740741, + "grad_norm": 7.247831097507863, + "learning_rate": 4.2129771910870845e-07, + "logits/chosen": -1.4685695171356201, + "logits/rejected": -1.2932255268096924, + "logps/chosen": -39.4880256652832, + "logps/rejected": -72.38164520263672, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.781300961971283, + "rewards/margins": 6.027366638183594, + "rewards/rejected": -6.80866813659668, + "step": 449 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 8.142661646420805, + "learning_rate": 4.2082397722201385e-07, + "logits/chosen": -1.6290993690490723, + "logits/rejected": -1.3875850439071655, + "logps/chosen": -27.86062240600586, + "logps/rejected": -60.022274017333984, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38442179560661316, + "rewards/margins": 5.5896830558776855, + "rewards/rejected": -5.974104881286621, + "step": 450 + }, + { + "epoch": 2.6725925925925926, + "grad_norm": 12.094207410074398, + "learning_rate": 4.2034908189340634e-07, + "logits/chosen": -1.3270450830459595, + "logits/rejected": -1.184122920036316, + "logps/chosen": -36.17488479614258, + "logps/rejected": -62.60739517211914, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7593768239021301, + "rewards/margins": 5.755254745483398, + "rewards/rejected": -6.514631748199463, + "step": 451 + }, + { + "epoch": 2.6785185185185183, + "grad_norm": 10.502777697396722, + "learning_rate": 4.19873036329486e-07, + "logits/chosen": -1.6183356046676636, + "logits/rejected": -1.8080652952194214, + "logps/chosen": -39.19137954711914, + "logps/rejected": -61.41072082519531, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3307141363620758, + "rewards/margins": 5.200984954833984, + "rewards/rejected": -5.531699180603027, + "step": 452 + }, + { + "epoch": 2.6844444444444444, + "grad_norm": 5.2366059114266035, + "learning_rate": 4.1939584374461943e-07, + "logits/chosen": -1.7235251665115356, + "logits/rejected": -1.756578803062439, + "logps/chosen": -33.519996643066406, + "logps/rejected": -48.20545959472656, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2523934543132782, + "rewards/margins": 4.641357421875, + "rewards/rejected": -4.3889641761779785, + "step": 453 + }, + { + "epoch": 2.6903703703703705, + "grad_norm": 6.793456123885119, + "learning_rate": 4.189175073609184e-07, + "logits/chosen": -1.768607258796692, + "logits/rejected": -1.6957839727401733, + "logps/chosen": -42.76543426513672, + "logps/rejected": -55.93104934692383, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22528111934661865, + "rewards/margins": 4.591314792633057, + "rewards/rejected": -4.366034030914307, + "step": 454 + }, + { + "epoch": 2.696296296296296, + "grad_norm": 10.416935814267305, + "learning_rate": 4.184380304082177e-07, + "logits/chosen": -1.3785486221313477, + "logits/rejected": -1.3025366067886353, + "logps/chosen": -37.827213287353516, + "logps/rejected": -52.29517364501953, + "loss": 0.0686, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8132541179656982, + "rewards/margins": 4.752249717712402, + "rewards/rejected": -5.5655035972595215, + "step": 455 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 13.940426481316408, + "learning_rate": 4.179574161240536e-07, + "logits/chosen": -2.09922456741333, + "logits/rejected": -1.9302072525024414, + "logps/chosen": -30.62826156616211, + "logps/rejected": -48.425559997558594, + "loss": 0.0924, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37107735872268677, + "rewards/margins": 3.9561352729797363, + "rewards/rejected": -4.327212333679199, + "step": 456 + }, + { + "epoch": 2.7081481481481484, + "grad_norm": 14.616095915339471, + "learning_rate": 4.1747566775364175e-07, + "logits/chosen": -1.7652040719985962, + "logits/rejected": -1.5786316394805908, + "logps/chosen": -28.41473388671875, + "logps/rejected": -60.02101135253906, + "loss": 0.0874, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0990707278251648, + "rewards/margins": 5.42191219329834, + "rewards/rejected": -5.5209832191467285, + "step": 457 + }, + { + "epoch": 2.714074074074074, + "grad_norm": 11.634526933904187, + "learning_rate": 4.169927885498556e-07, + "logits/chosen": -1.509355068206787, + "logits/rejected": -1.5713496208190918, + "logps/chosen": -44.71356964111328, + "logps/rejected": -59.13399124145508, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6387603878974915, + "rewards/margins": 6.12332820892334, + "rewards/rejected": -6.762088775634766, + "step": 458 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 12.6955399694262, + "learning_rate": 4.16508781773204e-07, + "logits/chosen": -1.034393548965454, + "logits/rejected": -1.0755641460418701, + "logps/chosen": -50.58789825439453, + "logps/rejected": -65.17190551757812, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7241175174713135, + "rewards/margins": 6.459036350250244, + "rewards/rejected": -7.183154106140137, + "step": 459 + }, + { + "epoch": 2.725925925925926, + "grad_norm": 10.566702546539224, + "learning_rate": 4.1602365069180976e-07, + "logits/chosen": -1.2953882217407227, + "logits/rejected": -1.4143340587615967, + "logps/chosen": -48.79972839355469, + "logps/rejected": -62.423553466796875, + "loss": 0.067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5391722917556763, + "rewards/margins": 5.307343482971191, + "rewards/rejected": -6.846515655517578, + "step": 460 + }, + { + "epoch": 2.731851851851852, + "grad_norm": 7.496698557060088, + "learning_rate": 4.155373985813868e-07, + "logits/chosen": -1.5388610363006592, + "logits/rejected": -1.4691227674484253, + "logps/chosen": -34.53229522705078, + "logps/rejected": -45.744293212890625, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47361868619918823, + "rewards/margins": 4.789101600646973, + "rewards/rejected": -5.262720584869385, + "step": 461 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 11.142058376362055, + "learning_rate": 4.150500287252189e-07, + "logits/chosen": -1.1989514827728271, + "logits/rejected": -1.2030577659606934, + "logps/chosen": -42.25239944458008, + "logps/rejected": -57.260189056396484, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7515776753425598, + "rewards/margins": 5.3145246505737305, + "rewards/rejected": -6.066102981567383, + "step": 462 + }, + { + "epoch": 2.7437037037037038, + "grad_norm": 11.051939576284227, + "learning_rate": 4.145615444141369e-07, + "logits/chosen": -1.109269618988037, + "logits/rejected": -1.1979848146438599, + "logps/chosen": -46.74151611328125, + "logps/rejected": -53.64583206176758, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.73493492603302, + "rewards/margins": 5.678466320037842, + "rewards/rejected": -6.4134016036987305, + "step": 463 + }, + { + "epoch": 2.74962962962963, + "grad_norm": 7.400402446401093, + "learning_rate": 4.1407194894649677e-07, + "logits/chosen": -2.147449493408203, + "logits/rejected": -2.0689783096313477, + "logps/chosen": -37.51029968261719, + "logps/rejected": -69.01571655273438, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8444314002990723, + "rewards/margins": 6.505341529846191, + "rewards/rejected": -7.349773406982422, + "step": 464 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 17.234630076573417, + "learning_rate": 4.135812456281571e-07, + "logits/chosen": -1.9983713626861572, + "logits/rejected": -1.5781863927841187, + "logps/chosen": -47.452430725097656, + "logps/rejected": -84.41545104980469, + "loss": 0.0975, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7086008787155151, + "rewards/margins": 5.433979034423828, + "rewards/rejected": -7.142579555511475, + "step": 465 + }, + { + "epoch": 2.7614814814814816, + "grad_norm": 11.722950079081206, + "learning_rate": 4.1308943777245717e-07, + "logits/chosen": -1.4446229934692383, + "logits/rejected": -1.5490970611572266, + "logps/chosen": -33.173927307128906, + "logps/rejected": -53.479957580566406, + "loss": 0.075, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5102207660675049, + "rewards/margins": 6.010803699493408, + "rewards/rejected": -6.521024703979492, + "step": 466 + }, + { + "epoch": 2.7674074074074073, + "grad_norm": 13.810856571727623, + "learning_rate": 4.1259652870019426e-07, + "logits/chosen": -1.1215626001358032, + "logits/rejected": -1.2170990705490112, + "logps/chosen": -44.314571380615234, + "logps/rejected": -55.65526580810547, + "loss": 0.0819, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8959226012229919, + "rewards/margins": 5.7853522300720215, + "rewards/rejected": -6.681274890899658, + "step": 467 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 7.703802667245408, + "learning_rate": 4.121025217396011e-07, + "logits/chosen": -0.8404613733291626, + "logits/rejected": -0.8951901197433472, + "logps/chosen": -39.737998962402344, + "logps/rejected": -52.829063415527344, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0370070934295654, + "rewards/margins": 5.2985029220581055, + "rewards/rejected": -6.33551025390625, + "step": 468 + }, + { + "epoch": 2.779259259259259, + "grad_norm": 11.663820751466071, + "learning_rate": 4.1160742022632395e-07, + "logits/chosen": -1.3555892705917358, + "logits/rejected": -1.2612375020980835, + "logps/chosen": -36.32597732543945, + "logps/rejected": -56.69281005859375, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7630610466003418, + "rewards/margins": 4.628640651702881, + "rewards/rejected": -5.391702175140381, + "step": 469 + }, + { + "epoch": 2.785185185185185, + "grad_norm": 5.80144294575508, + "learning_rate": 4.1111122750339945e-07, + "logits/chosen": -1.330127239227295, + "logits/rejected": -1.1901962757110596, + "logps/chosen": -41.21522903442383, + "logps/rejected": -63.96897888183594, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2805338501930237, + "rewards/margins": 7.6646647453308105, + "rewards/rejected": -7.945198059082031, + "step": 470 + }, + { + "epoch": 2.7911111111111113, + "grad_norm": 9.363333254370783, + "learning_rate": 4.106139469212326e-07, + "logits/chosen": -1.337806224822998, + "logits/rejected": -1.2989773750305176, + "logps/chosen": -46.482181549072266, + "logps/rejected": -63.64010238647461, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9635290503501892, + "rewards/margins": 5.768977165222168, + "rewards/rejected": -6.73250675201416, + "step": 471 + }, + { + "epoch": 2.797037037037037, + "grad_norm": 14.879444811599774, + "learning_rate": 4.1011558183757374e-07, + "logits/chosen": -1.7458575963974, + "logits/rejected": -1.6218409538269043, + "logps/chosen": -30.842025756835938, + "logps/rejected": -56.45557403564453, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7571108341217041, + "rewards/margins": 5.208766937255859, + "rewards/rejected": -5.965878486633301, + "step": 472 + }, + { + "epoch": 2.802962962962963, + "grad_norm": 12.457885607362718, + "learning_rate": 4.0961613561749585e-07, + "logits/chosen": -2.1387579441070557, + "logits/rejected": -1.9743508100509644, + "logps/chosen": -45.234222412109375, + "logps/rejected": -69.26448059082031, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0624377727508545, + "rewards/margins": 5.491796016693115, + "rewards/rejected": -6.554234027862549, + "step": 473 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 11.785711167501036, + "learning_rate": 4.091156116333723e-07, + "logits/chosen": -1.527976632118225, + "logits/rejected": -1.4141886234283447, + "logps/chosen": -45.122650146484375, + "logps/rejected": -62.66756820678711, + "loss": 0.0607, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8391146659851074, + "rewards/margins": 4.945567607879639, + "rewards/rejected": -6.784682273864746, + "step": 474 + }, + { + "epoch": 2.814814814814815, + "grad_norm": 6.438450018764331, + "learning_rate": 4.086140132648534e-07, + "logits/chosen": -1.6443370580673218, + "logits/rejected": -1.5842254161834717, + "logps/chosen": -40.13809585571289, + "logps/rejected": -73.64363098144531, + "loss": 0.0382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7049580812454224, + "rewards/margins": 6.288333415985107, + "rewards/rejected": -6.99329137802124, + "step": 475 + }, + { + "epoch": 2.8207407407407405, + "grad_norm": 10.33277822610733, + "learning_rate": 4.081113438988443e-07, + "logits/chosen": -1.9966257810592651, + "logits/rejected": -1.940199851989746, + "logps/chosen": -40.505191802978516, + "logps/rejected": -59.13007736206055, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38005155324935913, + "rewards/margins": 5.394841194152832, + "rewards/rejected": -5.774892807006836, + "step": 476 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 9.375606930837032, + "learning_rate": 4.076076069294816e-07, + "logits/chosen": -1.5683081150054932, + "logits/rejected": -1.3896409273147583, + "logps/chosen": -41.28034973144531, + "logps/rejected": -68.79712677001953, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7804399728775024, + "rewards/margins": 4.933218955993652, + "rewards/rejected": -5.713658809661865, + "step": 477 + }, + { + "epoch": 2.8325925925925928, + "grad_norm": 9.043547249851873, + "learning_rate": 4.071028057581105e-07, + "logits/chosen": -1.6767199039459229, + "logits/rejected": -1.9358806610107422, + "logps/chosen": -66.44622039794922, + "logps/rejected": -67.9832763671875, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0683860778808594, + "rewards/margins": 5.570001602172852, + "rewards/rejected": -7.638387680053711, + "step": 478 + }, + { + "epoch": 2.8385185185185184, + "grad_norm": 5.908586113770876, + "learning_rate": 4.065969437932622e-07, + "logits/chosen": -1.6558722257614136, + "logits/rejected": -1.7621960639953613, + "logps/chosen": -52.626182556152344, + "logps/rejected": -61.46979522705078, + "loss": 0.0469, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2955846786499023, + "rewards/margins": 4.890224933624268, + "rewards/rejected": -6.185809135437012, + "step": 479 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 12.445167440486212, + "learning_rate": 4.0609002445063036e-07, + "logits/chosen": -1.5896965265274048, + "logits/rejected": -1.4875233173370361, + "logps/chosen": -40.567039489746094, + "logps/rejected": -54.923274993896484, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5689337849617004, + "rewards/margins": 5.234133720397949, + "rewards/rejected": -5.803068161010742, + "step": 480 + }, + { + "epoch": 2.85037037037037, + "grad_norm": 12.247510070974629, + "learning_rate": 4.0558205115304846e-07, + "logits/chosen": -2.029125690460205, + "logits/rejected": -1.977358102798462, + "logps/chosen": -46.63209533691406, + "logps/rejected": -69.29682922363281, + "loss": 0.0648, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12471213936805725, + "rewards/margins": 5.987627983093262, + "rewards/rejected": -6.112339973449707, + "step": 481 + }, + { + "epoch": 2.8562962962962963, + "grad_norm": 10.08015012343844, + "learning_rate": 4.050730273304663e-07, + "logits/chosen": -1.6675320863723755, + "logits/rejected": -1.3868141174316406, + "logps/chosen": -39.18746566772461, + "logps/rejected": -65.97308349609375, + "loss": 0.072, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7474696636199951, + "rewards/margins": 6.0732421875, + "rewards/rejected": -6.820712089538574, + "step": 482 + }, + { + "epoch": 2.862222222222222, + "grad_norm": 6.181099423214191, + "learning_rate": 4.045629564199273e-07, + "logits/chosen": -2.1505188941955566, + "logits/rejected": -2.0988426208496094, + "logps/chosen": -49.55622863769531, + "logps/rejected": -69.73926544189453, + "loss": 0.0325, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3313867747783661, + "rewards/margins": 6.1561055183410645, + "rewards/rejected": -6.487491607666016, + "step": 483 + }, + { + "epoch": 2.868148148148148, + "grad_norm": 13.024076592466916, + "learning_rate": 4.04051841865545e-07, + "logits/chosen": -1.7880933284759521, + "logits/rejected": -2.0355589389801025, + "logps/chosen": -45.80509948730469, + "logps/rejected": -41.591346740722656, + "loss": 0.0756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5180366039276123, + "rewards/margins": 3.310197114944458, + "rewards/rejected": -3.8282337188720703, + "step": 484 + }, + { + "epoch": 2.8740740740740742, + "grad_norm": 5.834739744406833, + "learning_rate": 4.0353968711847974e-07, + "logits/chosen": -1.5249016284942627, + "logits/rejected": -1.4942580461502075, + "logps/chosen": -47.616737365722656, + "logps/rejected": -65.72685241699219, + "loss": 0.0395, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.47097060084342957, + "rewards/margins": 5.679997444152832, + "rewards/rejected": -6.150968074798584, + "step": 485 + }, + { + "epoch": 2.88, + "grad_norm": 13.420831403558674, + "learning_rate": 4.030264956369157e-07, + "logits/chosen": -1.1555256843566895, + "logits/rejected": -1.1296708583831787, + "logps/chosen": -50.24407958984375, + "logps/rejected": -61.94266891479492, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4866521954536438, + "rewards/margins": 5.132786750793457, + "rewards/rejected": -5.619439125061035, + "step": 486 + }, + { + "epoch": 2.885925925925926, + "grad_norm": 10.70741309743604, + "learning_rate": 4.02512270886037e-07, + "logits/chosen": -1.7210756540298462, + "logits/rejected": -1.7596194744110107, + "logps/chosen": -52.329383850097656, + "logps/rejected": -49.94835662841797, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6579025983810425, + "rewards/margins": 4.657275199890137, + "rewards/rejected": -5.315177917480469, + "step": 487 + }, + { + "epoch": 2.891851851851852, + "grad_norm": 11.033154443356473, + "learning_rate": 4.01997016338005e-07, + "logits/chosen": -1.3694206476211548, + "logits/rejected": -1.2944746017456055, + "logps/chosen": -42.40999221801758, + "logps/rejected": -62.894813537597656, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0166794061660767, + "rewards/margins": 6.320663928985596, + "rewards/rejected": -7.337343215942383, + "step": 488 + }, + { + "epoch": 2.897777777777778, + "grad_norm": 13.768219520367577, + "learning_rate": 4.014807354719342e-07, + "logits/chosen": -1.243717908859253, + "logits/rejected": -1.4224070310592651, + "logps/chosen": -41.15166091918945, + "logps/rejected": -46.17465591430664, + "loss": 0.07, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2711668610572815, + "rewards/margins": 5.416727066040039, + "rewards/rejected": -5.687893867492676, + "step": 489 + }, + { + "epoch": 2.9037037037037035, + "grad_norm": 12.055706759326277, + "learning_rate": 4.00963431773869e-07, + "logits/chosen": -1.600014567375183, + "logits/rejected": -1.4893999099731445, + "logps/chosen": -37.52910232543945, + "logps/rejected": -56.09258270263672, + "loss": 0.0817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5928289890289307, + "rewards/margins": 4.855260848999023, + "rewards/rejected": -5.448089599609375, + "step": 490 + }, + { + "epoch": 2.9096296296296296, + "grad_norm": 6.6663303072591695, + "learning_rate": 4.0044510873676043e-07, + "logits/chosen": -1.4263795614242554, + "logits/rejected": -1.3732199668884277, + "logps/chosen": -47.48921203613281, + "logps/rejected": -64.12741088867188, + "loss": 0.036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6781390905380249, + "rewards/margins": 5.753025054931641, + "rewards/rejected": -6.431163787841797, + "step": 491 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 9.203786828995497, + "learning_rate": 3.9992576986044223e-07, + "logits/chosen": -1.6113818883895874, + "logits/rejected": -1.4018325805664062, + "logps/chosen": -42.585716247558594, + "logps/rejected": -72.791259765625, + "loss": 0.0424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0629472732543945, + "rewards/margins": 6.943647384643555, + "rewards/rejected": -8.00659465789795, + "step": 492 + }, + { + "epoch": 2.9214814814814813, + "grad_norm": 13.268694442559015, + "learning_rate": 3.9940541865160726e-07, + "logits/chosen": -1.9327130317687988, + "logits/rejected": -1.9189391136169434, + "logps/chosen": -44.672420501708984, + "logps/rejected": -55.77782440185547, + "loss": 0.0619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5316133499145508, + "rewards/margins": 4.759648323059082, + "rewards/rejected": -5.291261672973633, + "step": 493 + }, + { + "epoch": 2.9274074074074075, + "grad_norm": 8.631688599659915, + "learning_rate": 3.9888405862378395e-07, + "logits/chosen": -1.689211368560791, + "logits/rejected": -1.6311895847320557, + "logps/chosen": -50.45976257324219, + "logps/rejected": -59.6866569519043, + "loss": 0.0428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5894455909729004, + "rewards/margins": 5.608089923858643, + "rewards/rejected": -6.197535514831543, + "step": 494 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 14.665098491526825, + "learning_rate": 3.983616932973124e-07, + "logits/chosen": -1.5946894884109497, + "logits/rejected": -1.6317996978759766, + "logps/chosen": -38.38591766357422, + "logps/rejected": -51.410438537597656, + "loss": 0.0811, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6509155035018921, + "rewards/margins": 4.341098785400391, + "rewards/rejected": -4.992014408111572, + "step": 495 + }, + { + "epoch": 2.9392592592592592, + "grad_norm": 9.56403557410635, + "learning_rate": 3.9783832619932076e-07, + "logits/chosen": -1.5779447555541992, + "logits/rejected": -1.5724250078201294, + "logps/chosen": -38.29379653930664, + "logps/rejected": -56.06721496582031, + "loss": 0.0543, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6153882741928101, + "rewards/margins": 5.206683158874512, + "rewards/rejected": -5.822071552276611, + "step": 496 + }, + { + "epoch": 2.9451851851851854, + "grad_norm": 8.708843532666638, + "learning_rate": 3.973139608637015e-07, + "logits/chosen": -1.666991114616394, + "logits/rejected": -1.704276204109192, + "logps/chosen": -44.54426574707031, + "logps/rejected": -64.68223571777344, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8914271593093872, + "rewards/margins": 5.089811325073242, + "rewards/rejected": -5.98123836517334, + "step": 497 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 9.740568197362656, + "learning_rate": 3.9678860083108713e-07, + "logits/chosen": -1.3851922750473022, + "logits/rejected": -1.2133703231811523, + "logps/chosen": -35.292724609375, + "logps/rejected": -56.05168151855469, + "loss": 0.0676, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12854641675949097, + "rewards/margins": 4.927215576171875, + "rewards/rejected": -5.05576229095459, + "step": 498 + }, + { + "epoch": 2.957037037037037, + "grad_norm": 6.6252979700182335, + "learning_rate": 3.9626224964882685e-07, + "logits/chosen": -1.1749128103256226, + "logits/rejected": -1.0022330284118652, + "logps/chosen": -37.999114990234375, + "logps/rejected": -53.09870910644531, + "loss": 0.0368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06034252047538757, + "rewards/margins": 5.425458908081055, + "rewards/rejected": -5.365116596221924, + "step": 499 + }, + { + "epoch": 2.962962962962963, + "grad_norm": 9.100591870121587, + "learning_rate": 3.957349108709623e-07, + "logits/chosen": -1.2385791540145874, + "logits/rejected": -1.1235514879226685, + "logps/chosen": -41.54240417480469, + "logps/rejected": -57.841407775878906, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4359692931175232, + "rewards/margins": 4.188076019287109, + "rewards/rejected": -4.624044895172119, + "step": 500 + }, + { + "epoch": 2.968888888888889, + "grad_norm": 8.00234767245306, + "learning_rate": 3.9520658805820335e-07, + "logits/chosen": -1.8589203357696533, + "logits/rejected": -1.9100127220153809, + "logps/chosen": -47.3678092956543, + "logps/rejected": -68.48341369628906, + "loss": 0.0394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45701107382774353, + "rewards/margins": 7.444866180419922, + "rewards/rejected": -7.901876449584961, + "step": 501 + }, + { + "epoch": 2.974814814814815, + "grad_norm": 10.099174335269522, + "learning_rate": 3.946772847779045e-07, + "logits/chosen": -1.275445580482483, + "logits/rejected": -1.436569333076477, + "logps/chosen": -39.363243103027344, + "logps/rejected": -48.795955657958984, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5841267704963684, + "rewards/margins": 5.275768280029297, + "rewards/rejected": -5.859894752502441, + "step": 502 + }, + { + "epoch": 2.9807407407407407, + "grad_norm": 8.064642083628808, + "learning_rate": 3.941470046040406e-07, + "logits/chosen": -1.7252798080444336, + "logits/rejected": -1.6667922735214233, + "logps/chosen": -45.00448226928711, + "logps/rejected": -53.28717803955078, + "loss": 0.0483, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4511069059371948, + "rewards/margins": 5.3886237144470215, + "rewards/rejected": -5.839731216430664, + "step": 503 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 9.662113041163051, + "learning_rate": 3.936157511171826e-07, + "logits/chosen": -1.906599521636963, + "logits/rejected": -1.7287859916687012, + "logps/chosen": -34.12775421142578, + "logps/rejected": -59.56098556518555, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7677432894706726, + "rewards/margins": 5.709640979766846, + "rewards/rejected": -6.477384567260742, + "step": 504 + }, + { + "epoch": 2.9925925925925925, + "grad_norm": 11.592574124839727, + "learning_rate": 3.9308352790447354e-07, + "logits/chosen": -1.1439859867095947, + "logits/rejected": -1.0030274391174316, + "logps/chosen": -41.51778030395508, + "logps/rejected": -55.89512252807617, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8742501139640808, + "rewards/margins": 5.307973861694336, + "rewards/rejected": -6.182224273681641, + "step": 505 + }, + { + "epoch": 2.9985185185185186, + "grad_norm": 8.429500412566364, + "learning_rate": 3.9255033855960414e-07, + "logits/chosen": -1.666643500328064, + "logits/rejected": -1.315439224243164, + "logps/chosen": -35.58273696899414, + "logps/rejected": -75.78899383544922, + "loss": 0.0499, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2345635890960693, + "rewards/margins": 5.818065643310547, + "rewards/rejected": -7.052628517150879, + "step": 506 + }, + { + "epoch": 3.0044444444444443, + "grad_norm": 7.9465699446302835, + "learning_rate": 3.920161866827889e-07, + "logits/chosen": -1.6582579612731934, + "logits/rejected": -1.4823225736618042, + "logps/chosen": -37.8082389831543, + "logps/rejected": -64.94297790527344, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8594337105751038, + "rewards/margins": 5.952652931213379, + "rewards/rejected": -6.812087059020996, + "step": 507 + }, + { + "epoch": 3.0103703703703704, + "grad_norm": 2.6939179253705063, + "learning_rate": 3.914810758807414e-07, + "logits/chosen": -1.1479514837265015, + "logits/rejected": -0.9574017524719238, + "logps/chosen": -34.60985565185547, + "logps/rejected": -61.0933952331543, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8293716907501221, + "rewards/margins": 5.337098121643066, + "rewards/rejected": -6.166469573974609, + "step": 508 + }, + { + "epoch": 3.0162962962962965, + "grad_norm": 3.2817462401951696, + "learning_rate": 3.9094500976665025e-07, + "logits/chosen": -1.9915450811386108, + "logits/rejected": -1.973534107208252, + "logps/chosen": -40.481937408447266, + "logps/rejected": -61.4949951171875, + "loss": 0.0242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6554736495018005, + "rewards/margins": 5.948192596435547, + "rewards/rejected": -6.603665351867676, + "step": 509 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 3.489297028949427, + "learning_rate": 3.904079919601542e-07, + "logits/chosen": -1.9627009630203247, + "logits/rejected": -1.7277880907058716, + "logps/chosen": -40.081199645996094, + "logps/rejected": -68.22596740722656, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9112627506256104, + "rewards/margins": 8.041915893554688, + "rewards/rejected": -8.953178405761719, + "step": 510 + }, + { + "epoch": 3.0281481481481483, + "grad_norm": 3.2704990299983976, + "learning_rate": 3.898700260873182e-07, + "logits/chosen": -2.022371292114258, + "logits/rejected": -1.998417854309082, + "logps/chosen": -35.742862701416016, + "logps/rejected": -48.1036376953125, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3673231899738312, + "rewards/margins": 4.890713691711426, + "rewards/rejected": -4.523390769958496, + "step": 511 + }, + { + "epoch": 3.034074074074074, + "grad_norm": 4.390628410073519, + "learning_rate": 3.893311157806091e-07, + "logits/chosen": -1.4293193817138672, + "logits/rejected": -1.4505712985992432, + "logps/chosen": -48.03293991088867, + "logps/rejected": -60.35232162475586, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2736024856567383, + "rewards/margins": 5.292821884155273, + "rewards/rejected": -6.56642484664917, + "step": 512 + }, + { + "epoch": 3.04, + "grad_norm": 2.5706286347658893, + "learning_rate": 3.887912646788703e-07, + "logits/chosen": -1.5962660312652588, + "logits/rejected": -1.4947378635406494, + "logps/chosen": -37.679420471191406, + "logps/rejected": -67.93458557128906, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7557663917541504, + "rewards/margins": 6.053064823150635, + "rewards/rejected": -6.808831691741943, + "step": 513 + }, + { + "epoch": 3.0459259259259257, + "grad_norm": 2.6847904429490503, + "learning_rate": 3.882504764272979e-07, + "logits/chosen": -1.7403340339660645, + "logits/rejected": -1.5254353284835815, + "logps/chosen": -39.930538177490234, + "logps/rejected": -72.91048431396484, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5932849645614624, + "rewards/margins": 5.84938907623291, + "rewards/rejected": -6.44267463684082, + "step": 514 + }, + { + "epoch": 3.051851851851852, + "grad_norm": 2.305116393194017, + "learning_rate": 3.8770875467741577e-07, + "logits/chosen": -1.658508062362671, + "logits/rejected": -1.5594358444213867, + "logps/chosen": -41.339759826660156, + "logps/rejected": -74.74822998046875, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5647907257080078, + "rewards/margins": 6.762681007385254, + "rewards/rejected": -7.327471733093262, + "step": 515 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 2.445276515915435, + "learning_rate": 3.871661030870511e-07, + "logits/chosen": -1.4913989305496216, + "logits/rejected": -1.3115919828414917, + "logps/chosen": -48.97815704345703, + "logps/rejected": -78.59329986572266, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.327409267425537, + "rewards/margins": 8.095717430114746, + "rewards/rejected": -9.423127174377441, + "step": 516 + }, + { + "epoch": 3.0637037037037036, + "grad_norm": 3.8785229521156293, + "learning_rate": 3.866225253203093e-07, + "logits/chosen": -1.665648102760315, + "logits/rejected": -1.6093999147415161, + "logps/chosen": -44.661468505859375, + "logps/rejected": -66.47560119628906, + "loss": 0.0232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9593856334686279, + "rewards/margins": 7.025257110595703, + "rewards/rejected": -7.984643936157227, + "step": 517 + }, + { + "epoch": 3.0696296296296297, + "grad_norm": 4.002989049596063, + "learning_rate": 3.8607802504754984e-07, + "logits/chosen": -1.787219524383545, + "logits/rejected": -1.6592282056808472, + "logps/chosen": -39.96990966796875, + "logps/rejected": -63.611854553222656, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06485319137573242, + "rewards/margins": 6.901584625244141, + "rewards/rejected": -6.966438293457031, + "step": 518 + }, + { + "epoch": 3.0755555555555554, + "grad_norm": 2.9675791319527955, + "learning_rate": 3.85532605945361e-07, + "logits/chosen": -1.6534134149551392, + "logits/rejected": -1.783852458000183, + "logps/chosen": -50.092247009277344, + "logps/rejected": -62.32521438598633, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8201130628585815, + "rewards/margins": 6.162082672119141, + "rewards/rejected": -6.9821953773498535, + "step": 519 + }, + { + "epoch": 3.0814814814814815, + "grad_norm": 1.7844151140655473, + "learning_rate": 3.849862716965352e-07, + "logits/chosen": -1.269382357597351, + "logits/rejected": -1.2153847217559814, + "logps/chosen": -45.336570739746094, + "logps/rejected": -78.79979705810547, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9207139015197754, + "rewards/margins": 8.384785652160645, + "rewards/rejected": -10.305500030517578, + "step": 520 + }, + { + "epoch": 3.0874074074074076, + "grad_norm": 2.1034172372012776, + "learning_rate": 3.8443902599004406e-07, + "logits/chosen": -1.8327618837356567, + "logits/rejected": -1.6914877891540527, + "logps/chosen": -34.10930633544922, + "logps/rejected": -58.51155471801758, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6798341274261475, + "rewards/margins": 6.230722427368164, + "rewards/rejected": -6.910556316375732, + "step": 521 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 3.2968154046385667, + "learning_rate": 3.8389087252101395e-07, + "logits/chosen": -1.264432430267334, + "logits/rejected": -1.2380956411361694, + "logps/chosen": -41.07152557373047, + "logps/rejected": -62.34565353393555, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5138745307922363, + "rewards/margins": 6.157859802246094, + "rewards/rejected": -8.671734809875488, + "step": 522 + }, + { + "epoch": 3.0992592592592594, + "grad_norm": 2.2221442473212, + "learning_rate": 3.833418149907001e-07, + "logits/chosen": -1.3116705417633057, + "logits/rejected": -1.46858811378479, + "logps/chosen": -56.232017517089844, + "logps/rejected": -68.88365173339844, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6804299354553223, + "rewards/margins": 8.152290344238281, + "rewards/rejected": -9.832719802856445, + "step": 523 + }, + { + "epoch": 3.105185185185185, + "grad_norm": 3.7570593059357664, + "learning_rate": 3.827918571064626e-07, + "logits/chosen": -0.9017548561096191, + "logits/rejected": -0.8780984878540039, + "logps/chosen": -41.97267150878906, + "logps/rejected": -57.89922332763672, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7553677558898926, + "rewards/margins": 5.701758861541748, + "rewards/rejected": -6.457126617431641, + "step": 524 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 3.3754159307101665, + "learning_rate": 3.822410025817406e-07, + "logits/chosen": -1.785221815109253, + "logits/rejected": -1.7697774171829224, + "logps/chosen": -40.88723373413086, + "logps/rejected": -57.72883605957031, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4020781517028809, + "rewards/margins": 5.520038604736328, + "rewards/rejected": -6.922116756439209, + "step": 525 + }, + { + "epoch": 3.117037037037037, + "grad_norm": 5.658194893720724, + "learning_rate": 3.816892551360279e-07, + "logits/chosen": -2.2358968257904053, + "logits/rejected": -2.079282522201538, + "logps/chosen": -50.923828125, + "logps/rejected": -102.37371826171875, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3834398984909058, + "rewards/margins": 10.244373321533203, + "rewards/rejected": -11.627814292907715, + "step": 526 + }, + { + "epoch": 3.122962962962963, + "grad_norm": 1.6516866779926083, + "learning_rate": 3.8113661849484723e-07, + "logits/chosen": -1.627011775970459, + "logits/rejected": -1.5414108037948608, + "logps/chosen": -43.25103759765625, + "logps/rejected": -64.36841583251953, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.902656078338623, + "rewards/margins": 5.974774360656738, + "rewards/rejected": -6.8774309158325195, + "step": 527 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 2.946008061171455, + "learning_rate": 3.805830963897256e-07, + "logits/chosen": -1.742790937423706, + "logits/rejected": -1.3985412120819092, + "logps/chosen": -42.32114791870117, + "logps/rejected": -102.3118896484375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4717878103256226, + "rewards/margins": 10.213966369628906, + "rewards/rejected": -11.685752868652344, + "step": 528 + }, + { + "epoch": 3.1348148148148147, + "grad_norm": 2.4632882994990672, + "learning_rate": 3.8002869255816873e-07, + "logits/chosen": -1.5484261512756348, + "logits/rejected": -1.6879433393478394, + "logps/chosen": -56.2159309387207, + "logps/rejected": -70.41197967529297, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9030942916870117, + "rewards/margins": 7.6704864501953125, + "rewards/rejected": -9.573579788208008, + "step": 529 + }, + { + "epoch": 3.140740740740741, + "grad_norm": 3.1788918790550964, + "learning_rate": 3.7947341074363593e-07, + "logits/chosen": -1.720942497253418, + "logits/rejected": -1.575656771659851, + "logps/chosen": -45.5122184753418, + "logps/rejected": -68.554931640625, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.585146427154541, + "rewards/margins": 7.759179592132568, + "rewards/rejected": -9.34432601928711, + "step": 530 + }, + { + "epoch": 3.1466666666666665, + "grad_norm": 6.875646057259051, + "learning_rate": 3.7891725469551485e-07, + "logits/chosen": -1.036989450454712, + "logits/rejected": -0.9969202280044556, + "logps/chosen": -31.839096069335938, + "logps/rejected": -55.27592086791992, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.72756028175354, + "rewards/margins": 7.08872127532959, + "rewards/rejected": -7.816281795501709, + "step": 531 + }, + { + "epoch": 3.1525925925925926, + "grad_norm": 2.3835006340562814, + "learning_rate": 3.783602281690963e-07, + "logits/chosen": -1.673425316810608, + "logits/rejected": -1.5485360622406006, + "logps/chosen": -35.52006912231445, + "logps/rejected": -68.11849975585938, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3884926438331604, + "rewards/margins": 6.947432994842529, + "rewards/rejected": -7.335925102233887, + "step": 532 + }, + { + "epoch": 3.1585185185185187, + "grad_norm": 3.989692171252318, + "learning_rate": 3.7780233492554856e-07, + "logits/chosen": -1.7301304340362549, + "logits/rejected": -1.6668508052825928, + "logps/chosen": -33.42741394042969, + "logps/rejected": -58.77648162841797, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36675456166267395, + "rewards/margins": 7.16362190246582, + "rewards/rejected": -7.530376434326172, + "step": 533 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 2.1314519227467335, + "learning_rate": 3.7724357873189244e-07, + "logits/chosen": -1.3821113109588623, + "logits/rejected": -1.401893973350525, + "logps/chosen": -41.42967987060547, + "logps/rejected": -58.070919036865234, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3712562322616577, + "rewards/margins": 6.296813011169434, + "rewards/rejected": -7.668068885803223, + "step": 534 + }, + { + "epoch": 3.1703703703703705, + "grad_norm": 1.9872916286881108, + "learning_rate": 3.766839633609753e-07, + "logits/chosen": -1.4474170207977295, + "logits/rejected": -1.5491206645965576, + "logps/chosen": -40.968833923339844, + "logps/rejected": -56.61018753051758, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8881934285163879, + "rewards/margins": 6.965273857116699, + "rewards/rejected": -7.8534674644470215, + "step": 535 + }, + { + "epoch": 3.176296296296296, + "grad_norm": 2.2679168350553076, + "learning_rate": 3.761234925914459e-07, + "logits/chosen": -1.2111544609069824, + "logits/rejected": -1.0226666927337646, + "logps/chosen": -44.167083740234375, + "logps/rejected": -65.37567138671875, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7891241312026978, + "rewards/margins": 7.499638557434082, + "rewards/rejected": -8.288763046264648, + "step": 536 + }, + { + "epoch": 3.1822222222222223, + "grad_norm": 2.9464081811606833, + "learning_rate": 3.755621702077292e-07, + "logits/chosen": -1.2695435285568237, + "logits/rejected": -1.2304050922393799, + "logps/chosen": -43.53666305541992, + "logps/rejected": -68.55162048339844, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.62175452709198, + "rewards/margins": 7.167821407318115, + "rewards/rejected": -8.789575576782227, + "step": 537 + }, + { + "epoch": 3.188148148148148, + "grad_norm": 2.577399784863567, + "learning_rate": 3.75e-07, + "logits/chosen": -1.3915406465530396, + "logits/rejected": -1.3516476154327393, + "logps/chosen": -36.945255279541016, + "logps/rejected": -62.853023529052734, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.697420597076416, + "rewards/margins": 8.28250503540039, + "rewards/rejected": -9.979926109313965, + "step": 538 + }, + { + "epoch": 3.194074074074074, + "grad_norm": 2.4302332490814877, + "learning_rate": 3.7443698576415795e-07, + "logits/chosen": -1.3171603679656982, + "logits/rejected": -1.2523666620254517, + "logps/chosen": -57.70205307006836, + "logps/rejected": -60.83525085449219, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0728650093078613, + "rewards/margins": 6.117823600769043, + "rewards/rejected": -7.190688610076904, + "step": 539 + }, + { + "epoch": 3.2, + "grad_norm": 1.7579619325838043, + "learning_rate": 3.738731313018019e-07, + "logits/chosen": -1.5011435747146606, + "logits/rejected": -1.496269702911377, + "logps/chosen": -42.36829376220703, + "logps/rejected": -60.10231018066406, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6206650733947754, + "rewards/margins": 6.9321393966674805, + "rewards/rejected": -7.552803993225098, + "step": 540 + }, + { + "epoch": 3.205925925925926, + "grad_norm": 2.539448968573661, + "learning_rate": 3.7330844042020384e-07, + "logits/chosen": -1.5449732542037964, + "logits/rejected": -1.5869890451431274, + "logps/chosen": -42.37911605834961, + "logps/rejected": -61.17539978027344, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2428092062473297, + "rewards/margins": 6.85809850692749, + "rewards/rejected": -7.100907325744629, + "step": 541 + }, + { + "epoch": 3.211851851851852, + "grad_norm": 1.6571223721636208, + "learning_rate": 3.727429169322837e-07, + "logits/chosen": -1.9273267984390259, + "logits/rejected": -1.9184643030166626, + "logps/chosen": -34.810516357421875, + "logps/rejected": -58.148075103759766, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5371243953704834, + "rewards/margins": 6.692122459411621, + "rewards/rejected": -8.229247093200684, + "step": 542 + }, + { + "epoch": 3.2177777777777776, + "grad_norm": 2.3470121776227626, + "learning_rate": 3.721765646565833e-07, + "logits/chosen": -1.3661789894104004, + "logits/rejected": -1.1353942155838013, + "logps/chosen": -44.39642333984375, + "logps/rejected": -76.97927856445312, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3597421646118164, + "rewards/margins": 8.367069244384766, + "rewards/rejected": -9.726812362670898, + "step": 543 + }, + { + "epoch": 3.2237037037037037, + "grad_norm": 2.1307972412032377, + "learning_rate": 3.7160938741724057e-07, + "logits/chosen": -1.4313926696777344, + "logits/rejected": -1.4233020544052124, + "logps/chosen": -42.53575134277344, + "logps/rejected": -58.79235076904297, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8175709247589111, + "rewards/margins": 7.115467071533203, + "rewards/rejected": -8.933037757873535, + "step": 544 + }, + { + "epoch": 3.2296296296296294, + "grad_norm": 2.3490068166225244, + "learning_rate": 3.7104138904396374e-07, + "logits/chosen": -2.0127451419830322, + "logits/rejected": -2.0704212188720703, + "logps/chosen": -52.262203216552734, + "logps/rejected": -67.50675964355469, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5701168775558472, + "rewards/margins": 7.017406940460205, + "rewards/rejected": -8.587523460388184, + "step": 545 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 4.119175048227472, + "learning_rate": 3.704725733720055e-07, + "logits/chosen": -1.7419947385787964, + "logits/rejected": -1.4981815814971924, + "logps/chosen": -46.47538757324219, + "logps/rejected": -87.71017456054688, + "loss": 0.0282, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.745780110359192, + "rewards/margins": 8.732891082763672, + "rewards/rejected": -10.478671073913574, + "step": 546 + }, + { + "epoch": 3.2414814814814816, + "grad_norm": 2.3392522055872553, + "learning_rate": 3.699029442421374e-07, + "logits/chosen": -1.5528309345245361, + "logits/rejected": -1.6342136859893799, + "logps/chosen": -44.845664978027344, + "logps/rejected": -68.08673095703125, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0458078384399414, + "rewards/margins": 7.162102222442627, + "rewards/rejected": -8.20790958404541, + "step": 547 + }, + { + "epoch": 3.2474074074074073, + "grad_norm": 2.0232840255336706, + "learning_rate": 3.693325055006232e-07, + "logits/chosen": -2.193101167678833, + "logits/rejected": -2.054701805114746, + "logps/chosen": -34.14925003051758, + "logps/rejected": -62.127071380615234, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9216932654380798, + "rewards/margins": 6.902035236358643, + "rewards/rejected": -7.823729038238525, + "step": 548 + }, + { + "epoch": 3.2533333333333334, + "grad_norm": 2.022216893496469, + "learning_rate": 3.6876126099919373e-07, + "logits/chosen": -1.370928168296814, + "logits/rejected": -1.2814081907272339, + "logps/chosen": -32.92713165283203, + "logps/rejected": -61.8518180847168, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1010417938232422, + "rewards/margins": 7.316898822784424, + "rewards/rejected": -8.417941093444824, + "step": 549 + }, + { + "epoch": 3.259259259259259, + "grad_norm": 2.899770523123044, + "learning_rate": 3.681892145950203e-07, + "logits/chosen": -1.481421709060669, + "logits/rejected": -1.3554790019989014, + "logps/chosen": -39.57270812988281, + "logps/rejected": -65.8409194946289, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1680896282196045, + "rewards/margins": 6.664047718048096, + "rewards/rejected": -7.832137584686279, + "step": 550 + }, + { + "epoch": 3.265185185185185, + "grad_norm": 2.315306748673688, + "learning_rate": 3.6761637015068893e-07, + "logits/chosen": -1.265090823173523, + "logits/rejected": -1.2061020135879517, + "logps/chosen": -49.136653900146484, + "logps/rejected": -82.91927337646484, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.704127073287964, + "rewards/margins": 10.218045234680176, + "rewards/rejected": -12.922172546386719, + "step": 551 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 3.497832278225285, + "learning_rate": 3.67042731534174e-07, + "logits/chosen": -1.5351362228393555, + "logits/rejected": -1.425960898399353, + "logps/chosen": -45.82518768310547, + "logps/rejected": -72.26417541503906, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3638558387756348, + "rewards/margins": 8.301955223083496, + "rewards/rejected": -10.665811538696289, + "step": 552 + }, + { + "epoch": 3.277037037037037, + "grad_norm": 2.4080638985142384, + "learning_rate": 3.6646830261881263e-07, + "logits/chosen": -1.958190679550171, + "logits/rejected": -1.7889118194580078, + "logps/chosen": -53.166595458984375, + "logps/rejected": -86.16584777832031, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.144944429397583, + "rewards/margins": 8.708757400512695, + "rewards/rejected": -9.853702545166016, + "step": 553 + }, + { + "epoch": 3.282962962962963, + "grad_norm": 1.773594008636619, + "learning_rate": 3.6589308728327797e-07, + "logits/chosen": -1.5912288427352905, + "logits/rejected": -1.6438047885894775, + "logps/chosen": -51.71526336669922, + "logps/rejected": -74.97248840332031, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5826064348220825, + "rewards/margins": 8.281431198120117, + "rewards/rejected": -9.86403751373291, + "step": 554 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 2.9348832139784267, + "learning_rate": 3.653170894115533e-07, + "logits/chosen": -2.006295680999756, + "logits/rejected": -1.864182472229004, + "logps/chosen": -41.15538024902344, + "logps/rejected": -59.866580963134766, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9665942192077637, + "rewards/margins": 6.412576198577881, + "rewards/rejected": -7.3791704177856445, + "step": 555 + }, + { + "epoch": 3.294814814814815, + "grad_norm": 2.726239917687131, + "learning_rate": 3.6474031289290586e-07, + "logits/chosen": -1.9200947284698486, + "logits/rejected": -1.7760218381881714, + "logps/chosen": -39.4498176574707, + "logps/rejected": -66.42085266113281, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2401189804077148, + "rewards/margins": 6.8181610107421875, + "rewards/rejected": -8.058279991149902, + "step": 556 + }, + { + "epoch": 3.300740740740741, + "grad_norm": 3.9297578507818494, + "learning_rate": 3.641627616218603e-07, + "logits/chosen": -1.6229515075683594, + "logits/rejected": -1.7213314771652222, + "logps/chosen": -43.064247131347656, + "logps/rejected": -52.334712982177734, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9696689248085022, + "rewards/margins": 5.829996585845947, + "rewards/rejected": -6.799665927886963, + "step": 557 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 2.5631539966075465, + "learning_rate": 3.6358443949817283e-07, + "logits/chosen": -1.0429606437683105, + "logits/rejected": -1.2104207277297974, + "logps/chosen": -62.048316955566406, + "logps/rejected": -65.05219268798828, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2899022102355957, + "rewards/margins": 6.154999256134033, + "rewards/rejected": -8.444901466369629, + "step": 558 + }, + { + "epoch": 3.3125925925925928, + "grad_norm": 1.3823868358140514, + "learning_rate": 3.630053504268046e-07, + "logits/chosen": -1.3712897300720215, + "logits/rejected": -1.4772628545761108, + "logps/chosen": -51.50799560546875, + "logps/rejected": -56.672271728515625, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.256461262702942, + "rewards/margins": 6.195413112640381, + "rewards/rejected": -7.451874256134033, + "step": 559 + }, + { + "epoch": 3.3185185185185184, + "grad_norm": 3.197742548961152, + "learning_rate": 3.62425498317895e-07, + "logits/chosen": -1.6012769937515259, + "logits/rejected": -1.5433298349380493, + "logps/chosen": -43.341773986816406, + "logps/rejected": -67.37931823730469, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6422319412231445, + "rewards/margins": 7.589832782745361, + "rewards/rejected": -9.232064247131348, + "step": 560 + }, + { + "epoch": 3.3244444444444445, + "grad_norm": 1.2358853384087398, + "learning_rate": 3.6184488708673597e-07, + "logits/chosen": -1.2647123336791992, + "logits/rejected": -1.1407551765441895, + "logps/chosen": -44.367061614990234, + "logps/rejected": -72.44467163085938, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.255622625350952, + "rewards/margins": 7.809691905975342, + "rewards/rejected": -10.065313339233398, + "step": 561 + }, + { + "epoch": 3.33037037037037, + "grad_norm": 1.8475498854849877, + "learning_rate": 3.6126352065374517e-07, + "logits/chosen": -1.470711350440979, + "logits/rejected": -1.288089632987976, + "logps/chosen": -47.34082794189453, + "logps/rejected": -72.96466827392578, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1720082759857178, + "rewards/margins": 7.697314262390137, + "rewards/rejected": -8.869322776794434, + "step": 562 + }, + { + "epoch": 3.3362962962962963, + "grad_norm": 1.9803556762007266, + "learning_rate": 3.6068140294443943e-07, + "logits/chosen": -1.3812202215194702, + "logits/rejected": -1.2750273942947388, + "logps/chosen": -43.31711959838867, + "logps/rejected": -63.418060302734375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6750633716583252, + "rewards/margins": 6.8236236572265625, + "rewards/rejected": -8.498686790466309, + "step": 563 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 2.8447867177205497, + "learning_rate": 3.6009853788940856e-07, + "logits/chosen": -1.3081015348434448, + "logits/rejected": -1.373008131980896, + "logps/chosen": -41.99599838256836, + "logps/rejected": -54.60331726074219, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2790483236312866, + "rewards/margins": 6.154098033905029, + "rewards/rejected": -7.433146953582764, + "step": 564 + }, + { + "epoch": 3.348148148148148, + "grad_norm": 2.611575108505304, + "learning_rate": 3.595149294242884e-07, + "logits/chosen": -1.4728150367736816, + "logits/rejected": -1.578336238861084, + "logps/chosen": -39.75627899169922, + "logps/rejected": -61.03871154785156, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8853597640991211, + "rewards/margins": 6.816481590270996, + "rewards/rejected": -7.701841354370117, + "step": 565 + }, + { + "epoch": 3.354074074074074, + "grad_norm": 1.4157122583137127, + "learning_rate": 3.589305814897346e-07, + "logits/chosen": -1.7357616424560547, + "logits/rejected": -1.8931334018707275, + "logps/chosen": -43.53925323486328, + "logps/rejected": -71.56130981445312, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1073122024536133, + "rewards/margins": 8.838522911071777, + "rewards/rejected": -9.94583511352539, + "step": 566 + }, + { + "epoch": 3.36, + "grad_norm": 2.635191240095001, + "learning_rate": 3.5834549803139586e-07, + "logits/chosen": -1.142876386642456, + "logits/rejected": -1.163784384727478, + "logps/chosen": -39.930816650390625, + "logps/rejected": -53.69756317138672, + "loss": 0.0126, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7858086824417114, + "rewards/margins": 6.6770148277282715, + "rewards/rejected": -8.462823867797852, + "step": 567 + }, + { + "epoch": 3.365925925925926, + "grad_norm": 1.9424680925260813, + "learning_rate": 3.5775968299988725e-07, + "logits/chosen": -2.3536765575408936, + "logits/rejected": -2.014406681060791, + "logps/chosen": -38.420127868652344, + "logps/rejected": -81.54600524902344, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.818911612033844, + "rewards/margins": 9.348888397216797, + "rewards/rejected": -10.167799949645996, + "step": 568 + }, + { + "epoch": 3.3718518518518517, + "grad_norm": 2.2845030472585948, + "learning_rate": 3.571731403507635e-07, + "logits/chosen": -1.2182214260101318, + "logits/rejected": -1.2486618757247925, + "logps/chosen": -37.355804443359375, + "logps/rejected": -62.415714263916016, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.916069746017456, + "rewards/margins": 7.980484485626221, + "rewards/rejected": -8.896553993225098, + "step": 569 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 7.950874380857367, + "learning_rate": 3.565858740444927e-07, + "logits/chosen": -1.2446577548980713, + "logits/rejected": -1.2009145021438599, + "logps/chosen": -34.67760467529297, + "logps/rejected": -50.92575454711914, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5964220762252808, + "rewards/margins": 5.911198616027832, + "rewards/rejected": -7.507620811462402, + "step": 570 + }, + { + "epoch": 3.383703703703704, + "grad_norm": 1.9717151883082555, + "learning_rate": 3.559978880464289e-07, + "logits/chosen": -1.235534429550171, + "logits/rejected": -1.3239576816558838, + "logps/chosen": -39.85081481933594, + "logps/rejected": -57.22991180419922, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38666263222694397, + "rewards/margins": 7.964129447937012, + "rewards/rejected": -8.350791931152344, + "step": 571 + }, + { + "epoch": 3.3896296296296295, + "grad_norm": 2.2224780083074216, + "learning_rate": 3.5540918632678583e-07, + "logits/chosen": -1.7430646419525146, + "logits/rejected": -1.7376810312271118, + "logps/chosen": -47.81946563720703, + "logps/rejected": -70.83491516113281, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5437856912612915, + "rewards/margins": 8.201151847839355, + "rewards/rejected": -9.744937896728516, + "step": 572 + }, + { + "epoch": 3.3955555555555557, + "grad_norm": 3.4318865366613247, + "learning_rate": 3.5481977286060995e-07, + "logits/chosen": -1.4170291423797607, + "logits/rejected": -1.5433483123779297, + "logps/chosen": -47.669212341308594, + "logps/rejected": -77.21953582763672, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.699216604232788, + "rewards/margins": 8.496070861816406, + "rewards/rejected": -11.195287704467773, + "step": 573 + }, + { + "epoch": 3.4014814814814813, + "grad_norm": 1.9972289407606707, + "learning_rate": 3.542296516277535e-07, + "logits/chosen": -1.040672779083252, + "logits/rejected": -1.0020769834518433, + "logps/chosen": -47.71010971069336, + "logps/rejected": -65.93901824951172, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5380032062530518, + "rewards/margins": 7.058391094207764, + "rewards/rejected": -8.596394538879395, + "step": 574 + }, + { + "epoch": 3.4074074074074074, + "grad_norm": 3.0477572554006946, + "learning_rate": 3.5363882661284767e-07, + "logits/chosen": -1.711016058921814, + "logits/rejected": -1.6152186393737793, + "logps/chosen": -37.883445739746094, + "logps/rejected": -53.224891662597656, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2196171283721924, + "rewards/margins": 6.921237945556641, + "rewards/rejected": -8.140854835510254, + "step": 575 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 2.1287963617619745, + "learning_rate": 3.53047301805276e-07, + "logits/chosen": -1.5620815753936768, + "logits/rejected": -1.5769602060317993, + "logps/chosen": -53.81451416015625, + "logps/rejected": -64.60205841064453, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9155237674713135, + "rewards/margins": 7.080358028411865, + "rewards/rejected": -7.995882034301758, + "step": 576 + }, + { + "epoch": 3.419259259259259, + "grad_norm": 3.119553571905832, + "learning_rate": 3.5245508119914683e-07, + "logits/chosen": -1.545169472694397, + "logits/rejected": -1.5148056745529175, + "logps/chosen": -44.71506881713867, + "logps/rejected": -68.02455139160156, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8168060779571533, + "rewards/margins": 8.291409492492676, + "rewards/rejected": -10.10821533203125, + "step": 577 + }, + { + "epoch": 3.4251851851851853, + "grad_norm": 2.1741179080277244, + "learning_rate": 3.518621687932671e-07, + "logits/chosen": -1.4934191703796387, + "logits/rejected": -1.4217729568481445, + "logps/chosen": -44.57159423828125, + "logps/rejected": -68.36115264892578, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.414076328277588, + "rewards/margins": 8.635876655578613, + "rewards/rejected": -10.049952507019043, + "step": 578 + }, + { + "epoch": 3.431111111111111, + "grad_norm": 3.5979935916648906, + "learning_rate": 3.5126856859111464e-07, + "logits/chosen": -1.3749088048934937, + "logits/rejected": -1.2176498174667358, + "logps/chosen": -43.616153717041016, + "logps/rejected": -71.89463806152344, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36016517877578735, + "rewards/margins": 9.268863677978516, + "rewards/rejected": -9.6290283203125, + "step": 579 + }, + { + "epoch": 3.437037037037037, + "grad_norm": 4.438241469697319, + "learning_rate": 3.5067428460081157e-07, + "logits/chosen": -1.0625383853912354, + "logits/rejected": -1.0654343366622925, + "logps/chosen": -34.844093322753906, + "logps/rejected": -53.295997619628906, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.308157354593277, + "rewards/margins": 7.436243057250977, + "rewards/rejected": -7.744400501251221, + "step": 580 + }, + { + "epoch": 3.442962962962963, + "grad_norm": 1.639309679332784, + "learning_rate": 3.5007932083509687e-07, + "logits/chosen": -1.6686345338821411, + "logits/rejected": -1.4766108989715576, + "logps/chosen": -44.36106872558594, + "logps/rejected": -79.30110168457031, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.979895830154419, + "rewards/margins": 9.241266250610352, + "rewards/rejected": -10.221162796020508, + "step": 581 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 1.9840382447613365, + "learning_rate": 3.494836813112998e-07, + "logits/chosen": -1.3479125499725342, + "logits/rejected": -1.352927565574646, + "logps/chosen": -46.45930862426758, + "logps/rejected": -60.15100860595703, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6770470142364502, + "rewards/margins": 8.113531112670898, + "rewards/rejected": -9.790578842163086, + "step": 582 + }, + { + "epoch": 3.454814814814815, + "grad_norm": 2.367955115764623, + "learning_rate": 3.488873700513124e-07, + "logits/chosen": -1.7887319326400757, + "logits/rejected": -1.5854450464248657, + "logps/chosen": -43.808101654052734, + "logps/rejected": -76.43734741210938, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.833268165588379, + "rewards/margins": 9.030920028686523, + "rewards/rejected": -10.864188194274902, + "step": 583 + }, + { + "epoch": 3.4607407407407407, + "grad_norm": 1.0365638614829655, + "learning_rate": 3.482903910815625e-07, + "logits/chosen": -1.5932282209396362, + "logits/rejected": -1.48412024974823, + "logps/chosen": -37.209903717041016, + "logps/rejected": -77.74060821533203, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3066656589508057, + "rewards/margins": 8.648603439331055, + "rewards/rejected": -9.955268859863281, + "step": 584 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 2.3646988217750335, + "learning_rate": 3.476927484329862e-07, + "logits/chosen": -1.5596637725830078, + "logits/rejected": -1.6348111629486084, + "logps/chosen": -42.485225677490234, + "logps/rejected": -53.519187927246094, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.168082356452942, + "rewards/margins": 6.6856207847595215, + "rewards/rejected": -7.853703022003174, + "step": 585 + }, + { + "epoch": 3.4725925925925925, + "grad_norm": 3.430393631502938, + "learning_rate": 3.4709444614100113e-07, + "logits/chosen": -1.573261022567749, + "logits/rejected": -1.584892749786377, + "logps/chosen": -40.92796325683594, + "logps/rejected": -58.3778076171875, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7870615124702454, + "rewards/margins": 7.023894309997559, + "rewards/rejected": -7.810956001281738, + "step": 586 + }, + { + "epoch": 3.4785185185185186, + "grad_norm": 3.106079011723257, + "learning_rate": 3.46495488245479e-07, + "logits/chosen": -1.4963181018829346, + "logits/rejected": -1.3813966512680054, + "logps/chosen": -32.22932434082031, + "logps/rejected": -63.854183197021484, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8944528102874756, + "rewards/margins": 9.122952461242676, + "rewards/rejected": -10.017404556274414, + "step": 587 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 0.9107948624763849, + "learning_rate": 3.4589587879071814e-07, + "logits/chosen": -1.522757887840271, + "logits/rejected": -1.4110288619995117, + "logps/chosen": -31.915634155273438, + "logps/rejected": -72.63192749023438, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4233702421188354, + "rewards/margins": 10.482945442199707, + "rewards/rejected": -11.906315803527832, + "step": 588 + }, + { + "epoch": 3.4903703703703703, + "grad_norm": 0.7654845847244033, + "learning_rate": 3.452956218254165e-07, + "logits/chosen": -0.7181179523468018, + "logits/rejected": -0.9414831399917603, + "logps/chosen": -60.01945114135742, + "logps/rejected": -79.92008209228516, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5449278354644775, + "rewards/margins": 8.45158863067627, + "rewards/rejected": -10.996516227722168, + "step": 589 + }, + { + "epoch": 3.4962962962962965, + "grad_norm": 2.7343597937414907, + "learning_rate": 3.44694721402644e-07, + "logits/chosen": -1.83599853515625, + "logits/rejected": -1.9086142778396606, + "logps/chosen": -42.17871856689453, + "logps/rejected": -66.1895751953125, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6002362966537476, + "rewards/margins": 9.458833694458008, + "rewards/rejected": -11.059069633483887, + "step": 590 + }, + { + "epoch": 3.502222222222222, + "grad_norm": 1.115662564068377, + "learning_rate": 3.440931815798156e-07, + "logits/chosen": -1.4597891569137573, + "logits/rejected": -1.5365428924560547, + "logps/chosen": -38.84928512573242, + "logps/rejected": -55.871421813964844, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3517990112304688, + "rewards/margins": 7.835026741027832, + "rewards/rejected": -9.186826705932617, + "step": 591 + }, + { + "epoch": 3.5081481481481482, + "grad_norm": 1.631199912161018, + "learning_rate": 3.434910064186633e-07, + "logits/chosen": -1.519524097442627, + "logits/rejected": -1.2287867069244385, + "logps/chosen": -54.680198669433594, + "logps/rejected": -81.27238464355469, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0984071493148804, + "rewards/margins": 9.239324569702148, + "rewards/rejected": -10.337732315063477, + "step": 592 + }, + { + "epoch": 3.514074074074074, + "grad_norm": 2.2319395068778145, + "learning_rate": 3.428881999852093e-07, + "logits/chosen": -1.8822609186172485, + "logits/rejected": -1.9900339841842651, + "logps/chosen": -60.23460388183594, + "logps/rejected": -66.42163848876953, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2803685665130615, + "rewards/margins": 7.103879928588867, + "rewards/rejected": -10.384248733520508, + "step": 593 + }, + { + "epoch": 3.52, + "grad_norm": 1.4189293626243713, + "learning_rate": 3.4228476634973836e-07, + "logits/chosen": -1.3194860219955444, + "logits/rejected": -1.3229904174804688, + "logps/chosen": -35.27375793457031, + "logps/rejected": -47.984230041503906, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1222331523895264, + "rewards/margins": 6.366780757904053, + "rewards/rejected": -7.489014148712158, + "step": 594 + }, + { + "epoch": 3.525925925925926, + "grad_norm": 1.4373583401932304, + "learning_rate": 3.4168070958676985e-07, + "logits/chosen": -1.0970607995986938, + "logits/rejected": -0.8358435034751892, + "logps/chosen": -32.3553352355957, + "logps/rejected": -66.28813934326172, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2889058589935303, + "rewards/margins": 7.470686435699463, + "rewards/rejected": -9.759592056274414, + "step": 595 + }, + { + "epoch": 3.531851851851852, + "grad_norm": 1.6877804430201413, + "learning_rate": 3.41076033775031e-07, + "logits/chosen": -1.2642066478729248, + "logits/rejected": -1.1834553480148315, + "logps/chosen": -45.60649108886719, + "logps/rejected": -72.86331176757812, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6096386909484863, + "rewards/margins": 8.637361526489258, + "rewards/rejected": -10.246999740600586, + "step": 596 + }, + { + "epoch": 3.537777777777778, + "grad_norm": 1.8759383611475728, + "learning_rate": 3.404707429974289e-07, + "logits/chosen": -1.5445129871368408, + "logits/rejected": -1.601426362991333, + "logps/chosen": -44.18944549560547, + "logps/rejected": -64.6243667602539, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5349621772766113, + "rewards/margins": 7.849261283874512, + "rewards/rejected": -10.384223937988281, + "step": 597 + }, + { + "epoch": 3.5437037037037036, + "grad_norm": 2.6280084496105967, + "learning_rate": 3.3986484134102294e-07, + "logits/chosen": -1.6996586322784424, + "logits/rejected": -1.57733154296875, + "logps/chosen": -32.68741226196289, + "logps/rejected": -51.48966979980469, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0923914909362793, + "rewards/margins": 7.8780293464660645, + "rewards/rejected": -8.970420837402344, + "step": 598 + }, + { + "epoch": 3.5496296296296297, + "grad_norm": 2.0022463955320684, + "learning_rate": 3.392583328969975e-07, + "logits/chosen": -1.815554141998291, + "logits/rejected": -1.819061517715454, + "logps/chosen": -43.40631866455078, + "logps/rejected": -59.122802734375, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6866397857666016, + "rewards/margins": 7.3134284019470215, + "rewards/rejected": -9.000067710876465, + "step": 599 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 1.6284522933445447, + "learning_rate": 3.3865122176063385e-07, + "logits/chosen": -1.4640413522720337, + "logits/rejected": -1.4328960180282593, + "logps/chosen": -64.9375, + "logps/rejected": -82.05753326416016, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3941292762756348, + "rewards/margins": 8.559239387512207, + "rewards/rejected": -11.953369140625, + "step": 600 + }, + { + "epoch": 3.5614814814814815, + "grad_norm": 2.5895176046146102, + "learning_rate": 3.380435120312831e-07, + "logits/chosen": -2.206264019012451, + "logits/rejected": -1.858229160308838, + "logps/chosen": -32.72801208496094, + "logps/rejected": -79.77110290527344, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8773959875106812, + "rewards/margins": 9.627907752990723, + "rewards/rejected": -10.505304336547852, + "step": 601 + }, + { + "epoch": 3.5674074074074076, + "grad_norm": 1.864701579440055, + "learning_rate": 3.374352078123379e-07, + "logits/chosen": -1.914277195930481, + "logits/rejected": -1.7512773275375366, + "logps/chosen": -47.07988739013672, + "logps/rejected": -86.24842834472656, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.883514165878296, + "rewards/margins": 9.177932739257812, + "rewards/rejected": -12.061447143554688, + "step": 602 + }, + { + "epoch": 3.5733333333333333, + "grad_norm": 4.481967030457234, + "learning_rate": 3.36826313211205e-07, + "logits/chosen": -1.933127999305725, + "logits/rejected": -1.8415874242782593, + "logps/chosen": -40.879615783691406, + "logps/rejected": -72.97755432128906, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9781012535095215, + "rewards/margins": 8.083052635192871, + "rewards/rejected": -10.061153411865234, + "step": 603 + }, + { + "epoch": 3.5792592592592594, + "grad_norm": 3.978013520120644, + "learning_rate": 3.36216832339278e-07, + "logits/chosen": -1.7329645156860352, + "logits/rejected": -1.6681479215621948, + "logps/chosen": -56.697105407714844, + "logps/rejected": -81.9181900024414, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.683328151702881, + "rewards/margins": 10.081719398498535, + "rewards/rejected": -12.765048027038574, + "step": 604 + }, + { + "epoch": 3.585185185185185, + "grad_norm": 1.4428693502250116, + "learning_rate": 3.3560676931190866e-07, + "logits/chosen": -1.5751123428344727, + "logits/rejected": -1.5669260025024414, + "logps/chosen": -60.73744201660156, + "logps/rejected": -90.38732147216797, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3062713146209717, + "rewards/margins": 10.367473602294922, + "rewards/rejected": -11.673744201660156, + "step": 605 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 2.0391085171572185, + "learning_rate": 3.3499612824837976e-07, + "logits/chosen": -1.114881157875061, + "logits/rejected": -0.9926152229309082, + "logps/chosen": -44.00819396972656, + "logps/rejected": -70.94087219238281, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.272582769393921, + "rewards/margins": 9.082612037658691, + "rewards/rejected": -10.355194091796875, + "step": 606 + }, + { + "epoch": 3.597037037037037, + "grad_norm": 1.057804941519028, + "learning_rate": 3.343849132718771e-07, + "logits/chosen": -1.5145657062530518, + "logits/rejected": -1.4814441204071045, + "logps/chosen": -41.45111083984375, + "logps/rejected": -62.060455322265625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5824816226959229, + "rewards/margins": 6.7437310218811035, + "rewards/rejected": -8.326212882995605, + "step": 607 + }, + { + "epoch": 3.602962962962963, + "grad_norm": 2.626600924225498, + "learning_rate": 3.337731285094616e-07, + "logits/chosen": -1.8049207925796509, + "logits/rejected": -1.74497652053833, + "logps/chosen": -40.572940826416016, + "logps/rejected": -63.675750732421875, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2193665504455566, + "rewards/margins": 6.881660461425781, + "rewards/rejected": -9.10102653503418, + "step": 608 + }, + { + "epoch": 3.608888888888889, + "grad_norm": 1.2907642289410493, + "learning_rate": 3.3316077809204163e-07, + "logits/chosen": -1.744649052619934, + "logits/rejected": -1.6019152402877808, + "logps/chosen": -52.210235595703125, + "logps/rejected": -74.92243194580078, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9966752529144287, + "rewards/margins": 9.754047393798828, + "rewards/rejected": -11.750722885131836, + "step": 609 + }, + { + "epoch": 3.6148148148148147, + "grad_norm": 2.2152999887221654, + "learning_rate": 3.3254786615434495e-07, + "logits/chosen": -1.7057602405548096, + "logits/rejected": -1.7353460788726807, + "logps/chosen": -33.89598083496094, + "logps/rejected": -52.0216064453125, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5950571298599243, + "rewards/margins": 6.827630996704102, + "rewards/rejected": -7.422687530517578, + "step": 610 + }, + { + "epoch": 3.620740740740741, + "grad_norm": 2.6009366158084894, + "learning_rate": 3.319343968348908e-07, + "logits/chosen": -1.6717418432235718, + "logits/rejected": -1.541961431503296, + "logps/chosen": -45.757110595703125, + "logps/rejected": -77.61996459960938, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8870725631713867, + "rewards/margins": 9.506970405578613, + "rewards/rejected": -12.394044876098633, + "step": 611 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 1.2297638615351605, + "learning_rate": 3.3132037427596186e-07, + "logits/chosen": -1.7202712297439575, + "logits/rejected": -1.6844654083251953, + "logps/chosen": -31.21420669555664, + "logps/rejected": -62.227325439453125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.343228816986084, + "rewards/margins": 9.035632133483887, + "rewards/rejected": -10.378860473632812, + "step": 612 + }, + { + "epoch": 3.6325925925925926, + "grad_norm": 2.3969901798812137, + "learning_rate": 3.3070580262357676e-07, + "logits/chosen": -0.8491813540458679, + "logits/rejected": -0.882433295249939, + "logps/chosen": -47.235923767089844, + "logps/rejected": -62.53350830078125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8392652273178101, + "rewards/margins": 7.459800720214844, + "rewards/rejected": -8.299065589904785, + "step": 613 + }, + { + "epoch": 3.6385185185185183, + "grad_norm": 1.5139951967012626, + "learning_rate": 3.3009068602746135e-07, + "logits/chosen": -1.6768100261688232, + "logits/rejected": -1.4328532218933105, + "logps/chosen": -50.028465270996094, + "logps/rejected": -88.61032104492188, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.902674436569214, + "rewards/margins": 10.083459854125977, + "rewards/rejected": -12.986133575439453, + "step": 614 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 3.876731182363171, + "learning_rate": 3.294750286410213e-07, + "logits/chosen": -1.7702586650848389, + "logits/rejected": -1.7367758750915527, + "logps/chosen": -41.51939392089844, + "logps/rejected": -65.0497817993164, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2305309772491455, + "rewards/margins": 8.569968223571777, + "rewards/rejected": -9.800498962402344, + "step": 615 + }, + { + "epoch": 3.6503703703703705, + "grad_norm": 5.985446773820635, + "learning_rate": 3.288588346213139e-07, + "logits/chosen": -1.6211884021759033, + "logits/rejected": -1.6923515796661377, + "logps/chosen": -47.695152282714844, + "logps/rejected": -63.015865325927734, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.490757942199707, + "rewards/margins": 8.131473541259766, + "rewards/rejected": -9.622231483459473, + "step": 616 + }, + { + "epoch": 3.656296296296296, + "grad_norm": 2.232883236877547, + "learning_rate": 3.282421081290195e-07, + "logits/chosen": -1.707023024559021, + "logits/rejected": -1.5826444625854492, + "logps/chosen": -47.43241882324219, + "logps/rejected": -76.33192443847656, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0400581359863281, + "rewards/margins": 7.394491672515869, + "rewards/rejected": -8.434549331665039, + "step": 617 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 2.0024865182396723, + "learning_rate": 3.2762485332841404e-07, + "logits/chosen": -1.7938523292541504, + "logits/rejected": -1.6399613618850708, + "logps/chosen": -33.98335266113281, + "logps/rejected": -55.67890167236328, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4796392321586609, + "rewards/margins": 6.540268421173096, + "rewards/rejected": -7.019907474517822, + "step": 618 + }, + { + "epoch": 3.6681481481481484, + "grad_norm": 2.1922455464812303, + "learning_rate": 3.27007074387341e-07, + "logits/chosen": -1.6379787921905518, + "logits/rejected": -1.5925250053405762, + "logps/chosen": -47.66102600097656, + "logps/rejected": -62.999855041503906, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8236360549926758, + "rewards/margins": 7.007270812988281, + "rewards/rejected": -8.830906867980957, + "step": 619 + }, + { + "epoch": 3.674074074074074, + "grad_norm": 3.044208794209425, + "learning_rate": 3.2638877547718263e-07, + "logits/chosen": -1.7888829708099365, + "logits/rejected": -1.5564265251159668, + "logps/chosen": -41.539024353027344, + "logps/rejected": -66.69727325439453, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7005274295806885, + "rewards/margins": 8.028372764587402, + "rewards/rejected": -10.728899955749512, + "step": 620 + }, + { + "epoch": 3.68, + "grad_norm": 2.5233284063470625, + "learning_rate": 3.2576996077283217e-07, + "logits/chosen": -0.9492640495300293, + "logits/rejected": -0.8763157725334167, + "logps/chosen": -43.827125549316406, + "logps/rejected": -68.0890884399414, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6933960914611816, + "rewards/margins": 9.143272399902344, + "rewards/rejected": -11.836669921875, + "step": 621 + }, + { + "epoch": 3.685925925925926, + "grad_norm": 1.3687699276618075, + "learning_rate": 3.251506344526658e-07, + "logits/chosen": -1.7169603109359741, + "logits/rejected": -1.5893797874450684, + "logps/chosen": -40.559478759765625, + "logps/rejected": -72.53877258300781, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8258345127105713, + "rewards/margins": 8.606887817382812, + "rewards/rejected": -10.432722091674805, + "step": 622 + }, + { + "epoch": 3.691851851851852, + "grad_norm": 1.811447056426145, + "learning_rate": 3.2453080069851403e-07, + "logits/chosen": -1.3575303554534912, + "logits/rejected": -1.4146182537078857, + "logps/chosen": -50.56128692626953, + "logps/rejected": -69.76527404785156, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2796287536621094, + "rewards/margins": 8.239351272583008, + "rewards/rejected": -10.518980026245117, + "step": 623 + }, + { + "epoch": 3.6977777777777776, + "grad_norm": 3.3947370038677263, + "learning_rate": 3.239104636956337e-07, + "logits/chosen": -1.722593069076538, + "logits/rejected": -1.583823323249817, + "logps/chosen": -51.618125915527344, + "logps/rejected": -80.45478820800781, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1515426635742188, + "rewards/margins": 9.869569778442383, + "rewards/rejected": -11.021112442016602, + "step": 624 + }, + { + "epoch": 3.7037037037037037, + "grad_norm": 0.7776787350074231, + "learning_rate": 3.2328962763267993e-07, + "logits/chosen": -1.3223106861114502, + "logits/rejected": -1.3556057214736938, + "logps/chosen": -45.37212371826172, + "logps/rejected": -70.70166015625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9760671854019165, + "rewards/margins": 8.622611999511719, + "rewards/rejected": -10.598679542541504, + "step": 625 + }, + { + "epoch": 3.70962962962963, + "grad_norm": 2.5724293932549016, + "learning_rate": 3.2266829670167736e-07, + "logits/chosen": -1.8816306591033936, + "logits/rejected": -1.727679967880249, + "logps/chosen": -43.26377868652344, + "logps/rejected": -87.9742660522461, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6621125936508179, + "rewards/margins": 10.484230041503906, + "rewards/rejected": -12.146342277526855, + "step": 626 + }, + { + "epoch": 3.7155555555555555, + "grad_norm": 2.743832235404827, + "learning_rate": 3.2204647509799216e-07, + "logits/chosen": -1.9375836849212646, + "logits/rejected": -1.9025050401687622, + "logps/chosen": -62.01106643676758, + "logps/rejected": -70.475830078125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.905947208404541, + "rewards/margins": 7.062991142272949, + "rewards/rejected": -9.968938827514648, + "step": 627 + }, + { + "epoch": 3.7214814814814816, + "grad_norm": 1.7788380759300608, + "learning_rate": 3.2142416702030365e-07, + "logits/chosen": -1.6858139038085938, + "logits/rejected": -1.46574068069458, + "logps/chosen": -34.817596435546875, + "logps/rejected": -68.91419982910156, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0730873346328735, + "rewards/margins": 8.742530822753906, + "rewards/rejected": -9.815618515014648, + "step": 628 + }, + { + "epoch": 3.7274074074074073, + "grad_norm": 2.0130515415798644, + "learning_rate": 3.2080137667057595e-07, + "logits/chosen": -1.6360899209976196, + "logits/rejected": -1.587815523147583, + "logps/chosen": -36.161739349365234, + "logps/rejected": -51.88941192626953, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8041471242904663, + "rewards/margins": 7.299750804901123, + "rewards/rejected": -8.103898048400879, + "step": 629 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 3.822746078893138, + "learning_rate": 3.201781082540297e-07, + "logits/chosen": -1.9967753887176514, + "logits/rejected": -1.8232698440551758, + "logps/chosen": -31.268970489501953, + "logps/rejected": -60.96702194213867, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2652736306190491, + "rewards/margins": 7.545168876647949, + "rewards/rejected": -7.810442924499512, + "step": 630 + }, + { + "epoch": 3.739259259259259, + "grad_norm": 3.4510801637868753, + "learning_rate": 3.1955436597911315e-07, + "logits/chosen": -1.2564302682876587, + "logits/rejected": -1.2304657697677612, + "logps/chosen": -44.99286651611328, + "logps/rejected": -61.647193908691406, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.33876633644104, + "rewards/margins": 8.576675415039062, + "rewards/rejected": -9.915441513061523, + "step": 631 + }, + { + "epoch": 3.745185185185185, + "grad_norm": 2.702360678099268, + "learning_rate": 3.1893015405747467e-07, + "logits/chosen": -1.3866653442382812, + "logits/rejected": -1.3158271312713623, + "logps/chosen": -37.21662521362305, + "logps/rejected": -61.110347747802734, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9909217357635498, + "rewards/margins": 10.208771705627441, + "rewards/rejected": -12.19969367980957, + "step": 632 + }, + { + "epoch": 3.7511111111111113, + "grad_norm": 1.307284115251361, + "learning_rate": 3.183054767039333e-07, + "logits/chosen": -1.290824294090271, + "logits/rejected": -1.5284253358840942, + "logps/chosen": -60.69805145263672, + "logps/rejected": -65.60289001464844, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2406201362609863, + "rewards/margins": 7.582151412963867, + "rewards/rejected": -9.822771072387695, + "step": 633 + }, + { + "epoch": 3.757037037037037, + "grad_norm": 1.5871167186944344, + "learning_rate": 3.176803381364512e-07, + "logits/chosen": -2.3787388801574707, + "logits/rejected": -2.057800769805908, + "logps/chosen": -46.9011344909668, + "logps/rejected": -81.9096908569336, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9250102043151855, + "rewards/margins": 8.01507568359375, + "rewards/rejected": -10.940086364746094, + "step": 634 + }, + { + "epoch": 3.762962962962963, + "grad_norm": 2.839912087777198, + "learning_rate": 3.170547425761046e-07, + "logits/chosen": -1.5725014209747314, + "logits/rejected": -1.5229789018630981, + "logps/chosen": -37.74675750732422, + "logps/rejected": -68.0882568359375, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.480069637298584, + "rewards/margins": 9.368812561035156, + "rewards/rejected": -10.848881721496582, + "step": 635 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 3.5480712966399017, + "learning_rate": 3.164286942470553e-07, + "logits/chosen": -1.657641887664795, + "logits/rejected": -1.3071664571762085, + "logps/chosen": -42.06265640258789, + "logps/rejected": -89.001220703125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8841521739959717, + "rewards/margins": 11.081164360046387, + "rewards/rejected": -12.965315818786621, + "step": 636 + }, + { + "epoch": 3.774814814814815, + "grad_norm": 1.7011165480825463, + "learning_rate": 3.1580219737652254e-07, + "logits/chosen": -1.8198199272155762, + "logits/rejected": -1.7223021984100342, + "logps/chosen": -40.75661087036133, + "logps/rejected": -70.20028686523438, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.252716541290283, + "rewards/margins": 9.318144798278809, + "rewards/rejected": -11.57086181640625, + "step": 637 + }, + { + "epoch": 3.7807407407407405, + "grad_norm": 1.9852461114559077, + "learning_rate": 3.1517525619475394e-07, + "logits/chosen": -1.4178622961044312, + "logits/rejected": -1.3874255418777466, + "logps/chosen": -35.899024963378906, + "logps/rejected": -53.1900634765625, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4967761039733887, + "rewards/margins": 7.301504135131836, + "rewards/rejected": -8.798280715942383, + "step": 638 + }, + { + "epoch": 3.7866666666666666, + "grad_norm": 0.9802194626023532, + "learning_rate": 3.145478749349974e-07, + "logits/chosen": -1.647719144821167, + "logits/rejected": -1.615530252456665, + "logps/chosen": -52.62092208862305, + "logps/rejected": -74.0809326171875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5072264671325684, + "rewards/margins": 8.355988502502441, + "rewards/rejected": -10.863214492797852, + "step": 639 + }, + { + "epoch": 3.7925925925925927, + "grad_norm": 2.410614126181565, + "learning_rate": 3.139200578334724e-07, + "logits/chosen": -1.3317790031433105, + "logits/rejected": -1.2641746997833252, + "logps/chosen": -47.751319885253906, + "logps/rejected": -70.82566833496094, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.740259885787964, + "rewards/margins": 6.87734317779541, + "rewards/rejected": -9.617603302001953, + "step": 640 + }, + { + "epoch": 3.7985185185185184, + "grad_norm": 3.0815761741282435, + "learning_rate": 3.132918091293411e-07, + "logits/chosen": -1.2445253133773804, + "logits/rejected": -1.2281543016433716, + "logps/chosen": -44.184261322021484, + "logps/rejected": -67.85798645019531, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.710573196411133, + "rewards/margins": 7.016053676605225, + "rewards/rejected": -9.726627349853516, + "step": 641 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 2.3331148256321246, + "learning_rate": 3.126631330646801e-07, + "logits/chosen": -1.5085358619689941, + "logits/rejected": -1.5857086181640625, + "logps/chosen": -37.31532669067383, + "logps/rejected": -60.988746643066406, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8923465013504028, + "rewards/margins": 8.49822998046875, + "rewards/rejected": -9.390576362609863, + "step": 642 + }, + { + "epoch": 3.8103703703703706, + "grad_norm": 2.3789903560495462, + "learning_rate": 3.120340338844516e-07, + "logits/chosen": -2.025918960571289, + "logits/rejected": -2.0642755031585693, + "logps/chosen": -41.12156677246094, + "logps/rejected": -59.014190673828125, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0245141983032227, + "rewards/margins": 8.251988410949707, + "rewards/rejected": -10.27650260925293, + "step": 643 + }, + { + "epoch": 3.8162962962962963, + "grad_norm": 2.6624718851606644, + "learning_rate": 3.1140451583647464e-07, + "logits/chosen": -1.790588617324829, + "logits/rejected": -1.8795886039733887, + "logps/chosen": -40.51091766357422, + "logps/rejected": -71.70164489746094, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3066911697387695, + "rewards/margins": 10.29784870147705, + "rewards/rejected": -12.60453987121582, + "step": 644 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 2.1636336873715973, + "learning_rate": 3.1077458317139677e-07, + "logits/chosen": -1.3567291498184204, + "logits/rejected": -1.442582368850708, + "logps/chosen": -37.33628845214844, + "logps/rejected": -50.815921783447266, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4248002767562866, + "rewards/margins": 6.379508018493652, + "rewards/rejected": -7.80430793762207, + "step": 645 + }, + { + "epoch": 3.828148148148148, + "grad_norm": 3.142146106508061, + "learning_rate": 3.1014424014266494e-07, + "logits/chosen": -1.6922850608825684, + "logits/rejected": -1.6728633642196655, + "logps/chosen": -32.17321014404297, + "logps/rejected": -57.95257568359375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2450605630874634, + "rewards/margins": 9.735218048095703, + "rewards/rejected": -10.980278015136719, + "step": 646 + }, + { + "epoch": 3.834074074074074, + "grad_norm": 1.0327148739028738, + "learning_rate": 3.095134910064971e-07, + "logits/chosen": -1.3771042823791504, + "logits/rejected": -1.573880672454834, + "logps/chosen": -50.5789680480957, + "logps/rejected": -55.373573303222656, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5233265161514282, + "rewards/margins": 7.985459327697754, + "rewards/rejected": -9.508785247802734, + "step": 647 + }, + { + "epoch": 3.84, + "grad_norm": 1.7197133806532463, + "learning_rate": 3.0888234002185325e-07, + "logits/chosen": -1.8431189060211182, + "logits/rejected": -1.7777403593063354, + "logps/chosen": -34.486351013183594, + "logps/rejected": -60.21614074707031, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4829984903335571, + "rewards/margins": 8.156380653381348, + "rewards/rejected": -9.639379501342773, + "step": 648 + }, + { + "epoch": 3.845925925925926, + "grad_norm": 1.7262987298671724, + "learning_rate": 3.082507914504068e-07, + "logits/chosen": -1.2412363290786743, + "logits/rejected": -1.2558114528656006, + "logps/chosen": -46.88982009887695, + "logps/rejected": -74.43185424804688, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0637423992156982, + "rewards/margins": 9.63477897644043, + "rewards/rejected": -11.69852066040039, + "step": 649 + }, + { + "epoch": 3.851851851851852, + "grad_norm": 3.642086296060839, + "learning_rate": 3.0761884955651563e-07, + "logits/chosen": -1.5971522331237793, + "logits/rejected": -1.6424740552902222, + "logps/chosen": -48.26805114746094, + "logps/rejected": -54.9079704284668, + "loss": 0.0131, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8122729659080505, + "rewards/margins": 7.315271377563477, + "rewards/rejected": -8.127544403076172, + "step": 650 + }, + { + "epoch": 3.8577777777777778, + "grad_norm": 2.5304696991003826, + "learning_rate": 3.069865186071938e-07, + "logits/chosen": -1.5003931522369385, + "logits/rejected": -1.357499599456787, + "logps/chosen": -38.52720642089844, + "logps/rejected": -66.80923461914062, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9663707613945007, + "rewards/margins": 9.103131294250488, + "rewards/rejected": -10.069501876831055, + "step": 651 + }, + { + "epoch": 3.863703703703704, + "grad_norm": 1.275499734191683, + "learning_rate": 3.0635380287208184e-07, + "logits/chosen": -2.0289735794067383, + "logits/rejected": -2.004368543624878, + "logps/chosen": -41.99580383300781, + "logps/rejected": -66.64373779296875, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.089265823364258, + "rewards/margins": 9.074212074279785, + "rewards/rejected": -11.163477897644043, + "step": 652 + }, + { + "epoch": 3.8696296296296295, + "grad_norm": 3.416542035435825, + "learning_rate": 3.057207066234188e-07, + "logits/chosen": -1.1870657205581665, + "logits/rejected": -1.2482651472091675, + "logps/chosen": -42.56634521484375, + "logps/rejected": -58.963680267333984, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7613425850868225, + "rewards/margins": 7.487583637237549, + "rewards/rejected": -8.248926162719727, + "step": 653 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 2.8088220635856005, + "learning_rate": 3.0508723413601296e-07, + "logits/chosen": -0.9739608764648438, + "logits/rejected": -0.9807726144790649, + "logps/chosen": -47.68108367919922, + "logps/rejected": -66.4764175415039, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5027942657470703, + "rewards/margins": 7.769590377807617, + "rewards/rejected": -10.272384643554688, + "step": 654 + }, + { + "epoch": 3.8814814814814813, + "grad_norm": 2.674068519159376, + "learning_rate": 3.0445338968721283e-07, + "logits/chosen": -1.8572252988815308, + "logits/rejected": -1.7199347019195557, + "logps/chosen": -51.708492279052734, + "logps/rejected": -80.59733581542969, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1489932537078857, + "rewards/margins": 10.27888298034668, + "rewards/rejected": -11.427876472473145, + "step": 655 + }, + { + "epoch": 3.8874074074074074, + "grad_norm": 2.9778285430298514, + "learning_rate": 3.0381917755687896e-07, + "logits/chosen": -1.8231042623519897, + "logits/rejected": -1.619692325592041, + "logps/chosen": -44.044952392578125, + "logps/rejected": -72.98948669433594, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5052106380462646, + "rewards/margins": 8.7330904006958, + "rewards/rejected": -11.238300323486328, + "step": 656 + }, + { + "epoch": 3.8933333333333335, + "grad_norm": 5.21161187358889, + "learning_rate": 3.0318460202735415e-07, + "logits/chosen": -1.2046602964401245, + "logits/rejected": -1.2679613828659058, + "logps/chosen": -36.49602508544922, + "logps/rejected": -59.53178024291992, + "loss": 0.0255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9609317779541016, + "rewards/margins": 7.4171247482299805, + "rewards/rejected": -9.378056526184082, + "step": 657 + }, + { + "epoch": 3.899259259259259, + "grad_norm": 0.901184093067749, + "learning_rate": 3.025496673834351e-07, + "logits/chosen": -1.6594105958938599, + "logits/rejected": -1.6217676401138306, + "logps/chosen": -46.861385345458984, + "logps/rejected": -63.68336486816406, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7903817892074585, + "rewards/margins": 8.82080364227295, + "rewards/rejected": -10.611185073852539, + "step": 658 + }, + { + "epoch": 3.9051851851851853, + "grad_norm": 4.509371487983056, + "learning_rate": 3.0191437791234335e-07, + "logits/chosen": -1.4345513582229614, + "logits/rejected": -1.4621853828430176, + "logps/chosen": -40.582069396972656, + "logps/rejected": -67.71416473388672, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.575059175491333, + "rewards/margins": 8.932149887084961, + "rewards/rejected": -10.507207870483398, + "step": 659 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 1.974158077967259, + "learning_rate": 3.0127873790369625e-07, + "logits/chosen": -1.837843656539917, + "logits/rejected": -1.8465306758880615, + "logps/chosen": -31.680309295654297, + "logps/rejected": -47.00579833984375, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8345265984535217, + "rewards/margins": 6.112514019012451, + "rewards/rejected": -6.94704008102417, + "step": 660 + }, + { + "epoch": 3.917037037037037, + "grad_norm": 3.9937805277174365, + "learning_rate": 3.006427516494781e-07, + "logits/chosen": -1.159081220626831, + "logits/rejected": -0.9563398957252502, + "logps/chosen": -33.920318603515625, + "logps/rejected": -63.24695587158203, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36665505170822144, + "rewards/margins": 8.001738548278809, + "rewards/rejected": -8.368393898010254, + "step": 661 + }, + { + "epoch": 3.9229629629629628, + "grad_norm": 3.017214815756595, + "learning_rate": 3.000064234440111e-07, + "logits/chosen": -1.8783760070800781, + "logits/rejected": -1.8840227127075195, + "logps/chosen": -46.433624267578125, + "logps/rejected": -66.72588348388672, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8653470873832703, + "rewards/margins": 7.810515403747559, + "rewards/rejected": -8.675862312316895, + "step": 662 + }, + { + "epoch": 3.928888888888889, + "grad_norm": 1.1211584130537804, + "learning_rate": 2.9936975758392644e-07, + "logits/chosen": -1.8524677753448486, + "logits/rejected": -1.9079548120498657, + "logps/chosen": -57.159034729003906, + "logps/rejected": -69.21312713623047, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6859822273254395, + "rewards/margins": 7.816958904266357, + "rewards/rejected": -10.502941131591797, + "step": 663 + }, + { + "epoch": 3.934814814814815, + "grad_norm": 2.5458746214899377, + "learning_rate": 2.9873275836813526e-07, + "logits/chosen": -1.7048072814941406, + "logits/rejected": -1.8171309232711792, + "logps/chosen": -48.66388702392578, + "logps/rejected": -63.1533203125, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7343381643295288, + "rewards/margins": 8.266016006469727, + "rewards/rejected": -10.000354766845703, + "step": 664 + }, + { + "epoch": 3.9407407407407407, + "grad_norm": 1.9736274445994668, + "learning_rate": 2.980954300977995e-07, + "logits/chosen": -1.619166374206543, + "logits/rejected": -1.5777629613876343, + "logps/chosen": -49.931640625, + "logps/rejected": -78.51671600341797, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6705405712127686, + "rewards/margins": 9.18619155883789, + "rewards/rejected": -12.856733322143555, + "step": 665 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 2.552365730581539, + "learning_rate": 2.974577770763028e-07, + "logits/chosen": -1.7241450548171997, + "logits/rejected": -1.6734381914138794, + "logps/chosen": -46.34965515136719, + "logps/rejected": -90.48927307128906, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3870890140533447, + "rewards/margins": 11.964576721191406, + "rewards/rejected": -13.351667404174805, + "step": 666 + }, + { + "epoch": 3.9525925925925924, + "grad_norm": 2.3305835074048704, + "learning_rate": 2.96819803609222e-07, + "logits/chosen": -1.9842724800109863, + "logits/rejected": -2.00046443939209, + "logps/chosen": -35.96994400024414, + "logps/rejected": -57.09709548950195, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.034604787826538, + "rewards/margins": 7.337852478027344, + "rewards/rejected": -8.372458457946777, + "step": 667 + }, + { + "epoch": 3.9585185185185185, + "grad_norm": 1.5958168958315395, + "learning_rate": 2.9618151400429735e-07, + "logits/chosen": -1.620787262916565, + "logits/rejected": -1.6079652309417725, + "logps/chosen": -42.559288024902344, + "logps/rejected": -65.05506134033203, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2509933710098267, + "rewards/margins": 8.53365421295166, + "rewards/rejected": -9.784646987915039, + "step": 668 + }, + { + "epoch": 3.964444444444444, + "grad_norm": 1.3718053352193202, + "learning_rate": 2.955429125714038e-07, + "logits/chosen": -1.5058695077896118, + "logits/rejected": -1.4394159317016602, + "logps/chosen": -37.802268981933594, + "logps/rejected": -67.28629302978516, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9949045777320862, + "rewards/margins": 8.948270797729492, + "rewards/rejected": -9.943174362182617, + "step": 669 + }, + { + "epoch": 3.9703703703703703, + "grad_norm": 1.3132523241213343, + "learning_rate": 2.949040036225218e-07, + "logits/chosen": -1.5042078495025635, + "logits/rejected": -1.624751091003418, + "logps/chosen": -54.53010940551758, + "logps/rejected": -74.37867736816406, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.157304286956787, + "rewards/margins": 9.978010177612305, + "rewards/rejected": -13.13531494140625, + "step": 670 + }, + { + "epoch": 3.9762962962962964, + "grad_norm": 1.3508609108880327, + "learning_rate": 2.9426479147170836e-07, + "logits/chosen": -1.6658778190612793, + "logits/rejected": -1.5220093727111816, + "logps/chosen": -35.21366500854492, + "logps/rejected": -65.32410430908203, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7612733840942383, + "rewards/margins": 9.228185653686523, + "rewards/rejected": -10.989458084106445, + "step": 671 + }, + { + "epoch": 3.982222222222222, + "grad_norm": 1.9276538943992374, + "learning_rate": 2.9362528043506767e-07, + "logits/chosen": -1.1979081630706787, + "logits/rejected": -1.2840179204940796, + "logps/chosen": -56.21266555786133, + "logps/rejected": -78.62033081054688, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.804077386856079, + "rewards/margins": 9.969850540161133, + "rewards/rejected": -12.773927688598633, + "step": 672 + }, + { + "epoch": 3.988148148148148, + "grad_norm": 2.7896360676711693, + "learning_rate": 2.929854748307221e-07, + "logits/chosen": -1.3991694450378418, + "logits/rejected": -1.437917709350586, + "logps/chosen": -44.147377014160156, + "logps/rejected": -65.40982818603516, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3406189680099487, + "rewards/margins": 9.758732795715332, + "rewards/rejected": -11.099350929260254, + "step": 673 + }, + { + "epoch": 3.9940740740740743, + "grad_norm": 2.80652860847146, + "learning_rate": 2.923453789787828e-07, + "logits/chosen": -1.7140244245529175, + "logits/rejected": -1.575933575630188, + "logps/chosen": -46.123146057128906, + "logps/rejected": -67.82181549072266, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8217127323150635, + "rewards/margins": 8.149543762207031, + "rewards/rejected": -9.971257209777832, + "step": 674 + }, + { + "epoch": 4.0, + "grad_norm": 3.2592488566875524, + "learning_rate": 2.9170499720132106e-07, + "logits/chosen": -1.7309939861297607, + "logits/rejected": -1.6547414064407349, + "logps/chosen": -50.30008316040039, + "logps/rejected": -86.93077850341797, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0141303539276123, + "rewards/margins": 9.713563919067383, + "rewards/rejected": -11.727693557739258, + "step": 675 + }, + { + "epoch": 4.005925925925926, + "grad_norm": 0.44616270427309485, + "learning_rate": 2.9106433382233877e-07, + "logits/chosen": -1.6794055700302124, + "logits/rejected": -1.6650766134262085, + "logps/chosen": -34.78199768066406, + "logps/rejected": -61.020729064941406, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.43512761592865, + "rewards/margins": 8.089323043823242, + "rewards/rejected": -9.524450302124023, + "step": 676 + }, + { + "epoch": 4.011851851851852, + "grad_norm": 0.341375032203737, + "learning_rate": 2.90423393167739e-07, + "logits/chosen": -1.6758373975753784, + "logits/rejected": -1.7946535348892212, + "logps/chosen": -53.309226989746094, + "logps/rejected": -90.131591796875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5563483238220215, + "rewards/margins": 12.359565734863281, + "rewards/rejected": -13.915914535522461, + "step": 677 + }, + { + "epoch": 4.017777777777778, + "grad_norm": 0.34700385447182475, + "learning_rate": 2.897821795652972e-07, + "logits/chosen": -1.9766027927398682, + "logits/rejected": -1.864401936531067, + "logps/chosen": -38.792755126953125, + "logps/rejected": -81.77716064453125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4709629714488983, + "rewards/margins": 11.713024139404297, + "rewards/rejected": -12.183988571166992, + "step": 678 + }, + { + "epoch": 4.023703703703704, + "grad_norm": 0.4037522038903148, + "learning_rate": 2.891406973446319e-07, + "logits/chosen": -1.5716941356658936, + "logits/rejected": -1.5570297241210938, + "logps/chosen": -60.495906829833984, + "logps/rejected": -77.89375305175781, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.105973958969116, + "rewards/margins": 8.402738571166992, + "rewards/rejected": -10.508712768554688, + "step": 679 + }, + { + "epoch": 4.029629629629629, + "grad_norm": 0.30335133752738797, + "learning_rate": 2.8849895083717536e-07, + "logits/chosen": -1.4498519897460938, + "logits/rejected": -1.3987611532211304, + "logps/chosen": -45.42414474487305, + "logps/rejected": -68.12642669677734, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8546907901763916, + "rewards/margins": 9.858153343200684, + "rewards/rejected": -12.712844848632812, + "step": 680 + }, + { + "epoch": 4.035555555555556, + "grad_norm": 0.5402501262528404, + "learning_rate": 2.8785694437614416e-07, + "logits/chosen": -1.4688129425048828, + "logits/rejected": -1.3592897653579712, + "logps/chosen": -41.556304931640625, + "logps/rejected": -67.09361267089844, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2745485305786133, + "rewards/margins": 8.337224006652832, + "rewards/rejected": -10.611772537231445, + "step": 681 + }, + { + "epoch": 4.0414814814814815, + "grad_norm": 0.557643458455523, + "learning_rate": 2.872146822965105e-07, + "logits/chosen": -1.367553472518921, + "logits/rejected": -1.0743577480316162, + "logps/chosen": -34.53441619873047, + "logps/rejected": -67.5462646484375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9292609691619873, + "rewards/margins": 9.131160736083984, + "rewards/rejected": -11.06042194366455, + "step": 682 + }, + { + "epoch": 4.047407407407407, + "grad_norm": 0.22131488540300684, + "learning_rate": 2.865721689349722e-07, + "logits/chosen": -1.3739020824432373, + "logits/rejected": -1.0636659860610962, + "logps/chosen": -43.13876724243164, + "logps/rejected": -79.29910278320312, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9893310070037842, + "rewards/margins": 10.191972732543945, + "rewards/rejected": -12.181304931640625, + "step": 683 + }, + { + "epoch": 4.053333333333334, + "grad_norm": 0.2563358598810667, + "learning_rate": 2.8592940862992415e-07, + "logits/chosen": -1.4154634475708008, + "logits/rejected": -1.2823362350463867, + "logps/chosen": -40.450199127197266, + "logps/rejected": -69.16603088378906, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.343075752258301, + "rewards/margins": 9.379046440124512, + "rewards/rejected": -11.722122192382812, + "step": 684 + }, + { + "epoch": 4.059259259259259, + "grad_norm": 0.4045380885099888, + "learning_rate": 2.8528640572142835e-07, + "logits/chosen": -1.6875383853912354, + "logits/rejected": -1.620922565460205, + "logps/chosen": -32.78309631347656, + "logps/rejected": -56.436370849609375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2603614330291748, + "rewards/margins": 9.30524730682373, + "rewards/rejected": -10.565608978271484, + "step": 685 + }, + { + "epoch": 4.065185185185185, + "grad_norm": 0.38515890207527487, + "learning_rate": 2.846431645511851e-07, + "logits/chosen": -1.4085663557052612, + "logits/rejected": -1.379919409751892, + "logps/chosen": -36.84951400756836, + "logps/rejected": -67.17756652832031, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5410351753234863, + "rewards/margins": 10.908300399780273, + "rewards/rejected": -12.449335098266602, + "step": 686 + }, + { + "epoch": 4.071111111111111, + "grad_norm": 1.1313449143022392, + "learning_rate": 2.839996894625037e-07, + "logits/chosen": -1.6871917247772217, + "logits/rejected": -1.337934136390686, + "logps/chosen": -42.18588638305664, + "logps/rejected": -82.45413970947266, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5758676528930664, + "rewards/margins": 11.01657485961914, + "rewards/rejected": -13.592442512512207, + "step": 687 + }, + { + "epoch": 4.077037037037037, + "grad_norm": 0.7964723232122505, + "learning_rate": 2.8335598480027224e-07, + "logits/chosen": -1.4245645999908447, + "logits/rejected": -1.473569393157959, + "logps/chosen": -54.070159912109375, + "logps/rejected": -68.75679016113281, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.470885753631592, + "rewards/margins": 7.473635673522949, + "rewards/rejected": -9.944520950317383, + "step": 688 + }, + { + "epoch": 4.082962962962963, + "grad_norm": 0.3792283961142719, + "learning_rate": 2.8271205491092963e-07, + "logits/chosen": -1.634963870048523, + "logits/rejected": -1.5209393501281738, + "logps/chosen": -38.38444519042969, + "logps/rejected": -76.35704803466797, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5493276119232178, + "rewards/margins": 12.205493927001953, + "rewards/rejected": -13.75482177734375, + "step": 689 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 0.3997631224622663, + "learning_rate": 2.820679041424352e-07, + "logits/chosen": -1.3238720893859863, + "logits/rejected": -1.2506842613220215, + "logps/chosen": -30.587196350097656, + "logps/rejected": -53.626861572265625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9008562564849854, + "rewards/margins": 7.825616836547852, + "rewards/rejected": -9.726472854614258, + "step": 690 + }, + { + "epoch": 4.094814814814815, + "grad_norm": 0.4811894590769858, + "learning_rate": 2.814235368442398e-07, + "logits/chosen": -1.9758150577545166, + "logits/rejected": -1.9664149284362793, + "logps/chosen": -52.41566467285156, + "logps/rejected": -81.12825775146484, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.485563278198242, + "rewards/margins": 9.805296897888184, + "rewards/rejected": -12.290860176086426, + "step": 691 + }, + { + "epoch": 4.100740740740741, + "grad_norm": 0.4049087658472698, + "learning_rate": 2.8077895736725647e-07, + "logits/chosen": -1.5128250122070312, + "logits/rejected": -1.4495248794555664, + "logps/chosen": -47.274620056152344, + "logps/rejected": -81.28178405761719, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5623488426208496, + "rewards/margins": 10.90963077545166, + "rewards/rejected": -13.471979141235352, + "step": 692 + }, + { + "epoch": 4.1066666666666665, + "grad_norm": 0.7133811245985797, + "learning_rate": 2.801341700638307e-07, + "logits/chosen": -1.385081171989441, + "logits/rejected": -1.4335522651672363, + "logps/chosen": -53.6270751953125, + "logps/rejected": -71.13097381591797, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.143235683441162, + "rewards/margins": 9.599444389343262, + "rewards/rejected": -12.742679595947266, + "step": 693 + }, + { + "epoch": 4.112592592592593, + "grad_norm": 0.4479230736646601, + "learning_rate": 2.7948917928771153e-07, + "logits/chosen": -1.4062227010726929, + "logits/rejected": -1.4977256059646606, + "logps/chosen": -42.570579528808594, + "logps/rejected": -71.04094696044922, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6300326585769653, + "rewards/margins": 10.64380931854248, + "rewards/rejected": -12.273841857910156, + "step": 694 + }, + { + "epoch": 4.118518518518519, + "grad_norm": 0.2811534329638401, + "learning_rate": 2.7884398939402156e-07, + "logits/chosen": -1.6848721504211426, + "logits/rejected": -1.6964337825775146, + "logps/chosen": -38.059608459472656, + "logps/rejected": -56.316627502441406, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5647661685943604, + "rewards/margins": 9.532676696777344, + "rewards/rejected": -11.097443580627441, + "step": 695 + }, + { + "epoch": 4.124444444444444, + "grad_norm": 0.5476766976057272, + "learning_rate": 2.78198604739228e-07, + "logits/chosen": -1.293939471244812, + "logits/rejected": -1.3563249111175537, + "logps/chosen": -46.362632751464844, + "logps/rejected": -52.52191162109375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0411702394485474, + "rewards/margins": 8.045003890991211, + "rewards/rejected": -9.086174011230469, + "step": 696 + }, + { + "epoch": 4.13037037037037, + "grad_norm": 0.28241806387271107, + "learning_rate": 2.7755302968111346e-07, + "logits/chosen": -2.09261155128479, + "logits/rejected": -2.1400394439697266, + "logps/chosen": -49.53968811035156, + "logps/rejected": -97.72592163085938, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2412805557250977, + "rewards/margins": 13.639833450317383, + "rewards/rejected": -14.88111400604248, + "step": 697 + }, + { + "epoch": 4.136296296296297, + "grad_norm": 0.3964227629392675, + "learning_rate": 2.7690726857874564e-07, + "logits/chosen": -1.924755573272705, + "logits/rejected": -1.8481194972991943, + "logps/chosen": -40.84855270385742, + "logps/rejected": -65.64152526855469, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4446959495544434, + "rewards/margins": 10.051863670349121, + "rewards/rejected": -11.496559143066406, + "step": 698 + }, + { + "epoch": 4.142222222222222, + "grad_norm": 0.18798709520475315, + "learning_rate": 2.7626132579244893e-07, + "logits/chosen": -1.5864109992980957, + "logits/rejected": -1.5083001852035522, + "logps/chosen": -42.136207580566406, + "logps/rejected": -73.30389404296875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3849995136260986, + "rewards/margins": 9.932413101196289, + "rewards/rejected": -12.317412376403809, + "step": 699 + }, + { + "epoch": 4.148148148148148, + "grad_norm": 0.31778139923542953, + "learning_rate": 2.756152056837743e-07, + "logits/chosen": -1.566931128501892, + "logits/rejected": -1.6499443054199219, + "logps/chosen": -47.97265625, + "logps/rejected": -62.361839294433594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.526698589324951, + "rewards/margins": 9.130287170410156, + "rewards/rejected": -11.656986236572266, + "step": 700 + }, + { + "epoch": 4.1540740740740745, + "grad_norm": 0.27186160437951046, + "learning_rate": 2.749689126154698e-07, + "logits/chosen": -1.5107808113098145, + "logits/rejected": -1.403849482536316, + "logps/chosen": -34.08753967285156, + "logps/rejected": -59.909873962402344, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3250665664672852, + "rewards/margins": 8.911955833435059, + "rewards/rejected": -10.237022399902344, + "step": 701 + }, + { + "epoch": 4.16, + "grad_norm": 0.5485812218119803, + "learning_rate": 2.743224509514519e-07, + "logits/chosen": -1.731938362121582, + "logits/rejected": -1.6260651350021362, + "logps/chosen": -43.674652099609375, + "logps/rejected": -76.96273803710938, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.195899724960327, + "rewards/margins": 11.439129829406738, + "rewards/rejected": -14.635029792785645, + "step": 702 + }, + { + "epoch": 4.165925925925926, + "grad_norm": 0.4031706245145299, + "learning_rate": 2.73675825056775e-07, + "logits/chosen": -1.7939112186431885, + "logits/rejected": -1.9431332349777222, + "logps/chosen": -45.181190490722656, + "logps/rejected": -61.67738342285156, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5738084316253662, + "rewards/margins": 9.546890258789062, + "rewards/rejected": -10.120698928833008, + "step": 703 + }, + { + "epoch": 4.1718518518518515, + "grad_norm": 0.3689385733663178, + "learning_rate": 2.730290392976025e-07, + "logits/chosen": -1.2359377145767212, + "logits/rejected": -1.576162338256836, + "logps/chosen": -52.07234191894531, + "logps/rejected": -66.13168334960938, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4633660316467285, + "rewards/margins": 9.161190032958984, + "rewards/rejected": -11.624555587768555, + "step": 704 + }, + { + "epoch": 4.177777777777778, + "grad_norm": 0.2884304692703736, + "learning_rate": 2.723820980411774e-07, + "logits/chosen": -1.401477575302124, + "logits/rejected": -1.3764700889587402, + "logps/chosen": -37.40475082397461, + "logps/rejected": -61.718605041503906, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7576137781143188, + "rewards/margins": 8.915960311889648, + "rewards/rejected": -10.67357349395752, + "step": 705 + }, + { + "epoch": 4.183703703703704, + "grad_norm": 0.40859441414556874, + "learning_rate": 2.7173500565579256e-07, + "logits/chosen": -1.9278970956802368, + "logits/rejected": -1.8463729619979858, + "logps/chosen": -56.57386779785156, + "logps/rejected": -95.51248168945312, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.846968173980713, + "rewards/margins": 13.251940727233887, + "rewards/rejected": -17.098907470703125, + "step": 706 + }, + { + "epoch": 4.189629629629629, + "grad_norm": 0.1915169354428314, + "learning_rate": 2.7108776651076116e-07, + "logits/chosen": -1.8610478639602661, + "logits/rejected": -1.8643684387207031, + "logps/chosen": -34.90126419067383, + "logps/rejected": -67.45941162109375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8161227703094482, + "rewards/margins": 10.358243942260742, + "rewards/rejected": -12.174367904663086, + "step": 707 + }, + { + "epoch": 4.195555555555556, + "grad_norm": 0.6811289236407966, + "learning_rate": 2.704403849763878e-07, + "logits/chosen": -1.4927406311035156, + "logits/rejected": -1.4667131900787354, + "logps/chosen": -46.89321517944336, + "logps/rejected": -73.92708587646484, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.232743740081787, + "rewards/margins": 9.66046142578125, + "rewards/rejected": -11.893203735351562, + "step": 708 + }, + { + "epoch": 4.201481481481482, + "grad_norm": 0.7640135263515507, + "learning_rate": 2.697928654239378e-07, + "logits/chosen": -1.563415288925171, + "logits/rejected": -1.6332391500473022, + "logps/chosen": -40.70332717895508, + "logps/rejected": -60.507545471191406, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8509180545806885, + "rewards/margins": 8.442529678344727, + "rewards/rejected": -10.293449401855469, + "step": 709 + }, + { + "epoch": 4.207407407407407, + "grad_norm": 0.6091245161088805, + "learning_rate": 2.6914521222560907e-07, + "logits/chosen": -1.6223444938659668, + "logits/rejected": -1.419754981994629, + "logps/chosen": -50.94943618774414, + "logps/rejected": -81.05281829833984, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3365554809570312, + "rewards/margins": 10.70995044708252, + "rewards/rejected": -13.046506881713867, + "step": 710 + }, + { + "epoch": 4.213333333333333, + "grad_norm": 0.49108019369046707, + "learning_rate": 2.6849742975450163e-07, + "logits/chosen": -1.5237915515899658, + "logits/rejected": -1.3569711446762085, + "logps/chosen": -46.981380462646484, + "logps/rejected": -74.44270324707031, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2535314559936523, + "rewards/margins": 9.97585391998291, + "rewards/rejected": -11.229385375976562, + "step": 711 + }, + { + "epoch": 4.2192592592592595, + "grad_norm": 0.5511110163173969, + "learning_rate": 2.6784952238458824e-07, + "logits/chosen": -1.4845302104949951, + "logits/rejected": -1.4548472166061401, + "logps/chosen": -42.92816162109375, + "logps/rejected": -69.95036315917969, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3284811973571777, + "rewards/margins": 10.633824348449707, + "rewards/rejected": -12.962305068969727, + "step": 712 + }, + { + "epoch": 4.225185185185185, + "grad_norm": 0.6631086415739061, + "learning_rate": 2.672014944906854e-07, + "logits/chosen": -1.6981308460235596, + "logits/rejected": -1.5001087188720703, + "logps/chosen": -46.60968017578125, + "logps/rejected": -88.11974334716797, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.792693853378296, + "rewards/margins": 11.876981735229492, + "rewards/rejected": -13.669676780700684, + "step": 713 + }, + { + "epoch": 4.231111111111111, + "grad_norm": 0.23031970682456634, + "learning_rate": 2.665533504484231e-07, + "logits/chosen": -1.549435019493103, + "logits/rejected": -1.5263104438781738, + "logps/chosen": -45.03104782104492, + "logps/rejected": -68.78125762939453, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.494513511657715, + "rewards/margins": 10.230485916137695, + "rewards/rejected": -13.72499942779541, + "step": 714 + }, + { + "epoch": 4.237037037037037, + "grad_norm": 0.4011643778274213, + "learning_rate": 2.6590509463421573e-07, + "logits/chosen": -2.088960647583008, + "logits/rejected": -1.963022232055664, + "logps/chosen": -37.30885314941406, + "logps/rejected": -71.3285903930664, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.178884506225586, + "rewards/margins": 10.42874526977539, + "rewards/rejected": -12.607630729675293, + "step": 715 + }, + { + "epoch": 4.242962962962963, + "grad_norm": 0.4448671943062079, + "learning_rate": 2.6525673142523217e-07, + "logits/chosen": -1.758203148841858, + "logits/rejected": -1.5110293626785278, + "logps/chosen": -55.106895446777344, + "logps/rejected": -96.56599426269531, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7676544189453125, + "rewards/margins": 13.347644805908203, + "rewards/rejected": -17.115299224853516, + "step": 716 + }, + { + "epoch": 4.248888888888889, + "grad_norm": 0.4172731802131882, + "learning_rate": 2.646082651993668e-07, + "logits/chosen": -2.040677070617676, + "logits/rejected": -2.044556140899658, + "logps/chosen": -45.693824768066406, + "logps/rejected": -64.56766510009766, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5376042127609253, + "rewards/margins": 9.785834312438965, + "rewards/rejected": -11.32343864440918, + "step": 717 + }, + { + "epoch": 4.254814814814814, + "grad_norm": 0.4123890195750414, + "learning_rate": 2.6395970033520944e-07, + "logits/chosen": -1.487902283668518, + "logits/rejected": -1.4854258298873901, + "logps/chosen": -50.966705322265625, + "logps/rejected": -65.53778076171875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6284089088439941, + "rewards/margins": 8.387168884277344, + "rewards/rejected": -10.01557731628418, + "step": 718 + }, + { + "epoch": 4.260740740740741, + "grad_norm": 1.6566065496557807, + "learning_rate": 2.6331104121201575e-07, + "logits/chosen": -1.9715131521224976, + "logits/rejected": -1.799248456954956, + "logps/chosen": -50.722068786621094, + "logps/rejected": -91.66192626953125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7970070838928223, + "rewards/margins": 9.966672897338867, + "rewards/rejected": -13.763680458068848, + "step": 719 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.39476587023081217, + "learning_rate": 2.626622922096782e-07, + "logits/chosen": -1.5378649234771729, + "logits/rejected": -1.4630424976348877, + "logps/chosen": -46.547203063964844, + "logps/rejected": -80.9116439819336, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3771755695343018, + "rewards/margins": 11.267572402954102, + "rewards/rejected": -13.644747734069824, + "step": 720 + }, + { + "epoch": 4.272592592592592, + "grad_norm": 0.5669746583387688, + "learning_rate": 2.6201345770869584e-07, + "logits/chosen": -1.5094960927963257, + "logits/rejected": -1.3966280221939087, + "logps/chosen": -40.22157287597656, + "logps/rejected": -72.3768539428711, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9138059616088867, + "rewards/margins": 11.016143798828125, + "rewards/rejected": -12.929948806762695, + "step": 721 + }, + { + "epoch": 4.278518518518519, + "grad_norm": 0.6436582390962847, + "learning_rate": 2.6136454209014513e-07, + "logits/chosen": -1.6182851791381836, + "logits/rejected": -1.5757420063018799, + "logps/chosen": -47.499351501464844, + "logps/rejected": -72.69096374511719, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6912693977355957, + "rewards/margins": 9.962089538574219, + "rewards/rejected": -12.653359413146973, + "step": 722 + }, + { + "epoch": 4.2844444444444445, + "grad_norm": 0.3075696401766946, + "learning_rate": 2.6071554973565036e-07, + "logits/chosen": -1.0639721155166626, + "logits/rejected": -1.0852502584457397, + "logps/chosen": -40.48524475097656, + "logps/rejected": -65.24382019042969, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.40928316116333, + "rewards/margins": 10.195439338684082, + "rewards/rejected": -12.604722023010254, + "step": 723 + }, + { + "epoch": 4.29037037037037, + "grad_norm": 0.339129929102594, + "learning_rate": 2.600664850273538e-07, + "logits/chosen": -1.5270686149597168, + "logits/rejected": -1.4607093334197998, + "logps/chosen": -52.461341857910156, + "logps/rejected": -70.19440460205078, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4749996662139893, + "rewards/margins": 9.578967094421387, + "rewards/rejected": -12.053966522216797, + "step": 724 + }, + { + "epoch": 4.296296296296296, + "grad_norm": 0.4827892185298997, + "learning_rate": 2.594173523478864e-07, + "logits/chosen": -1.8119120597839355, + "logits/rejected": -1.6746340990066528, + "logps/chosen": -37.49448013305664, + "logps/rejected": -69.68302917480469, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0536510944366455, + "rewards/margins": 10.237602233886719, + "rewards/rejected": -13.291254043579102, + "step": 725 + }, + { + "epoch": 4.302222222222222, + "grad_norm": 0.4377884100324364, + "learning_rate": 2.587681560803379e-07, + "logits/chosen": -1.5027875900268555, + "logits/rejected": -1.6284228563308716, + "logps/chosen": -45.38482666015625, + "logps/rejected": -71.05689239501953, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0713844299316406, + "rewards/margins": 10.261495590209961, + "rewards/rejected": -13.332880020141602, + "step": 726 + }, + { + "epoch": 4.308148148148148, + "grad_norm": 0.288174657644559, + "learning_rate": 2.5811890060822754e-07, + "logits/chosen": -1.5329185724258423, + "logits/rejected": -1.6217219829559326, + "logps/chosen": -54.94902038574219, + "logps/rejected": -75.75360870361328, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8438515663146973, + "rewards/margins": 11.377456665039062, + "rewards/rejected": -13.221307754516602, + "step": 727 + }, + { + "epoch": 4.314074074074074, + "grad_norm": 0.3568800731107457, + "learning_rate": 2.574695903154744e-07, + "logits/chosen": -1.2863966226577759, + "logits/rejected": -1.3943865299224854, + "logps/chosen": -52.2459716796875, + "logps/rejected": -68.17107391357422, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.828482151031494, + "rewards/margins": 9.69110107421875, + "rewards/rejected": -12.519582748413086, + "step": 728 + }, + { + "epoch": 4.32, + "grad_norm": 0.49248567873103494, + "learning_rate": 2.5682022958636753e-07, + "logits/chosen": -1.5085232257843018, + "logits/rejected": -1.2503995895385742, + "logps/chosen": -38.498687744140625, + "logps/rejected": -76.63363647460938, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.512385368347168, + "rewards/margins": 11.470130920410156, + "rewards/rejected": -13.98251724243164, + "step": 729 + }, + { + "epoch": 4.325925925925926, + "grad_norm": 0.3106187542124455, + "learning_rate": 2.5617082280553655e-07, + "logits/chosen": -1.7640349864959717, + "logits/rejected": -1.7682902812957764, + "logps/chosen": -41.87824249267578, + "logps/rejected": -69.6838150024414, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.774226427078247, + "rewards/margins": 8.697710990905762, + "rewards/rejected": -11.47193717956543, + "step": 730 + }, + { + "epoch": 4.331851851851852, + "grad_norm": 0.2636700118454411, + "learning_rate": 2.5552137435792215e-07, + "logits/chosen": -1.506960391998291, + "logits/rejected": -1.710749626159668, + "logps/chosen": -52.51408004760742, + "logps/rejected": -69.61265563964844, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9004751443862915, + "rewards/margins": 10.869614601135254, + "rewards/rejected": -12.770090103149414, + "step": 731 + }, + { + "epoch": 4.337777777777778, + "grad_norm": 0.3296961452649938, + "learning_rate": 2.5487188862874633e-07, + "logits/chosen": -1.400599718093872, + "logits/rejected": -1.525638461112976, + "logps/chosen": -39.74789810180664, + "logps/rejected": -74.80589294433594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5468039512634277, + "rewards/margins": 10.360738754272461, + "rewards/rejected": -12.907543182373047, + "step": 732 + }, + { + "epoch": 4.343703703703704, + "grad_norm": 0.2820919892356091, + "learning_rate": 2.542223700034827e-07, + "logits/chosen": -1.4439010620117188, + "logits/rejected": -1.4324991703033447, + "logps/chosen": -35.00661087036133, + "logps/rejected": -68.59481811523438, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4738209247589111, + "rewards/margins": 10.527488708496094, + "rewards/rejected": -12.001309394836426, + "step": 733 + }, + { + "epoch": 4.3496296296296295, + "grad_norm": 0.36821571256857155, + "learning_rate": 2.535728228678273e-07, + "logits/chosen": -1.9044547080993652, + "logits/rejected": -1.7454307079315186, + "logps/chosen": -41.30238723754883, + "logps/rejected": -70.7161865234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1526682376861572, + "rewards/margins": 9.870889663696289, + "rewards/rejected": -12.023557662963867, + "step": 734 + }, + { + "epoch": 4.355555555555555, + "grad_norm": 0.5309724031670829, + "learning_rate": 2.529232516076684e-07, + "logits/chosen": -1.8040151596069336, + "logits/rejected": -1.8421554565429688, + "logps/chosen": -34.45152282714844, + "logps/rejected": -64.11228942871094, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7052326202392578, + "rewards/margins": 9.953506469726562, + "rewards/rejected": -11.65873908996582, + "step": 735 + }, + { + "epoch": 4.361481481481482, + "grad_norm": 0.5620423781992994, + "learning_rate": 2.522736606090572e-07, + "logits/chosen": -1.9946095943450928, + "logits/rejected": -1.9713982343673706, + "logps/chosen": -47.492515563964844, + "logps/rejected": -74.77565002441406, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4397852420806885, + "rewards/margins": 10.341421127319336, + "rewards/rejected": -12.781206130981445, + "step": 736 + }, + { + "epoch": 4.367407407407407, + "grad_norm": 0.5850268258670716, + "learning_rate": 2.5162405425817804e-07, + "logits/chosen": -1.7595919370651245, + "logits/rejected": -1.523133397102356, + "logps/chosen": -38.44977569580078, + "logps/rejected": -75.89402770996094, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9519128799438477, + "rewards/margins": 10.567989349365234, + "rewards/rejected": -13.519901275634766, + "step": 737 + }, + { + "epoch": 4.373333333333333, + "grad_norm": 0.36409290406422345, + "learning_rate": 2.5097443694131944e-07, + "logits/chosen": -1.910279393196106, + "logits/rejected": -1.6399755477905273, + "logps/chosen": -49.62364196777344, + "logps/rejected": -94.20441436767578, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5087497234344482, + "rewards/margins": 13.17677116394043, + "rewards/rejected": -15.68552017211914, + "step": 738 + }, + { + "epoch": 4.37925925925926, + "grad_norm": 0.27280875284841666, + "learning_rate": 2.503248130448434e-07, + "logits/chosen": -1.222158432006836, + "logits/rejected": -1.1858694553375244, + "logps/chosen": -41.11689758300781, + "logps/rejected": -69.91155242919922, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.33315372467041, + "rewards/margins": 10.658595085144043, + "rewards/rejected": -13.991748809814453, + "step": 739 + }, + { + "epoch": 4.385185185185185, + "grad_norm": 0.48153543008194893, + "learning_rate": 2.496751869551567e-07, + "logits/chosen": -1.213226079940796, + "logits/rejected": -1.2117483615875244, + "logps/chosen": -53.64634704589844, + "logps/rejected": -79.62733459472656, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.945598840713501, + "rewards/margins": 9.169174194335938, + "rewards/rejected": -12.11477279663086, + "step": 740 + }, + { + "epoch": 4.391111111111111, + "grad_norm": 0.3260449818828276, + "learning_rate": 2.4902556305868064e-07, + "logits/chosen": -1.6482226848602295, + "logits/rejected": -1.369916558265686, + "logps/chosen": -49.26082992553711, + "logps/rejected": -80.69975280761719, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.141822338104248, + "rewards/margins": 10.516618728637695, + "rewards/rejected": -14.658441543579102, + "step": 741 + }, + { + "epoch": 4.397037037037037, + "grad_norm": 0.5688273128014634, + "learning_rate": 2.4837594574182194e-07, + "logits/chosen": -1.4720345735549927, + "logits/rejected": -1.4317636489868164, + "logps/chosen": -47.89663314819336, + "logps/rejected": -64.76475524902344, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.222043037414551, + "rewards/margins": 7.936434745788574, + "rewards/rejected": -11.158477783203125, + "step": 742 + }, + { + "epoch": 4.402962962962963, + "grad_norm": 0.7988128886724679, + "learning_rate": 2.477263393909429e-07, + "logits/chosen": -1.9475209712982178, + "logits/rejected": -1.9408725500106812, + "logps/chosen": -41.1652717590332, + "logps/rejected": -73.0821533203125, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.581312656402588, + "rewards/margins": 10.089584350585938, + "rewards/rejected": -12.670897483825684, + "step": 743 + }, + { + "epoch": 4.408888888888889, + "grad_norm": 0.6956004735540405, + "learning_rate": 2.4707674839233165e-07, + "logits/chosen": -1.6389610767364502, + "logits/rejected": -1.5735392570495605, + "logps/chosen": -39.5618782043457, + "logps/rejected": -75.9864501953125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.017786979675293, + "rewards/margins": 12.282722473144531, + "rewards/rejected": -15.30051040649414, + "step": 744 + }, + { + "epoch": 4.4148148148148145, + "grad_norm": 0.2740055720964037, + "learning_rate": 2.4642717713217266e-07, + "logits/chosen": -1.5863927602767944, + "logits/rejected": -1.5586270093917847, + "logps/chosen": -51.11642837524414, + "logps/rejected": -78.3765869140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.301530361175537, + "rewards/margins": 11.2866792678833, + "rewards/rejected": -15.588210105895996, + "step": 745 + }, + { + "epoch": 4.420740740740741, + "grad_norm": 0.5552007613739203, + "learning_rate": 2.4577762999651727e-07, + "logits/chosen": -1.665607213973999, + "logits/rejected": -1.6078526973724365, + "logps/chosen": -40.21504211425781, + "logps/rejected": -80.14183807373047, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7667739391326904, + "rewards/margins": 10.827664375305176, + "rewards/rejected": -13.594437599182129, + "step": 746 + }, + { + "epoch": 4.426666666666667, + "grad_norm": 0.32790522815064826, + "learning_rate": 2.451281113712537e-07, + "logits/chosen": -1.6151583194732666, + "logits/rejected": -1.5450541973114014, + "logps/chosen": -37.39631271362305, + "logps/rejected": -68.37910461425781, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6576449871063232, + "rewards/margins": 9.498502731323242, + "rewards/rejected": -12.156147003173828, + "step": 747 + }, + { + "epoch": 4.432592592592592, + "grad_norm": 0.3278713822461682, + "learning_rate": 2.4447862564207783e-07, + "logits/chosen": -1.6984974145889282, + "logits/rejected": -1.4929834604263306, + "logps/chosen": -41.737998962402344, + "logps/rejected": -82.5109634399414, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1655399799346924, + "rewards/margins": 11.867754936218262, + "rewards/rejected": -14.033292770385742, + "step": 748 + }, + { + "epoch": 4.438518518518519, + "grad_norm": 0.2728427147646305, + "learning_rate": 2.438291771944635e-07, + "logits/chosen": -1.3253810405731201, + "logits/rejected": -1.304832935333252, + "logps/chosen": -32.45439529418945, + "logps/rejected": -64.35618591308594, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.709897518157959, + "rewards/margins": 9.905057907104492, + "rewards/rejected": -11.61495590209961, + "step": 749 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.2992721613416659, + "learning_rate": 2.431797704136325e-07, + "logits/chosen": -1.3873847723007202, + "logits/rejected": -1.1295723915100098, + "logps/chosen": -37.542747497558594, + "logps/rejected": -80.18496704101562, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6178274154663086, + "rewards/margins": 10.947952270507812, + "rewards/rejected": -13.565778732299805, + "step": 750 + }, + { + "epoch": 4.45037037037037, + "grad_norm": 0.23217219343574805, + "learning_rate": 2.425304096845256e-07, + "logits/chosen": -2.1051247119903564, + "logits/rejected": -2.2890005111694336, + "logps/chosen": -71.4691162109375, + "logps/rejected": -85.68158721923828, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.611448287963867, + "rewards/margins": 10.625432968139648, + "rewards/rejected": -15.236881256103516, + "step": 751 + }, + { + "epoch": 4.456296296296296, + "grad_norm": 0.31223869571923357, + "learning_rate": 2.4188109939177244e-07, + "logits/chosen": -1.3819341659545898, + "logits/rejected": -1.1680090427398682, + "logps/chosen": -46.19057846069336, + "logps/rejected": -78.53694152832031, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2963953018188477, + "rewards/margins": 10.76588249206543, + "rewards/rejected": -13.062276840209961, + "step": 752 + }, + { + "epoch": 4.4622222222222225, + "grad_norm": 0.5598617582374887, + "learning_rate": 2.412318439196621e-07, + "logits/chosen": -1.2198549509048462, + "logits/rejected": -1.4387001991271973, + "logps/chosen": -49.030216217041016, + "logps/rejected": -57.43315124511719, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.343151092529297, + "rewards/margins": 8.80035400390625, + "rewards/rejected": -11.143505096435547, + "step": 753 + }, + { + "epoch": 4.468148148148148, + "grad_norm": 0.388327433841169, + "learning_rate": 2.405826476521137e-07, + "logits/chosen": -1.7963589429855347, + "logits/rejected": -1.6334545612335205, + "logps/chosen": -42.468994140625, + "logps/rejected": -81.61261749267578, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.233997106552124, + "rewards/margins": 11.50871467590332, + "rewards/rejected": -14.742712020874023, + "step": 754 + }, + { + "epoch": 4.474074074074074, + "grad_norm": 0.36873722546852566, + "learning_rate": 2.399335149726463e-07, + "logits/chosen": -1.864285945892334, + "logits/rejected": -1.8117425441741943, + "logps/chosen": -43.57659149169922, + "logps/rejected": -61.182159423828125, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8054585456848145, + "rewards/margins": 8.73664665222168, + "rewards/rejected": -11.542104721069336, + "step": 755 + }, + { + "epoch": 4.48, + "grad_norm": 0.9656825907418681, + "learning_rate": 2.392844502643497e-07, + "logits/chosen": -1.5215977430343628, + "logits/rejected": -1.389098882675171, + "logps/chosen": -42.34136199951172, + "logps/rejected": -69.96714782714844, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9610204696655273, + "rewards/margins": 10.299530982971191, + "rewards/rejected": -12.260551452636719, + "step": 756 + }, + { + "epoch": 4.485925925925926, + "grad_norm": 0.25342334021643204, + "learning_rate": 2.3863545790985485e-07, + "logits/chosen": -1.7879868745803833, + "logits/rejected": -1.884661078453064, + "logps/chosen": -49.11554718017578, + "logps/rejected": -73.60369110107422, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.804800271987915, + "rewards/margins": 10.777206420898438, + "rewards/rejected": -13.582006454467773, + "step": 757 + }, + { + "epoch": 4.491851851851852, + "grad_norm": 0.4088130548067966, + "learning_rate": 2.379865422913042e-07, + "logits/chosen": -1.6179802417755127, + "logits/rejected": -1.5631730556488037, + "logps/chosen": -38.03044509887695, + "logps/rejected": -74.8770980834961, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0774755477905273, + "rewards/margins": 10.472331047058105, + "rewards/rejected": -13.549808502197266, + "step": 758 + }, + { + "epoch": 4.497777777777777, + "grad_norm": 0.24621046378728606, + "learning_rate": 2.3733770779032184e-07, + "logits/chosen": -1.0918471813201904, + "logits/rejected": -1.4053854942321777, + "logps/chosen": -44.596763610839844, + "logps/rejected": -68.87012481689453, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8497509956359863, + "rewards/margins": 11.430082321166992, + "rewards/rejected": -13.27983283996582, + "step": 759 + }, + { + "epoch": 4.503703703703704, + "grad_norm": 0.4039328343347822, + "learning_rate": 2.3668895878798423e-07, + "logits/chosen": -1.666556477546692, + "logits/rejected": -1.495963454246521, + "logps/chosen": -35.45246505737305, + "logps/rejected": -56.131202697753906, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8134236335754395, + "rewards/margins": 7.402918338775635, + "rewards/rejected": -8.216341972351074, + "step": 760 + }, + { + "epoch": 4.50962962962963, + "grad_norm": 0.6762734783611634, + "learning_rate": 2.360402996647906e-07, + "logits/chosen": -1.7931262254714966, + "logits/rejected": -1.6433568000793457, + "logps/chosen": -49.27035903930664, + "logps/rejected": -93.5889892578125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.803044080734253, + "rewards/margins": 13.115455627441406, + "rewards/rejected": -16.918498992919922, + "step": 761 + }, + { + "epoch": 4.515555555555555, + "grad_norm": 0.6202135342855698, + "learning_rate": 2.3539173480063318e-07, + "logits/chosen": -1.5122251510620117, + "logits/rejected": -1.5641227960586548, + "logps/chosen": -51.571006774902344, + "logps/rejected": -71.65675354003906, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.491517066955566, + "rewards/margins": 8.394001007080078, + "rewards/rejected": -12.885517120361328, + "step": 762 + }, + { + "epoch": 4.521481481481482, + "grad_norm": 0.5983718330378733, + "learning_rate": 2.3474326857476783e-07, + "logits/chosen": -1.9533593654632568, + "logits/rejected": -1.8577436208724976, + "logps/chosen": -40.418888092041016, + "logps/rejected": -72.35916137695312, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6917667388916016, + "rewards/margins": 11.463409423828125, + "rewards/rejected": -14.155176162719727, + "step": 763 + }, + { + "epoch": 4.5274074074074075, + "grad_norm": 0.5523399287602111, + "learning_rate": 2.340949053657843e-07, + "logits/chosen": -1.3917421102523804, + "logits/rejected": -1.4328992366790771, + "logps/chosen": -53.25602722167969, + "logps/rejected": -73.91915893554688, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.373779296875, + "rewards/margins": 10.894166946411133, + "rewards/rejected": -13.267946243286133, + "step": 764 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.2775376512794254, + "learning_rate": 2.3344664955157685e-07, + "logits/chosen": -1.2376761436462402, + "logits/rejected": -1.156088948249817, + "logps/chosen": -31.447860717773438, + "logps/rejected": -66.50593566894531, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0162906646728516, + "rewards/margins": 10.467144012451172, + "rewards/rejected": -12.483434677124023, + "step": 765 + }, + { + "epoch": 4.539259259259259, + "grad_norm": 0.37613779762460614, + "learning_rate": 2.3279850550931458e-07, + "logits/chosen": -1.581992745399475, + "logits/rejected": -1.3465206623077393, + "logps/chosen": -45.77264404296875, + "logps/rejected": -78.6106185913086, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8553242683410645, + "rewards/margins": 10.771078109741211, + "rewards/rejected": -13.626401901245117, + "step": 766 + }, + { + "epoch": 4.545185185185185, + "grad_norm": 0.26554629446762146, + "learning_rate": 2.3215047761541172e-07, + "logits/chosen": -1.3734705448150635, + "logits/rejected": -1.1478571891784668, + "logps/chosen": -31.20534896850586, + "logps/rejected": -67.67927551269531, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9986950159072876, + "rewards/margins": 10.351818084716797, + "rewards/rejected": -11.350513458251953, + "step": 767 + }, + { + "epoch": 4.551111111111111, + "grad_norm": 0.4328268508688234, + "learning_rate": 2.3150257024549845e-07, + "logits/chosen": -1.4306426048278809, + "logits/rejected": -1.2422418594360352, + "logps/chosen": -35.660118103027344, + "logps/rejected": -66.81575775146484, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6081128120422363, + "rewards/margins": 11.39274787902832, + "rewards/rejected": -14.000861167907715, + "step": 768 + }, + { + "epoch": 4.557037037037037, + "grad_norm": 0.5336695898129881, + "learning_rate": 2.3085478777439096e-07, + "logits/chosen": -1.548780083656311, + "logits/rejected": -1.5772260427474976, + "logps/chosen": -42.280174255371094, + "logps/rejected": -63.68194580078125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8020665645599365, + "rewards/margins": 9.052971839904785, + "rewards/rejected": -11.855037689208984, + "step": 769 + }, + { + "epoch": 4.562962962962963, + "grad_norm": 0.22474609924604425, + "learning_rate": 2.302071345760622e-07, + "logits/chosen": -1.84547758102417, + "logits/rejected": -1.9938280582427979, + "logps/chosen": -60.60712432861328, + "logps/rejected": -72.54964447021484, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1693592071533203, + "rewards/margins": 10.86662483215332, + "rewards/rejected": -14.03598403930664, + "step": 770 + }, + { + "epoch": 4.568888888888889, + "grad_norm": 1.0805753910268383, + "learning_rate": 2.2955961502361232e-07, + "logits/chosen": -1.837569236755371, + "logits/rejected": -1.8610866069793701, + "logps/chosen": -45.81024932861328, + "logps/rejected": -63.923728942871094, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9928364753723145, + "rewards/margins": 9.243518829345703, + "rewards/rejected": -11.23635482788086, + "step": 771 + }, + { + "epoch": 4.574814814814815, + "grad_norm": 0.2815304479426961, + "learning_rate": 2.2891223348923882e-07, + "logits/chosen": -1.499778389930725, + "logits/rejected": -1.4236795902252197, + "logps/chosen": -54.17256164550781, + "logps/rejected": -88.95265197753906, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.209611654281616, + "rewards/margins": 12.507816314697266, + "rewards/rejected": -15.717428207397461, + "step": 772 + }, + { + "epoch": 4.58074074074074, + "grad_norm": 0.28313198718639915, + "learning_rate": 2.2826499434420745e-07, + "logits/chosen": -1.4156975746154785, + "logits/rejected": -1.4123296737670898, + "logps/chosen": -41.201629638671875, + "logps/rejected": -69.11726379394531, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8570693731307983, + "rewards/margins": 10.749539375305176, + "rewards/rejected": -12.606608390808105, + "step": 773 + }, + { + "epoch": 4.586666666666667, + "grad_norm": 0.3125143240803341, + "learning_rate": 2.2761790195882261e-07, + "logits/chosen": -1.5952832698822021, + "logits/rejected": -1.4846335649490356, + "logps/chosen": -37.14253234863281, + "logps/rejected": -74.46310424804688, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.426466703414917, + "rewards/margins": 11.401270866394043, + "rewards/rejected": -12.827737808227539, + "step": 774 + }, + { + "epoch": 4.592592592592593, + "grad_norm": 0.6926539611379354, + "learning_rate": 2.2697096070239748e-07, + "logits/chosen": -1.636855125427246, + "logits/rejected": -1.7660517692565918, + "logps/chosen": -64.5381851196289, + "logps/rejected": -70.02143859863281, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0808470249176025, + "rewards/margins": 8.71413516998291, + "rewards/rejected": -11.794981956481934, + "step": 775 + }, + { + "epoch": 4.598518518518518, + "grad_norm": 0.41730602281644963, + "learning_rate": 2.2632417494322503e-07, + "logits/chosen": -1.5239399671554565, + "logits/rejected": -1.5000847578048706, + "logps/chosen": -50.480892181396484, + "logps/rejected": -75.76677703857422, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8016414642333984, + "rewards/margins": 11.542520523071289, + "rewards/rejected": -14.344161987304688, + "step": 776 + }, + { + "epoch": 4.604444444444445, + "grad_norm": 0.6117954122512027, + "learning_rate": 2.2567754904854809e-07, + "logits/chosen": -1.8561725616455078, + "logits/rejected": -1.8074907064437866, + "logps/chosen": -47.85967254638672, + "logps/rejected": -78.99420928955078, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.509251356124878, + "rewards/margins": 11.366935729980469, + "rewards/rejected": -13.876188278198242, + "step": 777 + }, + { + "epoch": 4.6103703703703705, + "grad_norm": 0.1755774879417684, + "learning_rate": 2.2503108738453014e-07, + "logits/chosen": -1.6200981140136719, + "logits/rejected": -1.6129133701324463, + "logps/chosen": -41.06743240356445, + "logps/rejected": -74.25436401367188, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5940749645233154, + "rewards/margins": 10.168746948242188, + "rewards/rejected": -12.762822151184082, + "step": 778 + }, + { + "epoch": 4.616296296296296, + "grad_norm": 0.349414448813446, + "learning_rate": 2.243847943162257e-07, + "logits/chosen": -1.9405025243759155, + "logits/rejected": -1.9417346715927124, + "logps/chosen": -52.690181732177734, + "logps/rejected": -65.54427337646484, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6885979175567627, + "rewards/margins": 9.365793228149414, + "rewards/rejected": -12.054390907287598, + "step": 779 + }, + { + "epoch": 4.622222222222222, + "grad_norm": 0.20242192540609477, + "learning_rate": 2.23738674207551e-07, + "logits/chosen": -1.613708734512329, + "logits/rejected": -1.436056137084961, + "logps/chosen": -35.43689727783203, + "logps/rejected": -83.06486511230469, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6374375820159912, + "rewards/margins": 13.647027015686035, + "rewards/rejected": -14.284463882446289, + "step": 780 + }, + { + "epoch": 4.628148148148148, + "grad_norm": 0.5765752927321536, + "learning_rate": 2.230927314212543e-07, + "logits/chosen": -1.494924783706665, + "logits/rejected": -1.4253530502319336, + "logps/chosen": -45.896095275878906, + "logps/rejected": -68.88455200195312, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2405786514282227, + "rewards/margins": 9.971403121948242, + "rewards/rejected": -13.211981773376465, + "step": 781 + }, + { + "epoch": 4.634074074074074, + "grad_norm": 0.281505079478091, + "learning_rate": 2.2244697031888655e-07, + "logits/chosen": -1.8622199296951294, + "logits/rejected": -1.7759878635406494, + "logps/chosen": -47.928443908691406, + "logps/rejected": -77.3371353149414, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.063960075378418, + "rewards/margins": 9.88760757446289, + "rewards/rejected": -12.951568603515625, + "step": 782 + }, + { + "epoch": 4.64, + "grad_norm": 0.8954387493569548, + "learning_rate": 2.21801395260772e-07, + "logits/chosen": -1.552648901939392, + "logits/rejected": -1.3441781997680664, + "logps/chosen": -41.77521514892578, + "logps/rejected": -80.31526184082031, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.018218517303467, + "rewards/margins": 13.303080558776855, + "rewards/rejected": -16.321298599243164, + "step": 783 + }, + { + "epoch": 4.645925925925926, + "grad_norm": 0.3599276511446899, + "learning_rate": 2.2115601060597852e-07, + "logits/chosen": -2.0601389408111572, + "logits/rejected": -2.110264301300049, + "logps/chosen": -49.506591796875, + "logps/rejected": -65.16485595703125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.511070489883423, + "rewards/margins": 9.494369506835938, + "rewards/rejected": -12.005439758300781, + "step": 784 + }, + { + "epoch": 4.651851851851852, + "grad_norm": 0.2771432486865571, + "learning_rate": 2.2051082071228852e-07, + "logits/chosen": -2.0726568698883057, + "logits/rejected": -1.9759494066238403, + "logps/chosen": -42.74811553955078, + "logps/rejected": -66.74032592773438, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.455653190612793, + "rewards/margins": 9.912121772766113, + "rewards/rejected": -12.367774963378906, + "step": 785 + }, + { + "epoch": 4.657777777777778, + "grad_norm": 0.33053314179972837, + "learning_rate": 2.1986582993616925e-07, + "logits/chosen": -1.732776165008545, + "logits/rejected": -1.5467349290847778, + "logps/chosen": -39.45323181152344, + "logps/rejected": -81.57275390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0747121572494507, + "rewards/margins": 13.912720680236816, + "rewards/rejected": -14.987432479858398, + "step": 786 + }, + { + "epoch": 4.663703703703703, + "grad_norm": 0.6345719417992604, + "learning_rate": 2.192210426327435e-07, + "logits/chosen": -1.784833312034607, + "logits/rejected": -1.6461296081542969, + "logps/chosen": -43.437374114990234, + "logps/rejected": -69.29059600830078, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3814399242401123, + "rewards/margins": 10.256202697753906, + "rewards/rejected": -11.637643814086914, + "step": 787 + }, + { + "epoch": 4.66962962962963, + "grad_norm": 0.4536643965652212, + "learning_rate": 2.185764631557602e-07, + "logits/chosen": -1.6088899374008179, + "logits/rejected": -1.374220371246338, + "logps/chosen": -42.108734130859375, + "logps/rejected": -78.39088439941406, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.488034725189209, + "rewards/margins": 9.032186508178711, + "rewards/rejected": -11.520221710205078, + "step": 788 + }, + { + "epoch": 4.6755555555555555, + "grad_norm": 0.24775139302990495, + "learning_rate": 2.1793209585756482e-07, + "logits/chosen": -1.583560585975647, + "logits/rejected": -1.6787372827529907, + "logps/chosen": -75.60416412353516, + "logps/rejected": -101.32034301757812, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.8459086418151855, + "rewards/margins": 14.32331371307373, + "rewards/rejected": -19.16922378540039, + "step": 789 + }, + { + "epoch": 4.681481481481481, + "grad_norm": 0.352054728762041, + "learning_rate": 2.1728794508907038e-07, + "logits/chosen": -2.015982151031494, + "logits/rejected": -1.5659945011138916, + "logps/chosen": -44.24329376220703, + "logps/rejected": -109.96054077148438, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7237515449523926, + "rewards/margins": 13.699109077453613, + "rewards/rejected": -16.42285919189453, + "step": 790 + }, + { + "epoch": 4.687407407407408, + "grad_norm": 0.29454959273525827, + "learning_rate": 2.1664401519972774e-07, + "logits/chosen": -1.7378650903701782, + "logits/rejected": -1.8261386156082153, + "logps/chosen": -55.897666931152344, + "logps/rejected": -81.62759399414062, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7774252891540527, + "rewards/margins": 12.035506248474121, + "rewards/rejected": -15.812932014465332, + "step": 791 + }, + { + "epoch": 4.693333333333333, + "grad_norm": 0.2714000933964117, + "learning_rate": 2.1600031053749634e-07, + "logits/chosen": -1.463356614112854, + "logits/rejected": -1.3946796655654907, + "logps/chosen": -50.632022857666016, + "logps/rejected": -71.64886474609375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7824299335479736, + "rewards/margins": 8.319438934326172, + "rewards/rejected": -11.101869583129883, + "step": 792 + }, + { + "epoch": 4.699259259259259, + "grad_norm": 0.3956245737008001, + "learning_rate": 2.1535683544881478e-07, + "logits/chosen": -1.6728391647338867, + "logits/rejected": -1.6265099048614502, + "logps/chosen": -38.276920318603516, + "logps/rejected": -65.04293823242188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1510419845581055, + "rewards/margins": 10.412332534790039, + "rewards/rejected": -12.563374519348145, + "step": 793 + }, + { + "epoch": 4.705185185185185, + "grad_norm": 0.34770270483367516, + "learning_rate": 2.147135942785716e-07, + "logits/chosen": -1.6922452449798584, + "logits/rejected": -1.6048784255981445, + "logps/chosen": -38.79063415527344, + "logps/rejected": -68.69209289550781, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9936587810516357, + "rewards/margins": 9.991509437561035, + "rewards/rejected": -11.98516845703125, + "step": 794 + }, + { + "epoch": 4.711111111111111, + "grad_norm": 0.6662575958266204, + "learning_rate": 2.1407059137007583e-07, + "logits/chosen": -1.9861844778060913, + "logits/rejected": -2.161007881164551, + "logps/chosen": -49.49615478515625, + "logps/rejected": -65.79344177246094, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8835437297821045, + "rewards/margins": 10.038654327392578, + "rewards/rejected": -11.922198295593262, + "step": 795 + }, + { + "epoch": 4.717037037037037, + "grad_norm": 0.34706493537306654, + "learning_rate": 2.1342783106502777e-07, + "logits/chosen": -1.9054166078567505, + "logits/rejected": -1.8093022108078003, + "logps/chosen": -42.534156799316406, + "logps/rejected": -81.73444366455078, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.304574966430664, + "rewards/margins": 11.91904354095459, + "rewards/rejected": -13.22361946105957, + "step": 796 + }, + { + "epoch": 4.722962962962963, + "grad_norm": 0.28784218622198576, + "learning_rate": 2.1278531770348963e-07, + "logits/chosen": -1.6888959407806396, + "logits/rejected": -1.77483069896698, + "logps/chosen": -51.807090759277344, + "logps/rejected": -78.14078521728516, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.369813919067383, + "rewards/margins": 11.10791301727295, + "rewards/rejected": -13.477725982666016, + "step": 797 + }, + { + "epoch": 4.728888888888889, + "grad_norm": 0.38336438976817927, + "learning_rate": 2.121430556238559e-07, + "logits/chosen": -1.9089480638504028, + "logits/rejected": -1.8946913480758667, + "logps/chosen": -38.47759246826172, + "logps/rejected": -68.7850570678711, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1309518814086914, + "rewards/margins": 12.784750938415527, + "rewards/rejected": -14.915702819824219, + "step": 798 + }, + { + "epoch": 4.734814814814815, + "grad_norm": 0.2868183618195756, + "learning_rate": 2.115010491628247e-07, + "logits/chosen": -1.696946382522583, + "logits/rejected": -1.6048494577407837, + "logps/chosen": -34.717411041259766, + "logps/rejected": -65.3773422241211, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.898589849472046, + "rewards/margins": 9.88906478881836, + "rewards/rejected": -11.787654876708984, + "step": 799 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 0.22472378862968692, + "learning_rate": 2.1085930265536808e-07, + "logits/chosen": -2.0204107761383057, + "logits/rejected": -1.864630937576294, + "logps/chosen": -34.87665939331055, + "logps/rejected": -66.435791015625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.800518274307251, + "rewards/margins": 9.670722961425781, + "rewards/rejected": -12.471240043640137, + "step": 800 + }, + { + "epoch": 4.746666666666667, + "grad_norm": 0.6742747101060639, + "learning_rate": 2.1021782043470278e-07, + "logits/chosen": -1.3186049461364746, + "logits/rejected": -1.3445327281951904, + "logps/chosen": -54.38391876220703, + "logps/rejected": -77.40103149414062, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.349400281906128, + "rewards/margins": 11.073860168457031, + "rewards/rejected": -14.423260688781738, + "step": 801 + }, + { + "epoch": 4.752592592592593, + "grad_norm": 0.3771344334707704, + "learning_rate": 2.0957660683226103e-07, + "logits/chosen": -1.71254301071167, + "logits/rejected": -1.5470032691955566, + "logps/chosen": -37.44327163696289, + "logps/rejected": -69.07460021972656, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4861843585968018, + "rewards/margins": 11.189270973205566, + "rewards/rejected": -12.675455093383789, + "step": 802 + }, + { + "epoch": 4.758518518518518, + "grad_norm": 0.6275388078504144, + "learning_rate": 2.0893566617766126e-07, + "logits/chosen": -1.789711833000183, + "logits/rejected": -1.892749309539795, + "logps/chosen": -49.782630920410156, + "logps/rejected": -66.36517333984375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7289974689483643, + "rewards/margins": 9.127378463745117, + "rewards/rejected": -10.856375694274902, + "step": 803 + }, + { + "epoch": 4.764444444444445, + "grad_norm": 0.47449108482322694, + "learning_rate": 2.0829500279867891e-07, + "logits/chosen": -1.871337652206421, + "logits/rejected": -1.7207472324371338, + "logps/chosen": -28.964725494384766, + "logps/rejected": -69.4187240600586, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8920726776123047, + "rewards/margins": 11.862289428710938, + "rewards/rejected": -12.754362106323242, + "step": 804 + }, + { + "epoch": 4.770370370370371, + "grad_norm": 0.3037246861891076, + "learning_rate": 2.0765462102121719e-07, + "logits/chosen": -1.5920605659484863, + "logits/rejected": -1.5429118871688843, + "logps/chosen": -36.87894058227539, + "logps/rejected": -65.16922760009766, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5912890434265137, + "rewards/margins": 10.682775497436523, + "rewards/rejected": -13.274065017700195, + "step": 805 + }, + { + "epoch": 4.776296296296296, + "grad_norm": 0.3152244031437843, + "learning_rate": 2.0701452516927797e-07, + "logits/chosen": -1.61420738697052, + "logits/rejected": -1.4144175052642822, + "logps/chosen": -48.10711669921875, + "logps/rejected": -81.6900634765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1305041313171387, + "rewards/margins": 11.635595321655273, + "rewards/rejected": -14.766098022460938, + "step": 806 + }, + { + "epoch": 4.782222222222222, + "grad_norm": 0.6160323095063934, + "learning_rate": 2.0637471956493234e-07, + "logits/chosen": -1.8907852172851562, + "logits/rejected": -1.6985530853271484, + "logps/chosen": -38.62782287597656, + "logps/rejected": -78.48397064208984, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.042975425720215, + "rewards/margins": 11.772692680358887, + "rewards/rejected": -14.815667152404785, + "step": 807 + }, + { + "epoch": 4.7881481481481485, + "grad_norm": 0.477923686133842, + "learning_rate": 2.0573520852829164e-07, + "logits/chosen": -2.0220890045166016, + "logits/rejected": -2.0097062587738037, + "logps/chosen": -33.547210693359375, + "logps/rejected": -61.80625915527344, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2768610715866089, + "rewards/margins": 11.213603973388672, + "rewards/rejected": -12.49046516418457, + "step": 808 + }, + { + "epoch": 4.794074074074074, + "grad_norm": 0.27178947050649066, + "learning_rate": 2.0509599637747818e-07, + "logits/chosen": -1.4557392597198486, + "logits/rejected": -1.5568785667419434, + "logps/chosen": -43.41080856323242, + "logps/rejected": -74.368408203125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.034477710723877, + "rewards/margins": 12.510175704956055, + "rewards/rejected": -16.544654846191406, + "step": 809 + }, + { + "epoch": 4.8, + "grad_norm": 0.3937548138391649, + "learning_rate": 2.0445708742859625e-07, + "logits/chosen": -1.3229466676712036, + "logits/rejected": -1.1964821815490723, + "logps/chosen": -51.84530258178711, + "logps/rejected": -77.97311401367188, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1834235191345215, + "rewards/margins": 10.095224380493164, + "rewards/rejected": -13.278647422790527, + "step": 810 + }, + { + "epoch": 4.805925925925926, + "grad_norm": 0.3519512729455183, + "learning_rate": 2.0381848599570273e-07, + "logits/chosen": -1.6972732543945312, + "logits/rejected": -1.6273045539855957, + "logps/chosen": -32.62199020385742, + "logps/rejected": -56.509918212890625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6276986002922058, + "rewards/margins": 9.845847129821777, + "rewards/rejected": -10.473546028137207, + "step": 811 + }, + { + "epoch": 4.811851851851852, + "grad_norm": 0.21318796805172743, + "learning_rate": 2.0318019639077803e-07, + "logits/chosen": -1.5077314376831055, + "logits/rejected": -1.302263855934143, + "logps/chosen": -40.9090461730957, + "logps/rejected": -80.91336059570312, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8164610862731934, + "rewards/margins": 11.253703117370605, + "rewards/rejected": -14.07016372680664, + "step": 812 + }, + { + "epoch": 4.817777777777778, + "grad_norm": 0.3855192253106906, + "learning_rate": 2.0254222292369724e-07, + "logits/chosen": -1.6157485246658325, + "logits/rejected": -1.407730221748352, + "logps/chosen": -38.98833465576172, + "logps/rejected": -75.60125732421875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4220004081726074, + "rewards/margins": 9.778350830078125, + "rewards/rejected": -13.20035171508789, + "step": 813 + }, + { + "epoch": 4.823703703703703, + "grad_norm": 0.7157114320343476, + "learning_rate": 2.0190456990220055e-07, + "logits/chosen": -1.4571491479873657, + "logits/rejected": -1.4702848196029663, + "logps/chosen": -42.851844787597656, + "logps/rejected": -77.12294006347656, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4410855770111084, + "rewards/margins": 10.907438278198242, + "rewards/rejected": -13.34852409362793, + "step": 814 + }, + { + "epoch": 4.82962962962963, + "grad_norm": 0.5514766898574819, + "learning_rate": 2.0126724163186474e-07, + "logits/chosen": -1.4762797355651855, + "logits/rejected": -1.388708233833313, + "logps/chosen": -47.19092559814453, + "logps/rejected": -65.01657104492188, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.214632034301758, + "rewards/margins": 9.465707778930664, + "rewards/rejected": -12.680340766906738, + "step": 815 + }, + { + "epoch": 4.835555555555556, + "grad_norm": 0.3002635941047584, + "learning_rate": 2.006302424160735e-07, + "logits/chosen": -1.601129174232483, + "logits/rejected": -1.5539610385894775, + "logps/chosen": -38.64893341064453, + "logps/rejected": -69.77659606933594, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.940110445022583, + "rewards/margins": 12.507593154907227, + "rewards/rejected": -14.44770336151123, + "step": 816 + }, + { + "epoch": 4.841481481481481, + "grad_norm": 0.4205605025312493, + "learning_rate": 1.9999357655598891e-07, + "logits/chosen": -1.8134194612503052, + "logits/rejected": -1.7884035110473633, + "logps/chosen": -53.481422424316406, + "logps/rejected": -75.1780776977539, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.23048996925354, + "rewards/margins": 9.090320587158203, + "rewards/rejected": -12.320810317993164, + "step": 817 + }, + { + "epoch": 4.847407407407408, + "grad_norm": 0.2761329051467174, + "learning_rate": 1.9935724835052196e-07, + "logits/chosen": -1.1999740600585938, + "logits/rejected": -1.2345824241638184, + "logps/chosen": -57.651649475097656, + "logps/rejected": -86.1004867553711, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.300785541534424, + "rewards/margins": 11.698287010192871, + "rewards/rejected": -15.99907112121582, + "step": 818 + }, + { + "epoch": 4.8533333333333335, + "grad_norm": 0.6343615960273807, + "learning_rate": 1.987212620963038e-07, + "logits/chosen": -1.923182725906372, + "logits/rejected": -1.9407228231430054, + "logps/chosen": -51.311744689941406, + "logps/rejected": -82.07662200927734, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.584122657775879, + "rewards/margins": 10.869361877441406, + "rewards/rejected": -13.453484535217285, + "step": 819 + }, + { + "epoch": 4.859259259259259, + "grad_norm": 0.2782877911411568, + "learning_rate": 1.9808562208765663e-07, + "logits/chosen": -1.5519965887069702, + "logits/rejected": -1.4769684076309204, + "logps/chosen": -36.23551559448242, + "logps/rejected": -74.62303924560547, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2322258949279785, + "rewards/margins": 11.663935661315918, + "rewards/rejected": -12.896162033081055, + "step": 820 + }, + { + "epoch": 4.865185185185185, + "grad_norm": 0.21964861465933386, + "learning_rate": 1.9745033261656486e-07, + "logits/chosen": -1.6528503894805908, + "logits/rejected": -1.5674102306365967, + "logps/chosen": -47.28667068481445, + "logps/rejected": -75.80752563476562, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3995161056518555, + "rewards/margins": 9.84759521484375, + "rewards/rejected": -12.247111320495605, + "step": 821 + }, + { + "epoch": 4.871111111111111, + "grad_norm": 0.15734208549409717, + "learning_rate": 1.9681539797264578e-07, + "logits/chosen": -1.5805985927581787, + "logits/rejected": -1.45188307762146, + "logps/chosen": -57.12106704711914, + "logps/rejected": -85.31029510498047, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.592021942138672, + "rewards/margins": 10.575047492980957, + "rewards/rejected": -15.167069435119629, + "step": 822 + }, + { + "epoch": 4.877037037037037, + "grad_norm": 0.6675396598727341, + "learning_rate": 1.96180822443121e-07, + "logits/chosen": -1.1367294788360596, + "logits/rejected": -1.2640800476074219, + "logps/chosen": -44.80545425415039, + "logps/rejected": -70.67766571044922, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2347452640533447, + "rewards/margins": 11.651213645935059, + "rewards/rejected": -12.885958671569824, + "step": 823 + }, + { + "epoch": 4.882962962962963, + "grad_norm": 0.27873470720660054, + "learning_rate": 1.955466103127871e-07, + "logits/chosen": -1.770205020904541, + "logits/rejected": -1.7015005350112915, + "logps/chosen": -34.791748046875, + "logps/rejected": -72.49945068359375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5266883373260498, + "rewards/margins": 12.88920783996582, + "rewards/rejected": -14.415897369384766, + "step": 824 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.3097771173621495, + "learning_rate": 1.9491276586398715e-07, + "logits/chosen": -1.861426591873169, + "logits/rejected": -1.5631628036499023, + "logps/chosen": -33.725547790527344, + "logps/rejected": -79.86085510253906, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.760953903198242, + "rewards/margins": 14.066446304321289, + "rewards/rejected": -16.82740020751953, + "step": 825 + }, + { + "epoch": 4.894814814814815, + "grad_norm": 1.2522381877844386, + "learning_rate": 1.9427929337658126e-07, + "logits/chosen": -1.6294392347335815, + "logits/rejected": -1.5337265729904175, + "logps/chosen": -37.48204040527344, + "logps/rejected": -69.2812271118164, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.072634220123291, + "rewards/margins": 12.420723915100098, + "rewards/rejected": -14.493358612060547, + "step": 826 + }, + { + "epoch": 4.900740740740741, + "grad_norm": 0.20546866017138665, + "learning_rate": 1.9364619712791819e-07, + "logits/chosen": -1.5217711925506592, + "logits/rejected": -1.5121992826461792, + "logps/chosen": -44.170570373535156, + "logps/rejected": -73.65629577636719, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5526375770568848, + "rewards/margins": 11.5411376953125, + "rewards/rejected": -14.093774795532227, + "step": 827 + }, + { + "epoch": 4.906666666666666, + "grad_norm": 0.23822531337396938, + "learning_rate": 1.9301348139280627e-07, + "logits/chosen": -1.5563844442367554, + "logits/rejected": -1.7817423343658447, + "logps/chosen": -56.81804656982422, + "logps/rejected": -68.70649719238281, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0236644744873047, + "rewards/margins": 8.670280456542969, + "rewards/rejected": -11.69394588470459, + "step": 828 + }, + { + "epoch": 4.912592592592593, + "grad_norm": 0.21821591547354985, + "learning_rate": 1.9238115044348434e-07, + "logits/chosen": -1.8185888528823853, + "logits/rejected": -1.876836895942688, + "logps/chosen": -58.44865417480469, + "logps/rejected": -81.23777770996094, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8119595050811768, + "rewards/margins": 10.934724807739258, + "rewards/rejected": -13.746683120727539, + "step": 829 + }, + { + "epoch": 4.9185185185185185, + "grad_norm": 0.2266270583534369, + "learning_rate": 1.9174920854959322e-07, + "logits/chosen": -1.6655666828155518, + "logits/rejected": -1.4781509637832642, + "logps/chosen": -26.67877197265625, + "logps/rejected": -62.411434173583984, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.696338176727295, + "rewards/margins": 12.304670333862305, + "rewards/rejected": -14.001008033752441, + "step": 830 + }, + { + "epoch": 4.924444444444444, + "grad_norm": 0.2918176447105253, + "learning_rate": 1.9111765997814678e-07, + "logits/chosen": -1.8377158641815186, + "logits/rejected": -1.787956953048706, + "logps/chosen": -38.94948196411133, + "logps/rejected": -67.58212280273438, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.692730188369751, + "rewards/margins": 10.632195472717285, + "rewards/rejected": -12.324925422668457, + "step": 831 + }, + { + "epoch": 4.930370370370371, + "grad_norm": 0.26842566124743916, + "learning_rate": 1.904865089935029e-07, + "logits/chosen": -1.4635124206542969, + "logits/rejected": -1.3722989559173584, + "logps/chosen": -38.668697357177734, + "logps/rejected": -71.26251983642578, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.14703106880188, + "rewards/margins": 10.690751075744629, + "rewards/rejected": -12.837782859802246, + "step": 832 + }, + { + "epoch": 4.936296296296296, + "grad_norm": 0.36609333996244137, + "learning_rate": 1.8985575985733507e-07, + "logits/chosen": -1.3907794952392578, + "logits/rejected": -1.4159953594207764, + "logps/chosen": -46.419254302978516, + "logps/rejected": -76.58683776855469, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.326676607131958, + "rewards/margins": 9.860599517822266, + "rewards/rejected": -13.187276840209961, + "step": 833 + }, + { + "epoch": 4.942222222222222, + "grad_norm": 0.4311886487916821, + "learning_rate": 1.8922541682860326e-07, + "logits/chosen": -1.259995460510254, + "logits/rejected": -1.2965853214263916, + "logps/chosen": -30.577587127685547, + "logps/rejected": -63.5474739074707, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6297186613082886, + "rewards/margins": 12.237893104553223, + "rewards/rejected": -12.867610931396484, + "step": 834 + }, + { + "epoch": 4.948148148148148, + "grad_norm": 0.7173217469645337, + "learning_rate": 1.8859548416352536e-07, + "logits/chosen": -1.2326655387878418, + "logits/rejected": -1.1718957424163818, + "logps/chosen": -38.014225006103516, + "logps/rejected": -71.79017639160156, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3967584371566772, + "rewards/margins": 11.66148567199707, + "rewards/rejected": -13.058243751525879, + "step": 835 + }, + { + "epoch": 4.954074074074074, + "grad_norm": 0.25516273124859323, + "learning_rate": 1.8796596611554838e-07, + "logits/chosen": -1.6921401023864746, + "logits/rejected": -1.852120041847229, + "logps/chosen": -43.73085021972656, + "logps/rejected": -64.00594329833984, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6823835372924805, + "rewards/margins": 9.839491844177246, + "rewards/rejected": -11.521875381469727, + "step": 836 + }, + { + "epoch": 4.96, + "grad_norm": 0.538226909564372, + "learning_rate": 1.8733686693531982e-07, + "logits/chosen": -1.8975204229354858, + "logits/rejected": -1.7692875862121582, + "logps/chosen": -40.96481704711914, + "logps/rejected": -88.20527648925781, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7765021324157715, + "rewards/margins": 12.758014678955078, + "rewards/rejected": -15.534517288208008, + "step": 837 + }, + { + "epoch": 4.965925925925926, + "grad_norm": 0.43354396745461216, + "learning_rate": 1.8670819087065882e-07, + "logits/chosen": -1.4917025566101074, + "logits/rejected": -1.4172505140304565, + "logps/chosen": -45.58994674682617, + "logps/rejected": -68.04974365234375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9073712825775146, + "rewards/margins": 9.858301162719727, + "rewards/rejected": -12.765671730041504, + "step": 838 + }, + { + "epoch": 4.971851851851852, + "grad_norm": 0.3086137098014021, + "learning_rate": 1.8607994216652756e-07, + "logits/chosen": -1.8002091646194458, + "logits/rejected": -1.7989627122879028, + "logps/chosen": -37.95539855957031, + "logps/rejected": -91.1846694946289, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.227663516998291, + "rewards/margins": 15.667410850524902, + "rewards/rejected": -18.89507484436035, + "step": 839 + }, + { + "epoch": 4.977777777777778, + "grad_norm": 0.32236357616937117, + "learning_rate": 1.8545212506500257e-07, + "logits/chosen": -1.8015594482421875, + "logits/rejected": -1.7732892036437988, + "logps/chosen": -45.942161560058594, + "logps/rejected": -69.69486999511719, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7260780334472656, + "rewards/margins": 11.671018600463867, + "rewards/rejected": -14.39709758758545, + "step": 840 + }, + { + "epoch": 4.9837037037037035, + "grad_norm": 2.1936326995084725, + "learning_rate": 1.848247438052461e-07, + "logits/chosen": -2.0741078853607178, + "logits/rejected": -2.024766683578491, + "logps/chosen": -61.82673263549805, + "logps/rejected": -90.516845703125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.203417778015137, + "rewards/margins": 11.07644271850586, + "rewards/rejected": -16.279861450195312, + "step": 841 + }, + { + "epoch": 4.989629629629629, + "grad_norm": 0.5696743631038291, + "learning_rate": 1.8419780262347754e-07, + "logits/chosen": -1.3205444812774658, + "logits/rejected": -1.1502920389175415, + "logps/chosen": -46.6002082824707, + "logps/rejected": -77.9881362915039, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.359750509262085, + "rewards/margins": 12.488546371459961, + "rewards/rejected": -15.848296165466309, + "step": 842 + }, + { + "epoch": 4.995555555555556, + "grad_norm": 0.1911816493782355, + "learning_rate": 1.835713057529447e-07, + "logits/chosen": -2.0486555099487305, + "logits/rejected": -1.8096239566802979, + "logps/chosen": -41.883853912353516, + "logps/rejected": -102.98055267333984, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3345673084259033, + "rewards/margins": 14.44814682006836, + "rewards/rejected": -16.78271484375, + "step": 843 + }, + { + "epoch": 5.001481481481481, + "grad_norm": 0.3585733698368132, + "learning_rate": 1.8294525742389545e-07, + "logits/chosen": -1.695112705230713, + "logits/rejected": -1.7007943391799927, + "logps/chosen": -47.733699798583984, + "logps/rejected": -62.256568908691406, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1449780464172363, + "rewards/margins": 8.733757019042969, + "rewards/rejected": -10.878735542297363, + "step": 844 + }, + { + "epoch": 5.007407407407407, + "grad_norm": 0.148574929218555, + "learning_rate": 1.8231966186354881e-07, + "logits/chosen": -1.6611672639846802, + "logits/rejected": -1.492279052734375, + "logps/chosen": -44.596466064453125, + "logps/rejected": -72.3811264038086, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.640878915786743, + "rewards/margins": 10.067465782165527, + "rewards/rejected": -12.708344459533691, + "step": 845 + }, + { + "epoch": 5.013333333333334, + "grad_norm": 0.24940558308584396, + "learning_rate": 1.8169452329606666e-07, + "logits/chosen": -1.5457825660705566, + "logits/rejected": -1.3591402769088745, + "logps/chosen": -44.07053756713867, + "logps/rejected": -83.65834045410156, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.497549057006836, + "rewards/margins": 11.002856254577637, + "rewards/rejected": -14.500405311584473, + "step": 846 + }, + { + "epoch": 5.019259259259259, + "grad_norm": 0.13228197472605105, + "learning_rate": 1.810698459425254e-07, + "logits/chosen": -1.4919474124908447, + "logits/rejected": -1.4146510362625122, + "logps/chosen": -43.999305725097656, + "logps/rejected": -67.03208923339844, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.773082971572876, + "rewards/margins": 10.872330665588379, + "rewards/rejected": -13.645413398742676, + "step": 847 + }, + { + "epoch": 5.025185185185185, + "grad_norm": 0.17189656018878605, + "learning_rate": 1.8044563402088682e-07, + "logits/chosen": -1.6326994895935059, + "logits/rejected": -1.5319743156433105, + "logps/chosen": -40.37211227416992, + "logps/rejected": -79.12166595458984, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4987974166870117, + "rewards/margins": 12.085766792297363, + "rewards/rejected": -14.584563255310059, + "step": 848 + }, + { + "epoch": 5.0311111111111115, + "grad_norm": 0.16328383350438383, + "learning_rate": 1.7982189174597033e-07, + "logits/chosen": -1.8950517177581787, + "logits/rejected": -1.8567767143249512, + "logps/chosen": -56.13173294067383, + "logps/rejected": -81.51428985595703, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.818589687347412, + "rewards/margins": 13.249429702758789, + "rewards/rejected": -17.06801986694336, + "step": 849 + }, + { + "epoch": 5.037037037037037, + "grad_norm": 0.20542554218578457, + "learning_rate": 1.7919862332942398e-07, + "logits/chosen": -1.784167766571045, + "logits/rejected": -1.7963954210281372, + "logps/chosen": -44.83396530151367, + "logps/rejected": -67.42826843261719, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5964694023132324, + "rewards/margins": 11.381461143493652, + "rewards/rejected": -12.977930068969727, + "step": 850 + }, + { + "epoch": 5.042962962962963, + "grad_norm": 0.28807920712060275, + "learning_rate": 1.785758329796963e-07, + "logits/chosen": -1.703472375869751, + "logits/rejected": -1.6009771823883057, + "logps/chosen": -35.299949645996094, + "logps/rejected": -73.72513580322266, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.533238172531128, + "rewards/margins": 11.330462455749512, + "rewards/rejected": -13.863700866699219, + "step": 851 + }, + { + "epoch": 5.0488888888888885, + "grad_norm": 0.2189616708519809, + "learning_rate": 1.779535249020078e-07, + "logits/chosen": -1.5616728067398071, + "logits/rejected": -1.5174376964569092, + "logps/chosen": -38.645652770996094, + "logps/rejected": -67.2764663696289, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4764845371246338, + "rewards/margins": 11.848831176757812, + "rewards/rejected": -13.325315475463867, + "step": 852 + }, + { + "epoch": 5.054814814814815, + "grad_norm": 0.20874214506595973, + "learning_rate": 1.7733170329832262e-07, + "logits/chosen": -1.6223198175430298, + "logits/rejected": -1.7681981325149536, + "logps/chosen": -32.13164520263672, + "logps/rejected": -61.60198974609375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.745962381362915, + "rewards/margins": 10.576457977294922, + "rewards/rejected": -12.322421073913574, + "step": 853 + }, + { + "epoch": 5.060740740740741, + "grad_norm": 0.18766328646780403, + "learning_rate": 1.7671037236732012e-07, + "logits/chosen": -1.8586299419403076, + "logits/rejected": -1.9080753326416016, + "logps/chosen": -51.48103713989258, + "logps/rejected": -86.90821075439453, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9493086338043213, + "rewards/margins": 13.482855796813965, + "rewards/rejected": -17.432165145874023, + "step": 854 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 0.18720620307407093, + "learning_rate": 1.760895363043663e-07, + "logits/chosen": -2.031804084777832, + "logits/rejected": -1.9341238737106323, + "logps/chosen": -49.05330276489258, + "logps/rejected": -82.30054473876953, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.23289680480957, + "rewards/margins": 11.15815544128418, + "rewards/rejected": -15.39105224609375, + "step": 855 + }, + { + "epoch": 5.072592592592593, + "grad_norm": 0.15775224502125676, + "learning_rate": 1.7546919930148603e-07, + "logits/chosen": -1.9293596744537354, + "logits/rejected": -2.0265965461730957, + "logps/chosen": -69.87615966796875, + "logps/rejected": -80.0749740600586, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.907338857650757, + "rewards/margins": 10.003570556640625, + "rewards/rejected": -13.910908699035645, + "step": 856 + }, + { + "epoch": 5.078518518518519, + "grad_norm": 0.19674405918676902, + "learning_rate": 1.748493655473342e-07, + "logits/chosen": -1.8837556838989258, + "logits/rejected": -2.0229501724243164, + "logps/chosen": -39.472660064697266, + "logps/rejected": -61.644203186035156, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8183362483978271, + "rewards/margins": 9.928937911987305, + "rewards/rejected": -11.747274398803711, + "step": 857 + }, + { + "epoch": 5.084444444444444, + "grad_norm": 0.1675085843386872, + "learning_rate": 1.742300392271678e-07, + "logits/chosen": -1.8151192665100098, + "logits/rejected": -1.591550588607788, + "logps/chosen": -37.466732025146484, + "logps/rejected": -74.43209838867188, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0111122131347656, + "rewards/margins": 10.86896800994873, + "rewards/rejected": -12.880081176757812, + "step": 858 + }, + { + "epoch": 5.09037037037037, + "grad_norm": 0.28954876828630643, + "learning_rate": 1.7361122452281737e-07, + "logits/chosen": -1.2304236888885498, + "logits/rejected": -1.1013797521591187, + "logps/chosen": -41.00086212158203, + "logps/rejected": -64.92744445800781, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6675610542297363, + "rewards/margins": 9.110584259033203, + "rewards/rejected": -11.778144836425781, + "step": 859 + }, + { + "epoch": 5.0962962962962965, + "grad_norm": 0.12147094194677399, + "learning_rate": 1.72992925612659e-07, + "logits/chosen": -1.4669568538665771, + "logits/rejected": -1.2288365364074707, + "logps/chosen": -42.41246032714844, + "logps/rejected": -76.5176010131836, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.155447006225586, + "rewards/margins": 11.347990989685059, + "rewards/rejected": -13.503437995910645, + "step": 860 + }, + { + "epoch": 5.102222222222222, + "grad_norm": 0.15467732711878926, + "learning_rate": 1.7237514667158596e-07, + "logits/chosen": -1.5924859046936035, + "logits/rejected": -1.5197021961212158, + "logps/chosen": -45.47554397583008, + "logps/rejected": -75.77124786376953, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6827468872070312, + "rewards/margins": 12.081143379211426, + "rewards/rejected": -14.763890266418457, + "step": 861 + }, + { + "epoch": 5.108148148148148, + "grad_norm": 0.24260116050907574, + "learning_rate": 1.7175789187098055e-07, + "logits/chosen": -1.570918083190918, + "logits/rejected": -1.6058459281921387, + "logps/chosen": -33.30390167236328, + "logps/rejected": -70.89424133300781, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9541648626327515, + "rewards/margins": 12.669885635375977, + "rewards/rejected": -14.624051094055176, + "step": 862 + }, + { + "epoch": 5.114074074074074, + "grad_norm": 0.13271833595661878, + "learning_rate": 1.7114116537868612e-07, + "logits/chosen": -1.4068008661270142, + "logits/rejected": -1.4065369367599487, + "logps/chosen": -46.0545654296875, + "logps/rejected": -69.67312622070312, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1072192192077637, + "rewards/margins": 10.404853820800781, + "rewards/rejected": -13.512073516845703, + "step": 863 + }, + { + "epoch": 5.12, + "grad_norm": 0.13102584512640375, + "learning_rate": 1.705249713589786e-07, + "logits/chosen": -1.739338755607605, + "logits/rejected": -1.6088874340057373, + "logps/chosen": -52.92829132080078, + "logps/rejected": -97.14276123046875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2332658767700195, + "rewards/margins": 12.030122756958008, + "rewards/rejected": -16.263389587402344, + "step": 864 + }, + { + "epoch": 5.125925925925926, + "grad_norm": 0.22324793093063372, + "learning_rate": 1.699093139725386e-07, + "logits/chosen": -1.5772820711135864, + "logits/rejected": -1.6475763320922852, + "logps/chosen": -52.72998809814453, + "logps/rejected": -72.01866149902344, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5473053455352783, + "rewards/margins": 11.291367530822754, + "rewards/rejected": -13.838672637939453, + "step": 865 + }, + { + "epoch": 5.131851851851851, + "grad_norm": 0.1551815052938634, + "learning_rate": 1.6929419737642322e-07, + "logits/chosen": -2.116971492767334, + "logits/rejected": -2.0724849700927734, + "logps/chosen": -45.568336486816406, + "logps/rejected": -79.17977905273438, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3480639457702637, + "rewards/margins": 11.395853042602539, + "rewards/rejected": -14.743916511535645, + "step": 866 + }, + { + "epoch": 5.137777777777778, + "grad_norm": 0.18261474128420957, + "learning_rate": 1.686796257240381e-07, + "logits/chosen": -1.4726767539978027, + "logits/rejected": -1.2904458045959473, + "logps/chosen": -41.11164093017578, + "logps/rejected": -75.76146697998047, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5571563243865967, + "rewards/margins": 11.647672653198242, + "rewards/rejected": -14.204829216003418, + "step": 867 + }, + { + "epoch": 5.143703703703704, + "grad_norm": 0.16977709275009925, + "learning_rate": 1.680656031651093e-07, + "logits/chosen": -1.4244813919067383, + "logits/rejected": -1.2991671562194824, + "logps/chosen": -48.0638427734375, + "logps/rejected": -82.90692138671875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5564422607421875, + "rewards/margins": 11.91999626159668, + "rewards/rejected": -15.476438522338867, + "step": 868 + }, + { + "epoch": 5.149629629629629, + "grad_norm": 0.09172224696780171, + "learning_rate": 1.6745213384565516e-07, + "logits/chosen": -2.0797746181488037, + "logits/rejected": -2.0070877075195312, + "logps/chosen": -53.26316833496094, + "logps/rejected": -89.38504791259766, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.355812072753906, + "rewards/margins": 11.446057319641113, + "rewards/rejected": -15.80186939239502, + "step": 869 + }, + { + "epoch": 5.155555555555556, + "grad_norm": 0.17909443988936508, + "learning_rate": 1.6683922190795845e-07, + "logits/chosen": -2.0503368377685547, + "logits/rejected": -1.7288861274719238, + "logps/chosen": -40.119171142578125, + "logps/rejected": -82.29203796386719, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1652438640594482, + "rewards/margins": 14.119874000549316, + "rewards/rejected": -16.285120010375977, + "step": 870 + }, + { + "epoch": 5.161481481481482, + "grad_norm": 0.18436994864792003, + "learning_rate": 1.6622687149053844e-07, + "logits/chosen": -2.2823660373687744, + "logits/rejected": -2.0534257888793945, + "logps/chosen": -44.072784423828125, + "logps/rejected": -83.30123901367188, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.622286319732666, + "rewards/margins": 11.508301734924316, + "rewards/rejected": -15.130587577819824, + "step": 871 + }, + { + "epoch": 5.167407407407407, + "grad_norm": 0.1822933236225203, + "learning_rate": 1.6561508672812295e-07, + "logits/chosen": -1.9030749797821045, + "logits/rejected": -1.8649938106536865, + "logps/chosen": -50.30503845214844, + "logps/rejected": -77.90472412109375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4370203018188477, + "rewards/margins": 10.136503219604492, + "rewards/rejected": -12.57352352142334, + "step": 872 + }, + { + "epoch": 5.173333333333334, + "grad_norm": 0.11100254764300277, + "learning_rate": 1.650038717516203e-07, + "logits/chosen": -1.6390033960342407, + "logits/rejected": -1.7791786193847656, + "logps/chosen": -50.51274108886719, + "logps/rejected": -55.75922393798828, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.171482801437378, + "rewards/margins": 8.132706642150879, + "rewards/rejected": -10.304189682006836, + "step": 873 + }, + { + "epoch": 5.1792592592592595, + "grad_norm": 0.1709052348065624, + "learning_rate": 1.6439323068809137e-07, + "logits/chosen": -1.5960361957550049, + "logits/rejected": -1.5929274559020996, + "logps/chosen": -44.663734436035156, + "logps/rejected": -77.63716125488281, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5435336828231812, + "rewards/margins": 11.703277587890625, + "rewards/rejected": -13.246810913085938, + "step": 874 + }, + { + "epoch": 5.185185185185185, + "grad_norm": 0.18561882864161833, + "learning_rate": 1.6378316766072196e-07, + "logits/chosen": -1.4937806129455566, + "logits/rejected": -1.445252776145935, + "logps/chosen": -50.529808044433594, + "logps/rejected": -78.3757553100586, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8628158569335938, + "rewards/margins": 11.858318328857422, + "rewards/rejected": -15.721134185791016, + "step": 875 + }, + { + "epoch": 5.191111111111111, + "grad_norm": 0.26835891565560865, + "learning_rate": 1.6317368678879496e-07, + "logits/chosen": -1.8304246664047241, + "logits/rejected": -1.7924251556396484, + "logps/chosen": -41.48052978515625, + "logps/rejected": -60.102622985839844, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.755875587463379, + "rewards/margins": 8.978135108947754, + "rewards/rejected": -11.734010696411133, + "step": 876 + }, + { + "epoch": 5.197037037037037, + "grad_norm": 0.15704330760066193, + "learning_rate": 1.6256479218766212e-07, + "logits/chosen": -2.1413064002990723, + "logits/rejected": -1.9285932779312134, + "logps/chosen": -51.57318115234375, + "logps/rejected": -98.74991607666016, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0889475345611572, + "rewards/margins": 13.694742202758789, + "rewards/rejected": -16.783689498901367, + "step": 877 + }, + { + "epoch": 5.202962962962963, + "grad_norm": 0.11321601848002678, + "learning_rate": 1.6195648796871687e-07, + "logits/chosen": -1.5476150512695312, + "logits/rejected": -1.3866195678710938, + "logps/chosen": -36.852561950683594, + "logps/rejected": -72.84454345703125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.758419990539551, + "rewards/margins": 12.511301040649414, + "rewards/rejected": -15.269720077514648, + "step": 878 + }, + { + "epoch": 5.208888888888889, + "grad_norm": 0.10351774951876742, + "learning_rate": 1.6134877823936607e-07, + "logits/chosen": -2.093661308288574, + "logits/rejected": -2.1594038009643555, + "logps/chosen": -51.953643798828125, + "logps/rejected": -86.07762145996094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.956282138824463, + "rewards/margins": 11.716033935546875, + "rewards/rejected": -14.672317504882812, + "step": 879 + }, + { + "epoch": 5.214814814814815, + "grad_norm": 0.1434146009933508, + "learning_rate": 1.6074166710300247e-07, + "logits/chosen": -1.702378749847412, + "logits/rejected": -1.7260849475860596, + "logps/chosen": -51.182071685791016, + "logps/rejected": -73.42428588867188, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.504643440246582, + "rewards/margins": 12.243188858032227, + "rewards/rejected": -14.747832298278809, + "step": 880 + }, + { + "epoch": 5.220740740740741, + "grad_norm": 0.20945502929349574, + "learning_rate": 1.60135158658977e-07, + "logits/chosen": -2.109502077102661, + "logits/rejected": -1.807983160018921, + "logps/chosen": -58.1779670715332, + "logps/rejected": -100.36227416992188, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.7934346199035645, + "rewards/margins": 12.290433883666992, + "rewards/rejected": -17.08386993408203, + "step": 881 + }, + { + "epoch": 5.226666666666667, + "grad_norm": 0.20448547318787463, + "learning_rate": 1.5952925700257115e-07, + "logits/chosen": -1.705009937286377, + "logits/rejected": -1.6618585586547852, + "logps/chosen": -45.62565612792969, + "logps/rejected": -76.2286148071289, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5053670406341553, + "rewards/margins": 12.319999694824219, + "rewards/rejected": -15.825366020202637, + "step": 882 + }, + { + "epoch": 5.232592592592592, + "grad_norm": 0.18171957750482662, + "learning_rate": 1.5892396622496905e-07, + "logits/chosen": -1.3300485610961914, + "logits/rejected": -1.0937800407409668, + "logps/chosen": -59.10209655761719, + "logps/rejected": -103.3541030883789, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.020575523376465, + "rewards/margins": 15.489795684814453, + "rewards/rejected": -19.5103702545166, + "step": 883 + }, + { + "epoch": 5.238518518518519, + "grad_norm": 0.10331028555572916, + "learning_rate": 1.5831929041323023e-07, + "logits/chosen": -1.8355892896652222, + "logits/rejected": -1.873203992843628, + "logps/chosen": -57.147979736328125, + "logps/rejected": -82.013916015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6592514514923096, + "rewards/margins": 11.781926155090332, + "rewards/rejected": -15.441177368164062, + "step": 884 + }, + { + "epoch": 5.2444444444444445, + "grad_norm": 0.15900220224599235, + "learning_rate": 1.5771523365026175e-07, + "logits/chosen": -1.9305607080459595, + "logits/rejected": -1.7835602760314941, + "logps/chosen": -37.924530029296875, + "logps/rejected": -73.7749252319336, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2176930904388428, + "rewards/margins": 10.380413055419922, + "rewards/rejected": -12.598106384277344, + "step": 885 + }, + { + "epoch": 5.25037037037037, + "grad_norm": 0.1496584668105621, + "learning_rate": 1.5711180001479068e-07, + "logits/chosen": -1.603898525238037, + "logits/rejected": -1.5381674766540527, + "logps/chosen": -35.780635833740234, + "logps/rejected": -65.23573303222656, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7302260398864746, + "rewards/margins": 9.829032897949219, + "rewards/rejected": -12.559258460998535, + "step": 886 + }, + { + "epoch": 5.256296296296297, + "grad_norm": 0.091370988937047, + "learning_rate": 1.5650899358133667e-07, + "logits/chosen": -1.788956642150879, + "logits/rejected": -1.748964548110962, + "logps/chosen": -53.1939582824707, + "logps/rejected": -82.92172241210938, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7135212421417236, + "rewards/margins": 11.983846664428711, + "rewards/rejected": -15.697367668151855, + "step": 887 + }, + { + "epoch": 5.262222222222222, + "grad_norm": 0.14689421171326186, + "learning_rate": 1.5590681842018443e-07, + "logits/chosen": -1.099969744682312, + "logits/rejected": -1.2583948373794556, + "logps/chosen": -59.88705062866211, + "logps/rejected": -84.44697570800781, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.428144454956055, + "rewards/margins": 11.820623397827148, + "rewards/rejected": -16.248767852783203, + "step": 888 + }, + { + "epoch": 5.268148148148148, + "grad_norm": 0.10821955488468121, + "learning_rate": 1.5530527859735599e-07, + "logits/chosen": -2.088761806488037, + "logits/rejected": -1.9281114339828491, + "logps/chosen": -46.75210189819336, + "logps/rejected": -88.88577270507812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3439178466796875, + "rewards/margins": 14.868821144104004, + "rewards/rejected": -18.212738037109375, + "step": 889 + }, + { + "epoch": 5.274074074074074, + "grad_norm": 0.15466292314549412, + "learning_rate": 1.5470437817458355e-07, + "logits/chosen": -2.0101559162139893, + "logits/rejected": -1.789503574371338, + "logps/chosen": -49.836647033691406, + "logps/rejected": -89.97150421142578, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7755634784698486, + "rewards/margins": 11.662132263183594, + "rewards/rejected": -15.43769645690918, + "step": 890 + }, + { + "epoch": 5.28, + "grad_norm": 0.20928188129386854, + "learning_rate": 1.5410412120928186e-07, + "logits/chosen": -1.8343095779418945, + "logits/rejected": -1.8289053440093994, + "logps/chosen": -59.51985168457031, + "logps/rejected": -88.18516540527344, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.776739120483398, + "rewards/margins": 12.286046028137207, + "rewards/rejected": -17.062786102294922, + "step": 891 + }, + { + "epoch": 5.285925925925926, + "grad_norm": 0.09170725540184624, + "learning_rate": 1.53504511754521e-07, + "logits/chosen": -1.5599915981292725, + "logits/rejected": -1.3725429773330688, + "logps/chosen": -49.52803421020508, + "logps/rejected": -90.8835220336914, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.027318954467773, + "rewards/margins": 12.441356658935547, + "rewards/rejected": -16.46867561340332, + "step": 892 + }, + { + "epoch": 5.291851851851852, + "grad_norm": 0.18130879391822655, + "learning_rate": 1.5290555385899877e-07, + "logits/chosen": -1.7436559200286865, + "logits/rejected": -1.6874361038208008, + "logps/chosen": -47.117225646972656, + "logps/rejected": -79.31859588623047, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.854762077331543, + "rewards/margins": 11.091411590576172, + "rewards/rejected": -14.946174621582031, + "step": 893 + }, + { + "epoch": 5.297777777777778, + "grad_norm": 0.11334482840840347, + "learning_rate": 1.5230725156701373e-07, + "logits/chosen": -1.4855456352233887, + "logits/rejected": -1.4895267486572266, + "logps/chosen": -48.04728317260742, + "logps/rejected": -90.0076904296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4714088439941406, + "rewards/margins": 14.348779678344727, + "rewards/rejected": -17.820188522338867, + "step": 894 + }, + { + "epoch": 5.303703703703704, + "grad_norm": 0.09948661594392598, + "learning_rate": 1.517096089184375e-07, + "logits/chosen": -1.6243767738342285, + "logits/rejected": -1.8554785251617432, + "logps/chosen": -61.03385925292969, + "logps/rejected": -71.03145599365234, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3068552017211914, + "rewards/margins": 9.547409057617188, + "rewards/rejected": -12.854263305664062, + "step": 895 + }, + { + "epoch": 5.3096296296296295, + "grad_norm": 0.19297746323084378, + "learning_rate": 1.5111262994868756e-07, + "logits/chosen": -1.6317886114120483, + "logits/rejected": -1.5646547079086304, + "logps/chosen": -43.04448699951172, + "logps/rejected": -70.97728729248047, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.640531539916992, + "rewards/margins": 9.738162994384766, + "rewards/rejected": -12.37869644165039, + "step": 896 + }, + { + "epoch": 5.315555555555555, + "grad_norm": 0.14642107606223295, + "learning_rate": 1.5051631868870019e-07, + "logits/chosen": -1.7854773998260498, + "logits/rejected": -1.6659780740737915, + "logps/chosen": -43.154239654541016, + "logps/rejected": -80.3381118774414, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.406045913696289, + "rewards/margins": 12.070757865905762, + "rewards/rejected": -15.476802825927734, + "step": 897 + }, + { + "epoch": 5.321481481481482, + "grad_norm": 0.15732900222598203, + "learning_rate": 1.499206791649032e-07, + "logits/chosen": -1.9318976402282715, + "logits/rejected": -1.8870396614074707, + "logps/chosen": -49.68609619140625, + "logps/rejected": -80.17252349853516, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.818880558013916, + "rewards/margins": 12.839373588562012, + "rewards/rejected": -16.658254623413086, + "step": 898 + }, + { + "epoch": 5.327407407407407, + "grad_norm": 0.16901663436453515, + "learning_rate": 1.4932571539918854e-07, + "logits/chosen": -1.6694287061691284, + "logits/rejected": -1.674289345741272, + "logps/chosen": -58.69844055175781, + "logps/rejected": -86.66792297363281, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.718257904052734, + "rewards/margins": 11.202542304992676, + "rewards/rejected": -15.920801162719727, + "step": 899 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.0866407359161085, + "learning_rate": 1.4873143140888537e-07, + "logits/chosen": -1.7056925296783447, + "logits/rejected": -1.4066742658615112, + "logps/chosen": -53.29381561279297, + "logps/rejected": -96.85028076171875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.774338722229004, + "rewards/margins": 13.016897201538086, + "rewards/rejected": -18.791236877441406, + "step": 900 + }, + { + "epoch": 5.33925925925926, + "grad_norm": 0.1164374083908993, + "learning_rate": 1.481378312067329e-07, + "logits/chosen": -1.958636999130249, + "logits/rejected": -1.8225376605987549, + "logps/chosen": -49.395442962646484, + "logps/rejected": -72.3202896118164, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.258683443069458, + "rewards/margins": 12.200054168701172, + "rewards/rejected": -14.458738327026367, + "step": 901 + }, + { + "epoch": 5.345185185185185, + "grad_norm": 0.18450207404639837, + "learning_rate": 1.4754491880085317e-07, + "logits/chosen": -1.6482468843460083, + "logits/rejected": -1.6017494201660156, + "logps/chosen": -46.62718200683594, + "logps/rejected": -94.2940673828125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9291470050811768, + "rewards/margins": 12.749122619628906, + "rewards/rejected": -16.678268432617188, + "step": 902 + }, + { + "epoch": 5.351111111111111, + "grad_norm": 0.11574118575383069, + "learning_rate": 1.4695269819472403e-07, + "logits/chosen": -1.1760472059249878, + "logits/rejected": -1.199681282043457, + "logps/chosen": -64.5206069946289, + "logps/rejected": -74.74624633789062, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.804712772369385, + "rewards/margins": 9.701336860656738, + "rewards/rejected": -14.506050109863281, + "step": 903 + }, + { + "epoch": 5.357037037037037, + "grad_norm": 0.14066639890033453, + "learning_rate": 1.463611733871523e-07, + "logits/chosen": -1.8428750038146973, + "logits/rejected": -1.6979756355285645, + "logps/chosen": -45.86759567260742, + "logps/rejected": -88.10768127441406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3660061359405518, + "rewards/margins": 11.283397674560547, + "rewards/rejected": -13.649404525756836, + "step": 904 + }, + { + "epoch": 5.362962962962963, + "grad_norm": 0.2518767989627491, + "learning_rate": 1.457703483722466e-07, + "logits/chosen": -1.6185064315795898, + "logits/rejected": -1.5029176473617554, + "logps/chosen": -41.58538818359375, + "logps/rejected": -73.95729064941406, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.240269422531128, + "rewards/margins": 11.645746231079102, + "rewards/rejected": -13.886014938354492, + "step": 905 + }, + { + "epoch": 5.368888888888889, + "grad_norm": 0.12174019876649274, + "learning_rate": 1.4518022713938998e-07, + "logits/chosen": -2.0943048000335693, + "logits/rejected": -2.0399320125579834, + "logps/chosen": -45.09468078613281, + "logps/rejected": -80.62596130371094, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9401047229766846, + "rewards/margins": 11.51397705078125, + "rewards/rejected": -14.454082489013672, + "step": 906 + }, + { + "epoch": 5.3748148148148145, + "grad_norm": 0.20129319769151877, + "learning_rate": 1.4459081367321407e-07, + "logits/chosen": -1.5126574039459229, + "logits/rejected": -1.61480712890625, + "logps/chosen": -43.759063720703125, + "logps/rejected": -66.43977355957031, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.753220558166504, + "rewards/margins": 9.984600067138672, + "rewards/rejected": -14.73781967163086, + "step": 907 + }, + { + "epoch": 5.380740740740741, + "grad_norm": 0.1415414744220965, + "learning_rate": 1.4400211195357103e-07, + "logits/chosen": -1.6789093017578125, + "logits/rejected": -1.8044849634170532, + "logps/chosen": -56.00989532470703, + "logps/rejected": -77.61322784423828, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8247482776641846, + "rewards/margins": 12.38948917388916, + "rewards/rejected": -16.2142391204834, + "step": 908 + }, + { + "epoch": 5.386666666666667, + "grad_norm": 0.11781108239895481, + "learning_rate": 1.4341412595550724e-07, + "logits/chosen": -1.45395028591156, + "logits/rejected": -1.4600470066070557, + "logps/chosen": -38.969261169433594, + "logps/rejected": -77.78085327148438, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7798218727111816, + "rewards/margins": 11.665205001831055, + "rewards/rejected": -14.445026397705078, + "step": 909 + }, + { + "epoch": 5.392592592592592, + "grad_norm": 0.16066753379617685, + "learning_rate": 1.428268596492364e-07, + "logits/chosen": -1.5352404117584229, + "logits/rejected": -1.5462236404418945, + "logps/chosen": -37.77696228027344, + "logps/rejected": -72.236328125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9697984457015991, + "rewards/margins": 12.46438980102539, + "rewards/rejected": -14.434188842773438, + "step": 910 + }, + { + "epoch": 5.398518518518518, + "grad_norm": 0.1829332241319529, + "learning_rate": 1.4224031700011286e-07, + "logits/chosen": -1.6134045124053955, + "logits/rejected": -1.6110115051269531, + "logps/chosen": -42.56840133666992, + "logps/rejected": -79.94451904296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0541820526123047, + "rewards/margins": 12.98902702331543, + "rewards/rejected": -16.043209075927734, + "step": 911 + }, + { + "epoch": 5.404444444444445, + "grad_norm": 0.12167838129787681, + "learning_rate": 1.416545019686042e-07, + "logits/chosen": -2.096259355545044, + "logits/rejected": -2.0034162998199463, + "logps/chosen": -46.483367919921875, + "logps/rejected": -88.5770034790039, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.900122880935669, + "rewards/margins": 14.230045318603516, + "rewards/rejected": -18.130168914794922, + "step": 912 + }, + { + "epoch": 5.41037037037037, + "grad_norm": 0.15784798830656716, + "learning_rate": 1.4106941851026544e-07, + "logits/chosen": -1.9469846487045288, + "logits/rejected": -1.8676340579986572, + "logps/chosen": -49.274776458740234, + "logps/rejected": -83.13460540771484, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6215996742248535, + "rewards/margins": 12.980171203613281, + "rewards/rejected": -16.601770401000977, + "step": 913 + }, + { + "epoch": 5.416296296296296, + "grad_norm": 0.1413161041155283, + "learning_rate": 1.4048507057571164e-07, + "logits/chosen": -1.4655190706253052, + "logits/rejected": -1.5878344774246216, + "logps/chosen": -49.412841796875, + "logps/rejected": -74.7137680053711, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.394691467285156, + "rewards/margins": 10.30486011505127, + "rewards/rejected": -14.699551582336426, + "step": 914 + }, + { + "epoch": 5.4222222222222225, + "grad_norm": 0.1583348852334666, + "learning_rate": 1.3990146211059139e-07, + "logits/chosen": -1.909976840019226, + "logits/rejected": -1.8862512111663818, + "logps/chosen": -45.07548522949219, + "logps/rejected": -66.51271057128906, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0548486709594727, + "rewards/margins": 10.233278274536133, + "rewards/rejected": -12.288125991821289, + "step": 915 + }, + { + "epoch": 5.428148148148148, + "grad_norm": 0.3217765341519717, + "learning_rate": 1.3931859705556052e-07, + "logits/chosen": -1.720643162727356, + "logits/rejected": -1.6782926321029663, + "logps/chosen": -41.83833312988281, + "logps/rejected": -73.55179595947266, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.188084840774536, + "rewards/margins": 11.182394981384277, + "rewards/rejected": -13.370479583740234, + "step": 916 + }, + { + "epoch": 5.434074074074074, + "grad_norm": 0.11661976888036214, + "learning_rate": 1.387364793462548e-07, + "logits/chosen": -1.5590168237686157, + "logits/rejected": -1.3452208042144775, + "logps/chosen": -47.607704162597656, + "logps/rejected": -85.09840393066406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.510599374771118, + "rewards/margins": 10.469550132751465, + "rewards/rejected": -12.98015022277832, + "step": 917 + }, + { + "epoch": 5.44, + "grad_norm": 0.1397991323159031, + "learning_rate": 1.38155112913264e-07, + "logits/chosen": -1.1578762531280518, + "logits/rejected": -1.1332764625549316, + "logps/chosen": -43.81012725830078, + "logps/rejected": -73.04928588867188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3922626972198486, + "rewards/margins": 11.932851791381836, + "rewards/rejected": -14.325115203857422, + "step": 918 + }, + { + "epoch": 5.445925925925926, + "grad_norm": 0.17044941604549793, + "learning_rate": 1.37574501682105e-07, + "logits/chosen": -1.7931560277938843, + "logits/rejected": -1.6720855236053467, + "logps/chosen": -47.04766845703125, + "logps/rejected": -90.76607513427734, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.885108232498169, + "rewards/margins": 14.144776344299316, + "rewards/rejected": -18.029884338378906, + "step": 919 + }, + { + "epoch": 5.451851851851852, + "grad_norm": 0.17803670632400478, + "learning_rate": 1.369946495731954e-07, + "logits/chosen": -1.4030263423919678, + "logits/rejected": -1.4680297374725342, + "logps/chosen": -58.498477935791016, + "logps/rejected": -81.02774047851562, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9646613597869873, + "rewards/margins": 11.905306816101074, + "rewards/rejected": -15.86996841430664, + "step": 920 + }, + { + "epoch": 5.457777777777777, + "grad_norm": 0.1274841560573801, + "learning_rate": 1.3641556050182707e-07, + "logits/chosen": -1.7464091777801514, + "logits/rejected": -1.6411049365997314, + "logps/chosen": -38.34832000732422, + "logps/rejected": -70.27515411376953, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4438483715057373, + "rewards/margins": 10.543989181518555, + "rewards/rejected": -12.987837791442871, + "step": 921 + }, + { + "epoch": 5.463703703703704, + "grad_norm": 0.12059465093251162, + "learning_rate": 1.3583723837813964e-07, + "logits/chosen": -1.6953315734863281, + "logits/rejected": -1.5744541883468628, + "logps/chosen": -50.8625373840332, + "logps/rejected": -85.90507507324219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.873828887939453, + "rewards/margins": 13.21931266784668, + "rewards/rejected": -17.093143463134766, + "step": 922 + }, + { + "epoch": 5.46962962962963, + "grad_norm": 0.1785860228395741, + "learning_rate": 1.3525968710709415e-07, + "logits/chosen": -2.02950382232666, + "logits/rejected": -1.9261343479156494, + "logps/chosen": -38.81111526489258, + "logps/rejected": -77.9141845703125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2182492017745972, + "rewards/margins": 14.254678726196289, + "rewards/rejected": -15.472929000854492, + "step": 923 + }, + { + "epoch": 5.475555555555555, + "grad_norm": 0.11719304740531708, + "learning_rate": 1.346829105884467e-07, + "logits/chosen": -1.734592318534851, + "logits/rejected": -1.4210288524627686, + "logps/chosen": -37.435325622558594, + "logps/rejected": -80.92240905761719, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.19169282913208, + "rewards/margins": 13.293989181518555, + "rewards/rejected": -15.485682487487793, + "step": 924 + }, + { + "epoch": 5.481481481481482, + "grad_norm": 0.17100357178124784, + "learning_rate": 1.3410691271672206e-07, + "logits/chosen": -1.3211579322814941, + "logits/rejected": -1.3886321783065796, + "logps/chosen": -43.99436950683594, + "logps/rejected": -67.71986389160156, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.496647596359253, + "rewards/margins": 9.868618965148926, + "rewards/rejected": -13.365266799926758, + "step": 925 + }, + { + "epoch": 5.4874074074074075, + "grad_norm": 0.13742557760347932, + "learning_rate": 1.335316973811874e-07, + "logits/chosen": -1.7404249906539917, + "logits/rejected": -1.7618457078933716, + "logps/chosen": -42.16370391845703, + "logps/rejected": -68.19352722167969, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6484298706054688, + "rewards/margins": 10.290946960449219, + "rewards/rejected": -12.939376831054688, + "step": 926 + }, + { + "epoch": 5.493333333333333, + "grad_norm": 0.12442842572287284, + "learning_rate": 1.32957268465826e-07, + "logits/chosen": -1.5197845697402954, + "logits/rejected": -1.4984042644500732, + "logps/chosen": -47.004085540771484, + "logps/rejected": -76.21795654296875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.030613899230957, + "rewards/margins": 11.936113357543945, + "rewards/rejected": -15.966728210449219, + "step": 927 + }, + { + "epoch": 5.499259259259259, + "grad_norm": 0.15299953515000728, + "learning_rate": 1.3238362984931113e-07, + "logits/chosen": -1.8709659576416016, + "logits/rejected": -1.7127676010131836, + "logps/chosen": -41.45587921142578, + "logps/rejected": -80.52196502685547, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.514099597930908, + "rewards/margins": 12.647805213928223, + "rewards/rejected": -16.161903381347656, + "step": 928 + }, + { + "epoch": 5.505185185185185, + "grad_norm": 0.09435887433938309, + "learning_rate": 1.318107854049797e-07, + "logits/chosen": -2.078130006790161, + "logits/rejected": -2.1746225357055664, + "logps/chosen": -44.262779235839844, + "logps/rejected": -68.68415832519531, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1330409049987793, + "rewards/margins": 11.835768699645996, + "rewards/rejected": -14.968809127807617, + "step": 929 + }, + { + "epoch": 5.511111111111111, + "grad_norm": 0.09000734814570381, + "learning_rate": 1.3123873900080628e-07, + "logits/chosen": -1.8118696212768555, + "logits/rejected": -1.8935637474060059, + "logps/chosen": -45.19469451904297, + "logps/rejected": -65.81236267089844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.033607006072998, + "rewards/margins": 10.029970169067383, + "rewards/rejected": -14.063577651977539, + "step": 930 + }, + { + "epoch": 5.517037037037037, + "grad_norm": 0.1047861210360843, + "learning_rate": 1.306674944993768e-07, + "logits/chosen": -1.4601209163665771, + "logits/rejected": -1.4323076009750366, + "logps/chosen": -45.547691345214844, + "logps/rejected": -81.56329345703125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.131673812866211, + "rewards/margins": 13.62175178527832, + "rewards/rejected": -16.75342559814453, + "step": 931 + }, + { + "epoch": 5.522962962962963, + "grad_norm": 0.11003995547439586, + "learning_rate": 1.3009705575786268e-07, + "logits/chosen": -1.8155300617218018, + "logits/rejected": -1.9376707077026367, + "logps/chosen": -46.22417449951172, + "logps/rejected": -73.66895294189453, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8170642852783203, + "rewards/margins": 10.779097557067871, + "rewards/rejected": -14.596162796020508, + "step": 932 + }, + { + "epoch": 5.528888888888889, + "grad_norm": 0.09141229143535569, + "learning_rate": 1.295274266279945e-07, + "logits/chosen": -1.5078794956207275, + "logits/rejected": -1.5662742853164673, + "logps/chosen": -50.77909851074219, + "logps/rejected": -76.8865966796875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6053050756454468, + "rewards/margins": 12.83316421508789, + "rewards/rejected": -14.438468933105469, + "step": 933 + }, + { + "epoch": 5.534814814814815, + "grad_norm": 0.21281575708183442, + "learning_rate": 1.2895861095603632e-07, + "logits/chosen": -1.824397087097168, + "logits/rejected": -1.7250351905822754, + "logps/chosen": -34.78361511230469, + "logps/rejected": -60.876487731933594, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5677258968353271, + "rewards/margins": 9.703927993774414, + "rewards/rejected": -11.271653175354004, + "step": 934 + }, + { + "epoch": 5.540740740740741, + "grad_norm": 0.16706462984138418, + "learning_rate": 1.2839061258275946e-07, + "logits/chosen": -1.4025630950927734, + "logits/rejected": -1.3066201210021973, + "logps/chosen": -44.123268127441406, + "logps/rejected": -76.74554443359375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.301388740539551, + "rewards/margins": 12.366952896118164, + "rewards/rejected": -15.668341636657715, + "step": 935 + }, + { + "epoch": 5.546666666666667, + "grad_norm": 0.1170849878787737, + "learning_rate": 1.2782343534341665e-07, + "logits/chosen": -1.8335165977478027, + "logits/rejected": -1.8167027235031128, + "logps/chosen": -46.001007080078125, + "logps/rejected": -69.01423645019531, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7690696716308594, + "rewards/margins": 10.074502944946289, + "rewards/rejected": -13.843572616577148, + "step": 936 + }, + { + "epoch": 5.5525925925925925, + "grad_norm": 0.1706348543998457, + "learning_rate": 1.2725708306771618e-07, + "logits/chosen": -1.396052360534668, + "logits/rejected": -1.3702348470687866, + "logps/chosen": -46.654876708984375, + "logps/rejected": -72.2405014038086, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5897727012634277, + "rewards/margins": 11.45168399810791, + "rewards/rejected": -15.041457176208496, + "step": 937 + }, + { + "epoch": 5.558518518518518, + "grad_norm": 0.12103662627467811, + "learning_rate": 1.266915595797961e-07, + "logits/chosen": -1.9351294040679932, + "logits/rejected": -1.8262648582458496, + "logps/chosen": -41.44812774658203, + "logps/rejected": -79.66790008544922, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.912602424621582, + "rewards/margins": 12.376932144165039, + "rewards/rejected": -16.289533615112305, + "step": 938 + }, + { + "epoch": 5.564444444444445, + "grad_norm": 0.11976343348272687, + "learning_rate": 1.2612686869819817e-07, + "logits/chosen": -1.5141785144805908, + "logits/rejected": -1.415678858757019, + "logps/chosen": -36.035640716552734, + "logps/rejected": -74.10150146484375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3020496368408203, + "rewards/margins": 12.624713897705078, + "rewards/rejected": -14.926763534545898, + "step": 939 + }, + { + "epoch": 5.57037037037037, + "grad_norm": 0.11787248363508873, + "learning_rate": 1.2556301423584208e-07, + "logits/chosen": -1.495579481124878, + "logits/rejected": -1.2809770107269287, + "logps/chosen": -56.97976303100586, + "logps/rejected": -86.33170318603516, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.188412666320801, + "rewards/margins": 10.436057090759277, + "rewards/rejected": -14.624469757080078, + "step": 940 + }, + { + "epoch": 5.576296296296296, + "grad_norm": 0.0907026532078704, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -1.603977084159851, + "logits/rejected": -1.4511686563491821, + "logps/chosen": -45.15902328491211, + "logps/rejected": -80.01466369628906, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9305174350738525, + "rewards/margins": 10.783876419067383, + "rewards/rejected": -13.714394569396973, + "step": 941 + }, + { + "epoch": 5.582222222222223, + "grad_norm": 0.1487765153354055, + "learning_rate": 1.2443782979227082e-07, + "logits/chosen": -1.8944647312164307, + "logits/rejected": -1.905172348022461, + "logps/chosen": -35.225990295410156, + "logps/rejected": -67.60298156738281, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6557745933532715, + "rewards/margins": 10.987051010131836, + "rewards/rejected": -13.64282512664795, + "step": 942 + }, + { + "epoch": 5.588148148148148, + "grad_norm": 0.17628256299454922, + "learning_rate": 1.2387650740855406e-07, + "logits/chosen": -1.564589023590088, + "logits/rejected": -1.7260979413986206, + "logps/chosen": -46.70269775390625, + "logps/rejected": -62.99655532836914, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.87866473197937, + "rewards/margins": 10.071403503417969, + "rewards/rejected": -12.950067520141602, + "step": 943 + }, + { + "epoch": 5.594074074074074, + "grad_norm": 0.12012338864560455, + "learning_rate": 1.2331603663902475e-07, + "logits/chosen": -1.7884104251861572, + "logits/rejected": -1.7316097021102905, + "logps/chosen": -49.966835021972656, + "logps/rejected": -80.75395202636719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.379611015319824, + "rewards/margins": 14.045755386352539, + "rewards/rejected": -17.425365447998047, + "step": 944 + }, + { + "epoch": 5.6, + "grad_norm": 0.14786919502739027, + "learning_rate": 1.2275642126810762e-07, + "logits/chosen": -1.4788850545883179, + "logits/rejected": -1.4324711561203003, + "logps/chosen": -40.342864990234375, + "logps/rejected": -71.33074188232422, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.499598264694214, + "rewards/margins": 11.36124324798584, + "rewards/rejected": -14.860841751098633, + "step": 945 + }, + { + "epoch": 5.605925925925926, + "grad_norm": 0.18333393490548783, + "learning_rate": 1.2219766507445144e-07, + "logits/chosen": -1.8778958320617676, + "logits/rejected": -1.5382031202316284, + "logps/chosen": -44.424560546875, + "logps/rejected": -91.03005981445312, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4865312576293945, + "rewards/margins": 14.280929565429688, + "rewards/rejected": -17.7674617767334, + "step": 946 + }, + { + "epoch": 5.611851851851852, + "grad_norm": 0.12945141594778764, + "learning_rate": 1.2163977183090368e-07, + "logits/chosen": -1.5234986543655396, + "logits/rejected": -1.3622126579284668, + "logps/chosen": -42.00763702392578, + "logps/rejected": -87.643798828125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1348018646240234, + "rewards/margins": 13.84468936920166, + "rewards/rejected": -16.9794921875, + "step": 947 + }, + { + "epoch": 5.6177777777777775, + "grad_norm": 0.21178020754931173, + "learning_rate": 1.210827453044851e-07, + "logits/chosen": -2.04561448097229, + "logits/rejected": -1.9825788736343384, + "logps/chosen": -45.74441909790039, + "logps/rejected": -82.15046691894531, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.860677719116211, + "rewards/margins": 10.71257495880127, + "rewards/rejected": -13.57325267791748, + "step": 948 + }, + { + "epoch": 5.623703703703704, + "grad_norm": 0.1455054608296158, + "learning_rate": 1.2052658925636405e-07, + "logits/chosen": -1.5713088512420654, + "logits/rejected": -1.5190423727035522, + "logps/chosen": -36.49808883666992, + "logps/rejected": -64.18106079101562, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.315652847290039, + "rewards/margins": 10.477497100830078, + "rewards/rejected": -12.7931489944458, + "step": 949 + }, + { + "epoch": 5.62962962962963, + "grad_norm": 0.2980500440824699, + "learning_rate": 1.1997130744183124e-07, + "logits/chosen": -1.6031593084335327, + "logits/rejected": -1.5546696186065674, + "logps/chosen": -63.49602508544922, + "logps/rejected": -96.17118835449219, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9462623596191406, + "rewards/margins": 13.48281192779541, + "rewards/rejected": -17.429073333740234, + "step": 950 + }, + { + "epoch": 5.635555555555555, + "grad_norm": 0.1307241810970709, + "learning_rate": 1.194169036102743e-07, + "logits/chosen": -1.7493221759796143, + "logits/rejected": -1.605499267578125, + "logps/chosen": -46.23032760620117, + "logps/rejected": -85.92477416992188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.037178993225098, + "rewards/margins": 13.696718215942383, + "rewards/rejected": -17.733896255493164, + "step": 951 + }, + { + "epoch": 5.641481481481481, + "grad_norm": 0.14975321955144041, + "learning_rate": 1.1886338150515268e-07, + "logits/chosen": -1.851197361946106, + "logits/rejected": -1.5876126289367676, + "logps/chosen": -55.22184753417969, + "logps/rejected": -99.03150177001953, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.070620536804199, + "rewards/margins": 13.874107360839844, + "rewards/rejected": -18.944726943969727, + "step": 952 + }, + { + "epoch": 5.647407407407408, + "grad_norm": 0.13874169383297966, + "learning_rate": 1.1831074486397217e-07, + "logits/chosen": -1.6220589876174927, + "logits/rejected": -1.5277056694030762, + "logps/chosen": -45.646156311035156, + "logps/rejected": -82.58694458007812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.766599655151367, + "rewards/margins": 13.377978324890137, + "rewards/rejected": -16.14457893371582, + "step": 953 + }, + { + "epoch": 5.653333333333333, + "grad_norm": 0.13578476971641049, + "learning_rate": 1.1775899741825945e-07, + "logits/chosen": -1.6966445446014404, + "logits/rejected": -1.4161500930786133, + "logps/chosen": -51.15569305419922, + "logps/rejected": -96.25025939941406, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.151477813720703, + "rewards/margins": 13.05290412902832, + "rewards/rejected": -17.204381942749023, + "step": 954 + }, + { + "epoch": 5.659259259259259, + "grad_norm": 0.12481862557973492, + "learning_rate": 1.172081428935375e-07, + "logits/chosen": -2.3386316299438477, + "logits/rejected": -2.2986228466033936, + "logps/chosen": -47.3796501159668, + "logps/rejected": -74.82670593261719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.846057415008545, + "rewards/margins": 12.71998405456543, + "rewards/rejected": -15.5660400390625, + "step": 955 + }, + { + "epoch": 5.6651851851851855, + "grad_norm": 0.1821793357669405, + "learning_rate": 1.1665818500929986e-07, + "logits/chosen": -1.7978324890136719, + "logits/rejected": -1.7424900531768799, + "logps/chosen": -57.95037841796875, + "logps/rejected": -88.53657531738281, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9283673763275146, + "rewards/margins": 12.124265670776367, + "rewards/rejected": -15.052633285522461, + "step": 956 + }, + { + "epoch": 5.671111111111111, + "grad_norm": 0.14658210734864752, + "learning_rate": 1.1610912747898605e-07, + "logits/chosen": -2.2586140632629395, + "logits/rejected": -2.166628122329712, + "logps/chosen": -44.212364196777344, + "logps/rejected": -71.91984558105469, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.876743793487549, + "rewards/margins": 10.460264205932617, + "rewards/rejected": -13.337007522583008, + "step": 957 + }, + { + "epoch": 5.677037037037037, + "grad_norm": 0.21052053428277495, + "learning_rate": 1.1556097400995585e-07, + "logits/chosen": -1.2835049629211426, + "logits/rejected": -1.3611830472946167, + "logps/chosen": -66.21566009521484, + "logps/rejected": -90.28056335449219, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.6806111335754395, + "rewards/margins": 12.183880805969238, + "rewards/rejected": -17.864492416381836, + "step": 958 + }, + { + "epoch": 5.6829629629629625, + "grad_norm": 0.12157817956464971, + "learning_rate": 1.1501372830346482e-07, + "logits/chosen": -1.7824749946594238, + "logits/rejected": -1.8065423965454102, + "logps/chosen": -35.580074310302734, + "logps/rejected": -61.021427154541016, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39305993914604187, + "rewards/margins": 9.523391723632812, + "rewards/rejected": -9.916450500488281, + "step": 959 + }, + { + "epoch": 5.688888888888889, + "grad_norm": 0.17255666494982388, + "learning_rate": 1.1446739405463899e-07, + "logits/chosen": -1.3453398942947388, + "logits/rejected": -1.2265667915344238, + "logps/chosen": -31.389596939086914, + "logps/rejected": -63.46915817260742, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0556116104125977, + "rewards/margins": 11.092000961303711, + "rewards/rejected": -13.147612571716309, + "step": 960 + }, + { + "epoch": 5.694814814814815, + "grad_norm": 0.11405289684306127, + "learning_rate": 1.1392197495245015e-07, + "logits/chosen": -1.5662834644317627, + "logits/rejected": -1.4976131916046143, + "logps/chosen": -39.92075729370117, + "logps/rejected": -71.36894226074219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.032357692718506, + "rewards/margins": 11.438801765441895, + "rewards/rejected": -13.471158981323242, + "step": 961 + }, + { + "epoch": 5.70074074074074, + "grad_norm": 0.14616719178130974, + "learning_rate": 1.1337747467969069e-07, + "logits/chosen": -1.4800591468811035, + "logits/rejected": -1.4703744649887085, + "logps/chosen": -49.12568664550781, + "logps/rejected": -86.93719482421875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.348463535308838, + "rewards/margins": 12.572733879089355, + "rewards/rejected": -16.92119789123535, + "step": 962 + }, + { + "epoch": 5.706666666666667, + "grad_norm": 0.23009834739386595, + "learning_rate": 1.1283389691294893e-07, + "logits/chosen": -2.024674415588379, + "logits/rejected": -2.035721778869629, + "logps/chosen": -53.45521545410156, + "logps/rejected": -89.40463256835938, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8616273403167725, + "rewards/margins": 12.572017669677734, + "rewards/rejected": -15.433645248413086, + "step": 963 + }, + { + "epoch": 5.712592592592593, + "grad_norm": 0.10650246877796829, + "learning_rate": 1.1229124532258421e-07, + "logits/chosen": -1.6839030981063843, + "logits/rejected": -1.6192753314971924, + "logps/chosen": -54.92471694946289, + "logps/rejected": -82.94303894042969, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.707098960876465, + "rewards/margins": 11.277144432067871, + "rewards/rejected": -15.984243392944336, + "step": 964 + }, + { + "epoch": 5.718518518518518, + "grad_norm": 0.1015957402950518, + "learning_rate": 1.1174952357270212e-07, + "logits/chosen": -1.8444169759750366, + "logits/rejected": -1.8085654973983765, + "logps/chosen": -46.73295593261719, + "logps/rejected": -75.91047668457031, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.431227207183838, + "rewards/margins": 11.139023780822754, + "rewards/rejected": -14.57025146484375, + "step": 965 + }, + { + "epoch": 5.724444444444444, + "grad_norm": 0.12271793759807745, + "learning_rate": 1.112087353211297e-07, + "logits/chosen": -2.244523286819458, + "logits/rejected": -2.1519553661346436, + "logps/chosen": -39.844268798828125, + "logps/rejected": -74.11243438720703, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3675413131713867, + "rewards/margins": 12.443099021911621, + "rewards/rejected": -14.810640335083008, + "step": 966 + }, + { + "epoch": 5.730370370370371, + "grad_norm": 0.13586249354982624, + "learning_rate": 1.1066888421939092e-07, + "logits/chosen": -1.9669585227966309, + "logits/rejected": -1.8986517190933228, + "logps/chosen": -56.03706359863281, + "logps/rejected": -96.92056274414062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9295215606689453, + "rewards/margins": 12.924680709838867, + "rewards/rejected": -15.854202270507812, + "step": 967 + }, + { + "epoch": 5.736296296296296, + "grad_norm": 0.12060708329390671, + "learning_rate": 1.1012997391268177e-07, + "logits/chosen": -1.5202209949493408, + "logits/rejected": -1.5482919216156006, + "logps/chosen": -43.142616271972656, + "logps/rejected": -70.00810241699219, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.034687042236328, + "rewards/margins": 11.336004257202148, + "rewards/rejected": -14.370691299438477, + "step": 968 + }, + { + "epoch": 5.742222222222222, + "grad_norm": 0.25218587071541326, + "learning_rate": 1.095920080398459e-07, + "logits/chosen": -1.751842975616455, + "logits/rejected": -1.7139880657196045, + "logps/chosen": -38.157073974609375, + "logps/rejected": -70.97021484375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3091049194335938, + "rewards/margins": 11.698443412780762, + "rewards/rejected": -13.007549285888672, + "step": 969 + }, + { + "epoch": 5.7481481481481485, + "grad_norm": 0.14363264584749322, + "learning_rate": 1.0905499023334979e-07, + "logits/chosen": -1.8324824571609497, + "logits/rejected": -1.9127658605575562, + "logps/chosen": -51.50800704956055, + "logps/rejected": -76.4848403930664, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.799527168273926, + "rewards/margins": 11.289259910583496, + "rewards/rejected": -15.088786125183105, + "step": 970 + }, + { + "epoch": 5.754074074074074, + "grad_norm": 0.07308955653243333, + "learning_rate": 1.0851892411925856e-07, + "logits/chosen": -1.664994239807129, + "logits/rejected": -1.5534952878952026, + "logps/chosen": -45.636905670166016, + "logps/rejected": -76.47474670410156, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9818623065948486, + "rewards/margins": 10.973186492919922, + "rewards/rejected": -12.955049514770508, + "step": 971 + }, + { + "epoch": 5.76, + "grad_norm": 0.09610117014811514, + "learning_rate": 1.0798381331721107e-07, + "logits/chosen": -1.9020330905914307, + "logits/rejected": -1.8605923652648926, + "logps/chosen": -41.162384033203125, + "logps/rejected": -84.16923522949219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8716068267822266, + "rewards/margins": 14.272635459899902, + "rewards/rejected": -17.144243240356445, + "step": 972 + }, + { + "epoch": 5.7659259259259255, + "grad_norm": 0.21835290195862744, + "learning_rate": 1.0744966144039588e-07, + "logits/chosen": -1.87041437625885, + "logits/rejected": -1.7399128675460815, + "logps/chosen": -47.70142364501953, + "logps/rejected": -84.19062805175781, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7133491039276123, + "rewards/margins": 12.153305053710938, + "rewards/rejected": -15.866655349731445, + "step": 973 + }, + { + "epoch": 5.771851851851852, + "grad_norm": 0.18795920746502257, + "learning_rate": 1.0691647209552654e-07, + "logits/chosen": -1.8339283466339111, + "logits/rejected": -1.7644309997558594, + "logps/chosen": -38.69776916503906, + "logps/rejected": -65.06787109375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1597390174865723, + "rewards/margins": 9.444230079650879, + "rewards/rejected": -11.60396957397461, + "step": 974 + }, + { + "epoch": 5.777777777777778, + "grad_norm": 0.17740674289925806, + "learning_rate": 1.0638424888281744e-07, + "logits/chosen": -1.702333927154541, + "logits/rejected": -1.5982693433761597, + "logps/chosen": -49.459510803222656, + "logps/rejected": -93.81587219238281, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.351852893829346, + "rewards/margins": 13.706995964050293, + "rewards/rejected": -18.058849334716797, + "step": 975 + }, + { + "epoch": 5.783703703703703, + "grad_norm": 0.1424353181362849, + "learning_rate": 1.0585299539595943e-07, + "logits/chosen": -1.8163714408874512, + "logits/rejected": -1.9260118007659912, + "logps/chosen": -61.91559600830078, + "logps/rejected": -82.7620849609375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.653960704803467, + "rewards/margins": 10.99573040008545, + "rewards/rejected": -15.649691581726074, + "step": 976 + }, + { + "epoch": 5.78962962962963, + "grad_norm": 0.07940058375309675, + "learning_rate": 1.0532271522209551e-07, + "logits/chosen": -1.5750871896743774, + "logits/rejected": -1.374831199645996, + "logps/chosen": -44.500892639160156, + "logps/rejected": -88.57928466796875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.226994514465332, + "rewards/margins": 14.695962905883789, + "rewards/rejected": -18.922958374023438, + "step": 977 + }, + { + "epoch": 5.795555555555556, + "grad_norm": 0.1764725717312812, + "learning_rate": 1.0479341194179658e-07, + "logits/chosen": -1.331404209136963, + "logits/rejected": -1.1948274374008179, + "logps/chosen": -38.005226135253906, + "logps/rejected": -88.23011016845703, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5174663066864014, + "rewards/margins": 14.888310432434082, + "rewards/rejected": -16.405776977539062, + "step": 978 + }, + { + "epoch": 5.801481481481481, + "grad_norm": 0.12298277994646507, + "learning_rate": 1.0426508912903764e-07, + "logits/chosen": -1.1262080669403076, + "logits/rejected": -1.158602237701416, + "logps/chosen": -50.767574310302734, + "logps/rejected": -76.22100830078125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.1396355628967285, + "rewards/margins": 11.057031631469727, + "rewards/rejected": -16.196666717529297, + "step": 979 + }, + { + "epoch": 5.807407407407408, + "grad_norm": 0.15841005567455482, + "learning_rate": 1.0373775035117305e-07, + "logits/chosen": -2.0430521965026855, + "logits/rejected": -1.843505620956421, + "logps/chosen": -34.70548629760742, + "logps/rejected": -72.10562133789062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6738539934158325, + "rewards/margins": 12.656837463378906, + "rewards/rejected": -14.33069133758545, + "step": 980 + }, + { + "epoch": 5.8133333333333335, + "grad_norm": 0.09126868696163637, + "learning_rate": 1.0321139916891281e-07, + "logits/chosen": -1.9275261163711548, + "logits/rejected": -1.5792516469955444, + "logps/chosen": -51.00112533569336, + "logps/rejected": -107.09458923339844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.374115228652954, + "rewards/margins": 14.925018310546875, + "rewards/rejected": -18.29913330078125, + "step": 981 + }, + { + "epoch": 5.819259259259259, + "grad_norm": 0.11668536193593289, + "learning_rate": 1.0268603913629858e-07, + "logits/chosen": -1.510830044746399, + "logits/rejected": -1.533048391342163, + "logps/chosen": -37.096092224121094, + "logps/rejected": -62.85737991333008, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.515608549118042, + "rewards/margins": 11.164163589477539, + "rewards/rejected": -12.67977237701416, + "step": 982 + }, + { + "epoch": 5.825185185185185, + "grad_norm": 0.23361770696046172, + "learning_rate": 1.0216167380067927e-07, + "logits/chosen": -1.7323989868164062, + "logits/rejected": -1.747807502746582, + "logps/chosen": -34.62275695800781, + "logps/rejected": -72.12071228027344, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8253674507141113, + "rewards/margins": 13.908540725708008, + "rewards/rejected": -15.733907699584961, + "step": 983 + }, + { + "epoch": 5.831111111111111, + "grad_norm": 0.277833354779492, + "learning_rate": 1.0163830670268767e-07, + "logits/chosen": -2.256314754486084, + "logits/rejected": -2.0917136669158936, + "logps/chosen": -52.86377716064453, + "logps/rejected": -85.62020874023438, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4890899658203125, + "rewards/margins": 11.622297286987305, + "rewards/rejected": -16.111387252807617, + "step": 984 + }, + { + "epoch": 5.837037037037037, + "grad_norm": 0.10893117436507617, + "learning_rate": 1.0111594137621613e-07, + "logits/chosen": -1.61838698387146, + "logits/rejected": -1.6808353662490845, + "logps/chosen": -61.41318130493164, + "logps/rejected": -90.2042465209961, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.989392280578613, + "rewards/margins": 11.840309143066406, + "rewards/rejected": -16.829700469970703, + "step": 985 + }, + { + "epoch": 5.842962962962963, + "grad_norm": 0.14083883793143315, + "learning_rate": 1.0059458134839277e-07, + "logits/chosen": -1.809838056564331, + "logits/rejected": -1.7997772693634033, + "logps/chosen": -38.19802474975586, + "logps/rejected": -81.2254638671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.70159912109375, + "rewards/margins": 13.643616676330566, + "rewards/rejected": -15.345215797424316, + "step": 986 + }, + { + "epoch": 5.848888888888889, + "grad_norm": 0.2029646340596989, + "learning_rate": 1.0007423013955782e-07, + "logits/chosen": -1.7847646474838257, + "logits/rejected": -1.6058578491210938, + "logps/chosen": -45.05035400390625, + "logps/rejected": -73.54215240478516, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.438556671142578, + "rewards/margins": 9.645893096923828, + "rewards/rejected": -13.084449768066406, + "step": 987 + }, + { + "epoch": 5.854814814814815, + "grad_norm": 0.1166149245738814, + "learning_rate": 9.955489126323954e-08, + "logits/chosen": -1.264709234237671, + "logits/rejected": -1.1719449758529663, + "logps/chosen": -39.547542572021484, + "logps/rejected": -72.57160949707031, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.485950469970703, + "rewards/margins": 12.804004669189453, + "rewards/rejected": -16.289955139160156, + "step": 988 + }, + { + "epoch": 5.860740740740741, + "grad_norm": 0.189855556650095, + "learning_rate": 9.903656822613099e-08, + "logits/chosen": -2.148167133331299, + "logits/rejected": -2.095820426940918, + "logps/chosen": -42.154842376708984, + "logps/rejected": -80.5337142944336, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.442440986633301, + "rewards/margins": 13.176408767700195, + "rewards/rejected": -15.618850708007812, + "step": 989 + }, + { + "epoch": 5.866666666666667, + "grad_norm": 0.1277843814429211, + "learning_rate": 9.851926452806583e-08, + "logits/chosen": -2.021697759628296, + "logits/rejected": -1.9521626234054565, + "logps/chosen": -53.14356994628906, + "logps/rejected": -80.20213317871094, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5833356380462646, + "rewards/margins": 11.104471206665039, + "rewards/rejected": -14.687807083129883, + "step": 990 + }, + { + "epoch": 5.872592592592593, + "grad_norm": 0.10600699030812348, + "learning_rate": 9.800298366199497e-08, + "logits/chosen": -1.6231588125228882, + "logits/rejected": -1.6354756355285645, + "logps/chosen": -51.428890228271484, + "logps/rejected": -94.6219711303711, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.853515148162842, + "rewards/margins": 14.988431930541992, + "rewards/rejected": -19.841947555541992, + "step": 991 + }, + { + "epoch": 5.8785185185185185, + "grad_norm": 0.11596245744295286, + "learning_rate": 9.748772911396291e-08, + "logits/chosen": -1.3082904815673828, + "logits/rejected": -1.233626365661621, + "logps/chosen": -40.92805099487305, + "logps/rejected": -65.74427032470703, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9067976474761963, + "rewards/margins": 10.380163192749023, + "rewards/rejected": -14.28696060180664, + "step": 992 + }, + { + "epoch": 5.884444444444444, + "grad_norm": 0.10483820314945619, + "learning_rate": 9.697350436308427e-08, + "logits/chosen": -1.7032501697540283, + "logits/rejected": -1.6479954719543457, + "logps/chosen": -39.51605987548828, + "logps/rejected": -76.45269775390625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6380763053894043, + "rewards/margins": 11.980184555053711, + "rewards/rejected": -14.618260383605957, + "step": 993 + }, + { + "epoch": 5.890370370370371, + "grad_norm": 0.11282242003298107, + "learning_rate": 9.646031288152021e-08, + "logits/chosen": -2.1721444129943848, + "logits/rejected": -2.001124382019043, + "logps/chosen": -42.80406188964844, + "logps/rejected": -88.69567108154297, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4269580841064453, + "rewards/margins": 12.612770080566406, + "rewards/rejected": -16.03972816467285, + "step": 994 + }, + { + "epoch": 5.896296296296296, + "grad_norm": 0.17922823799988089, + "learning_rate": 9.5948158134455e-08, + "logits/chosen": -1.8580384254455566, + "logits/rejected": -1.887601375579834, + "logps/chosen": -57.25939178466797, + "logps/rejected": -74.04570770263672, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.449359655380249, + "rewards/margins": 12.284698486328125, + "rewards/rejected": -15.73405647277832, + "step": 995 + }, + { + "epoch": 5.902222222222222, + "grad_norm": 0.14419271465393926, + "learning_rate": 9.543704358007279e-08, + "logits/chosen": -1.651800274848938, + "logits/rejected": -1.4914857149124146, + "logps/chosen": -36.71223449707031, + "logps/rejected": -63.70142364501953, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.186145305633545, + "rewards/margins": 10.299652099609375, + "rewards/rejected": -11.485797882080078, + "step": 996 + }, + { + "epoch": 5.908148148148149, + "grad_norm": 0.16193844665413645, + "learning_rate": 9.492697266953373e-08, + "logits/chosen": -1.8199375867843628, + "logits/rejected": -1.6269965171813965, + "logps/chosen": -47.34810256958008, + "logps/rejected": -75.77220153808594, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.279019355773926, + "rewards/margins": 10.331778526306152, + "rewards/rejected": -14.610797882080078, + "step": 997 + }, + { + "epoch": 5.914074074074074, + "grad_norm": 0.11881887411981226, + "learning_rate": 9.44179488469516e-08, + "logits/chosen": -1.6865270137786865, + "logits/rejected": -1.2623865604400635, + "logps/chosen": -44.04072952270508, + "logps/rejected": -99.73695373535156, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.052587032318115, + "rewards/margins": 15.76125431060791, + "rewards/rejected": -19.8138427734375, + "step": 998 + }, + { + "epoch": 5.92, + "grad_norm": 0.10314018796390148, + "learning_rate": 9.390997554936964e-08, + "logits/chosen": -1.4690736532211304, + "logits/rejected": -1.3281701803207397, + "logps/chosen": -53.75056457519531, + "logps/rejected": -96.96499633789062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.602512836456299, + "rewards/margins": 14.359382629394531, + "rewards/rejected": -19.961896896362305, + "step": 999 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.1513517755495629, + "learning_rate": 9.340305620673778e-08, + "logits/chosen": -1.65963876247406, + "logits/rejected": -1.7874866724014282, + "logps/chosen": -54.26740264892578, + "logps/rejected": -76.15953063964844, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9122443199157715, + "rewards/margins": 11.574843406677246, + "rewards/rejected": -15.487088203430176, + "step": 1000 + }, + { + "epoch": 5.931851851851852, + "grad_norm": 0.14425098132355896, + "learning_rate": 9.289719424188947e-08, + "logits/chosen": -2.188652276992798, + "logits/rejected": -2.323061943054199, + "logps/chosen": -58.438350677490234, + "logps/rejected": -85.29507446289062, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2114763259887695, + "rewards/margins": 11.773921966552734, + "rewards/rejected": -16.985397338867188, + "step": 1001 + }, + { + "epoch": 5.937777777777778, + "grad_norm": 0.177912208496194, + "learning_rate": 9.239239307051841e-08, + "logits/chosen": -2.1309244632720947, + "logits/rejected": -1.9711205959320068, + "logps/chosen": -40.25239562988281, + "logps/rejected": -60.29902267456055, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9188445806503296, + "rewards/margins": 9.246747970581055, + "rewards/rejected": -11.165592193603516, + "step": 1002 + }, + { + "epoch": 5.9437037037037035, + "grad_norm": 0.1561294024334232, + "learning_rate": 9.18886561011557e-08, + "logits/chosen": -1.3059730529785156, + "logits/rejected": -1.2804628610610962, + "logps/chosen": -46.57466125488281, + "logps/rejected": -81.99845123291016, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.611976146697998, + "rewards/margins": 11.705613136291504, + "rewards/rejected": -16.317588806152344, + "step": 1003 + }, + { + "epoch": 5.94962962962963, + "grad_norm": 0.2993503902397938, + "learning_rate": 9.13859867351466e-08, + "logits/chosen": -1.8890063762664795, + "logits/rejected": -2.08756685256958, + "logps/chosen": -59.0401496887207, + "logps/rejected": -76.09051513671875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.641585826873779, + "rewards/margins": 11.28979778289795, + "rewards/rejected": -15.93138313293457, + "step": 1004 + }, + { + "epoch": 5.955555555555556, + "grad_norm": 0.10981688272526485, + "learning_rate": 9.088438836662777e-08, + "logits/chosen": -1.4090893268585205, + "logits/rejected": -1.3857533931732178, + "logps/chosen": -54.12238311767578, + "logps/rejected": -91.73143005371094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.333917140960693, + "rewards/margins": 12.519246101379395, + "rewards/rejected": -17.853164672851562, + "step": 1005 + }, + { + "epoch": 5.961481481481481, + "grad_norm": 0.13439049466641606, + "learning_rate": 9.038386438250414e-08, + "logits/chosen": -1.2438244819641113, + "logits/rejected": -1.1741102933883667, + "logps/chosen": -39.77931213378906, + "logps/rejected": -71.62962341308594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.493358612060547, + "rewards/margins": 12.272336959838867, + "rewards/rejected": -14.765695571899414, + "step": 1006 + }, + { + "epoch": 5.967407407407407, + "grad_norm": 0.3528541714993207, + "learning_rate": 8.988441816242629e-08, + "logits/chosen": -1.874366044998169, + "logits/rejected": -1.843646764755249, + "logps/chosen": -46.88833999633789, + "logps/rejected": -72.9251708984375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3884449005126953, + "rewards/margins": 10.11141586303711, + "rewards/rejected": -13.499860763549805, + "step": 1007 + }, + { + "epoch": 5.973333333333334, + "grad_norm": 0.11757249904059189, + "learning_rate": 8.938605307876736e-08, + "logits/chosen": -1.7174410820007324, + "logits/rejected": -1.5187230110168457, + "logps/chosen": -38.19911575317383, + "logps/rejected": -68.39817810058594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4435136318206787, + "rewards/margins": 10.123954772949219, + "rewards/rejected": -13.56746768951416, + "step": 1008 + }, + { + "epoch": 5.979259259259259, + "grad_norm": 0.1236027401279087, + "learning_rate": 8.888877249660052e-08, + "logits/chosen": -1.4056321382522583, + "logits/rejected": -1.3889738321304321, + "logps/chosen": -46.628700256347656, + "logps/rejected": -76.94029235839844, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.220608711242676, + "rewards/margins": 10.921125411987305, + "rewards/rejected": -14.141735076904297, + "step": 1009 + }, + { + "epoch": 5.985185185185185, + "grad_norm": 0.14761964298924235, + "learning_rate": 8.839257977367609e-08, + "logits/chosen": -1.4390403032302856, + "logits/rejected": -1.1668685674667358, + "logps/chosen": -38.51154708862305, + "logps/rejected": -84.68218994140625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.108654260635376, + "rewards/margins": 14.42491340637207, + "rewards/rejected": -16.5335693359375, + "step": 1010 + }, + { + "epoch": 5.9911111111111115, + "grad_norm": 0.16578379927474013, + "learning_rate": 8.789747826039893e-08, + "logits/chosen": -1.4390288591384888, + "logits/rejected": -1.4251270294189453, + "logps/chosen": -45.86073684692383, + "logps/rejected": -74.61750793457031, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.397583961486816, + "rewards/margins": 10.961638450622559, + "rewards/rejected": -15.359222412109375, + "step": 1011 + }, + { + "epoch": 5.997037037037037, + "grad_norm": 0.23256467980724813, + "learning_rate": 8.74034712998058e-08, + "logits/chosen": -1.9970195293426514, + "logits/rejected": -1.8830376863479614, + "logps/chosen": -50.25756072998047, + "logps/rejected": -91.27782440185547, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.501843452453613, + "rewards/margins": 13.392400741577148, + "rewards/rejected": -17.894245147705078, + "step": 1012 + }, + { + "epoch": 6.002962962962963, + "grad_norm": 0.14358647909840425, + "learning_rate": 8.69105622275428e-08, + "logits/chosen": -1.5321400165557861, + "logits/rejected": -1.4904074668884277, + "logps/chosen": -34.789794921875, + "logps/rejected": -68.898193359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4096457958221436, + "rewards/margins": 13.383646965026855, + "rewards/rejected": -14.793292999267578, + "step": 1013 + }, + { + "epoch": 6.0088888888888885, + "grad_norm": 0.12844382297364182, + "learning_rate": 8.641875437184287e-08, + "logits/chosen": -1.7536540031433105, + "logits/rejected": -1.660245656967163, + "logps/chosen": -35.77042770385742, + "logps/rejected": -82.79251098632812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1235854625701904, + "rewards/margins": 14.947321891784668, + "rewards/rejected": -18.070907592773438, + "step": 1014 + }, + { + "epoch": 6.014814814814815, + "grad_norm": 0.05247327611483253, + "learning_rate": 8.592805105350326e-08, + "logits/chosen": -2.009796619415283, + "logits/rejected": -1.900392770767212, + "logps/chosen": -41.918434143066406, + "logps/rejected": -77.7911376953125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0733580589294434, + "rewards/margins": 13.40345573425293, + "rewards/rejected": -15.476814270019531, + "step": 1015 + }, + { + "epoch": 6.020740740740741, + "grad_norm": 0.10007031711708843, + "learning_rate": 8.543845558586307e-08, + "logits/chosen": -1.3085033893585205, + "logits/rejected": -1.413244366645813, + "logps/chosen": -42.41469955444336, + "logps/rejected": -72.19366455078125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.757518768310547, + "rewards/margins": 10.87429428100586, + "rewards/rejected": -13.631814956665039, + "step": 1016 + }, + { + "epoch": 6.026666666666666, + "grad_norm": 0.08269761942657197, + "learning_rate": 8.494997127478109e-08, + "logits/chosen": -1.6426849365234375, + "logits/rejected": -1.4693927764892578, + "logps/chosen": -46.25225830078125, + "logps/rejected": -79.94685363769531, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9081649780273438, + "rewards/margins": 12.332348823547363, + "rewards/rejected": -16.24051284790039, + "step": 1017 + }, + { + "epoch": 6.032592592592593, + "grad_norm": 0.0965133655534609, + "learning_rate": 8.44626014186132e-08, + "logits/chosen": -1.5757637023925781, + "logits/rejected": -1.5381988286972046, + "logps/chosen": -42.592041015625, + "logps/rejected": -73.64163970947266, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.198291540145874, + "rewards/margins": 12.328102111816406, + "rewards/rejected": -15.526392936706543, + "step": 1018 + }, + { + "epoch": 6.038518518518519, + "grad_norm": 0.09484154834530545, + "learning_rate": 8.39763493081902e-08, + "logits/chosen": -1.8162059783935547, + "logits/rejected": -1.6350376605987549, + "logps/chosen": -42.69355773925781, + "logps/rejected": -72.34944915771484, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1462531089782715, + "rewards/margins": 11.239422798156738, + "rewards/rejected": -14.385677337646484, + "step": 1019 + }, + { + "epoch": 6.044444444444444, + "grad_norm": 0.10897687127100829, + "learning_rate": 8.349121822679589e-08, + "logits/chosen": -1.4719338417053223, + "logits/rejected": -1.5809326171875, + "logps/chosen": -40.893314361572266, + "logps/rejected": -71.21835327148438, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4168126583099365, + "rewards/margins": 11.799530029296875, + "rewards/rejected": -13.216341972351074, + "step": 1020 + }, + { + "epoch": 6.05037037037037, + "grad_norm": 0.1015909348981711, + "learning_rate": 8.300721145014434e-08, + "logits/chosen": -1.1651694774627686, + "logits/rejected": -1.0104308128356934, + "logps/chosen": -44.911109924316406, + "logps/rejected": -69.9439926147461, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.591246128082275, + "rewards/margins": 12.101785659790039, + "rewards/rejected": -16.693031311035156, + "step": 1021 + }, + { + "epoch": 6.0562962962962965, + "grad_norm": 0.12301399347924664, + "learning_rate": 8.252433224635816e-08, + "logits/chosen": -1.5464973449707031, + "logits/rejected": -1.5874733924865723, + "logps/chosen": -44.601627349853516, + "logps/rejected": -84.60299682617188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1868271827697754, + "rewards/margins": 12.496557235717773, + "rewards/rejected": -15.68338394165039, + "step": 1022 + }, + { + "epoch": 6.062222222222222, + "grad_norm": 0.14071709766452006, + "learning_rate": 8.204258387594634e-08, + "logits/chosen": -1.8018912076950073, + "logits/rejected": -2.0076708793640137, + "logps/chosen": -55.555274963378906, + "logps/rejected": -76.36517333984375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.126642227172852, + "rewards/margins": 12.043182373046875, + "rewards/rejected": -17.169824600219727, + "step": 1023 + }, + { + "epoch": 6.068148148148148, + "grad_norm": 0.09827836009768653, + "learning_rate": 8.15619695917823e-08, + "logits/chosen": -1.4957119226455688, + "logits/rejected": -1.577928066253662, + "logps/chosen": -59.20314025878906, + "logps/rejected": -71.88645935058594, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.856942653656006, + "rewards/margins": 11.350852966308594, + "rewards/rejected": -16.207796096801758, + "step": 1024 + }, + { + "epoch": 6.074074074074074, + "grad_norm": 0.11371787712657735, + "learning_rate": 8.108249263908163e-08, + "logits/chosen": -1.4241392612457275, + "logits/rejected": -1.4908593893051147, + "logps/chosen": -54.07698059082031, + "logps/rejected": -98.04219055175781, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.959539890289307, + "rewards/margins": 13.624971389770508, + "rewards/rejected": -18.584510803222656, + "step": 1025 + }, + { + "epoch": 6.08, + "grad_norm": 0.13302313121136608, + "learning_rate": 8.060415625538059e-08, + "logits/chosen": -1.4159773588180542, + "logits/rejected": -1.369776725769043, + "logps/chosen": -40.464969635009766, + "logps/rejected": -70.99534606933594, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.73652982711792, + "rewards/margins": 10.462564468383789, + "rewards/rejected": -13.199094772338867, + "step": 1026 + }, + { + "epoch": 6.085925925925926, + "grad_norm": 0.11523165898367618, + "learning_rate": 8.012696367051409e-08, + "logits/chosen": -1.6489384174346924, + "logits/rejected": -1.7299697399139404, + "logps/chosen": -45.44413375854492, + "logps/rejected": -78.03041076660156, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.226395606994629, + "rewards/margins": 12.231207847595215, + "rewards/rejected": -14.45760440826416, + "step": 1027 + }, + { + "epoch": 6.091851851851851, + "grad_norm": 0.1678387929900549, + "learning_rate": 7.965091810659369e-08, + "logits/chosen": -1.6275016069412231, + "logits/rejected": -1.4610252380371094, + "logps/chosen": -38.6914176940918, + "logps/rejected": -71.39549255371094, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7591590881347656, + "rewards/margins": 10.687475204467773, + "rewards/rejected": -13.446634292602539, + "step": 1028 + }, + { + "epoch": 6.097777777777778, + "grad_norm": 0.11180465890014177, + "learning_rate": 7.917602277798612e-08, + "logits/chosen": -1.6225529909133911, + "logits/rejected": -1.6014906167984009, + "logps/chosen": -46.327056884765625, + "logps/rejected": -84.01023864746094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.628958225250244, + "rewards/margins": 13.545883178710938, + "rewards/rejected": -18.174840927124023, + "step": 1029 + }, + { + "epoch": 6.103703703703704, + "grad_norm": 0.137214967452007, + "learning_rate": 7.870228089129155e-08, + "logits/chosen": -1.7564111948013306, + "logits/rejected": -1.6935635805130005, + "logps/chosen": -38.401329040527344, + "logps/rejected": -61.10845184326172, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4874920845031738, + "rewards/margins": 11.215259552001953, + "rewards/rejected": -12.702751159667969, + "step": 1030 + }, + { + "epoch": 6.109629629629629, + "grad_norm": 0.1671060221459912, + "learning_rate": 7.822969564532167e-08, + "logits/chosen": -1.7741791009902954, + "logits/rejected": -1.744217872619629, + "logps/chosen": -38.02019500732422, + "logps/rejected": -76.86927032470703, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3427977561950684, + "rewards/margins": 13.43283462524414, + "rewards/rejected": -15.77563190460205, + "step": 1031 + }, + { + "epoch": 6.115555555555556, + "grad_norm": 0.1536051341773224, + "learning_rate": 7.775827023107834e-08, + "logits/chosen": -1.8468081951141357, + "logits/rejected": -1.5555062294006348, + "logps/chosen": -42.24263000488281, + "logps/rejected": -78.72967529296875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6049342155456543, + "rewards/margins": 12.029871940612793, + "rewards/rejected": -14.634806632995605, + "step": 1032 + }, + { + "epoch": 6.1214814814814815, + "grad_norm": 0.10753768585043083, + "learning_rate": 7.728800783173201e-08, + "logits/chosen": -2.1814775466918945, + "logits/rejected": -2.0472769737243652, + "logps/chosen": -40.55844497680664, + "logps/rejected": -85.31634521484375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7944538593292236, + "rewards/margins": 13.156540870666504, + "rewards/rejected": -15.950995445251465, + "step": 1033 + }, + { + "epoch": 6.127407407407407, + "grad_norm": 0.10291282391822812, + "learning_rate": 7.681891162260015e-08, + "logits/chosen": -1.929794430732727, + "logits/rejected": -1.872850775718689, + "logps/chosen": -39.1888427734375, + "logps/rejected": -68.86357879638672, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3173604011535645, + "rewards/margins": 10.948533058166504, + "rewards/rejected": -13.265893936157227, + "step": 1034 + }, + { + "epoch": 6.133333333333334, + "grad_norm": 0.09519974691947208, + "learning_rate": 7.635098477112587e-08, + "logits/chosen": -1.5099825859069824, + "logits/rejected": -1.426225185394287, + "logps/chosen": -42.998443603515625, + "logps/rejected": -72.8267593383789, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.638385534286499, + "rewards/margins": 10.71774673461914, + "rewards/rejected": -13.356132507324219, + "step": 1035 + }, + { + "epoch": 6.139259259259259, + "grad_norm": 0.07922820550503175, + "learning_rate": 7.588423043685646e-08, + "logits/chosen": -1.6852991580963135, + "logits/rejected": -1.421472430229187, + "logps/chosen": -49.89509201049805, + "logps/rejected": -78.23391723632812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7343664169311523, + "rewards/margins": 12.89884090423584, + "rewards/rejected": -15.633207321166992, + "step": 1036 + }, + { + "epoch": 6.145185185185185, + "grad_norm": 0.14171862753933648, + "learning_rate": 7.541865177142223e-08, + "logits/chosen": -1.5457457304000854, + "logits/rejected": -1.4155274629592896, + "logps/chosen": -42.30841827392578, + "logps/rejected": -88.99565887451172, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4843955039978027, + "rewards/margins": 15.793426513671875, + "rewards/rejected": -19.277822494506836, + "step": 1037 + }, + { + "epoch": 6.151111111111111, + "grad_norm": 0.14581198869002696, + "learning_rate": 7.4954251918515e-08, + "logits/chosen": -1.3942396640777588, + "logits/rejected": -1.353884220123291, + "logps/chosen": -44.77840042114258, + "logps/rejected": -69.60354614257812, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.442391872406006, + "rewards/margins": 10.158035278320312, + "rewards/rejected": -12.600425720214844, + "step": 1038 + }, + { + "epoch": 6.157037037037037, + "grad_norm": 0.10854236092797581, + "learning_rate": 7.449103401386702e-08, + "logits/chosen": -2.0815091133117676, + "logits/rejected": -1.9354617595672607, + "logps/chosen": -37.36747360229492, + "logps/rejected": -79.1553726196289, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.691523551940918, + "rewards/margins": 12.638469696044922, + "rewards/rejected": -15.329992294311523, + "step": 1039 + }, + { + "epoch": 6.162962962962963, + "grad_norm": 0.12985455462798487, + "learning_rate": 7.402900118522978e-08, + "logits/chosen": -1.8217581510543823, + "logits/rejected": -1.6328381299972534, + "logps/chosen": -43.85091781616211, + "logps/rejected": -96.46054077148438, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.261449337005615, + "rewards/margins": 17.501453399658203, + "rewards/rejected": -21.76290512084961, + "step": 1040 + }, + { + "epoch": 6.168888888888889, + "grad_norm": 0.12535486058516476, + "learning_rate": 7.356815655235286e-08, + "logits/chosen": -1.4930031299591064, + "logits/rejected": -1.5464279651641846, + "logps/chosen": -48.69932556152344, + "logps/rejected": -79.17095184326172, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.835171222686768, + "rewards/margins": 11.31641960144043, + "rewards/rejected": -16.15159034729004, + "step": 1041 + }, + { + "epoch": 6.174814814814815, + "grad_norm": 0.11849196685478744, + "learning_rate": 7.310850322696283e-08, + "logits/chosen": -1.7987161874771118, + "logits/rejected": -1.7363475561141968, + "logps/chosen": -35.88686752319336, + "logps/rejected": -63.077423095703125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5645644664764404, + "rewards/margins": 11.16557788848877, + "rewards/rejected": -13.730142593383789, + "step": 1042 + }, + { + "epoch": 6.180740740740741, + "grad_norm": 0.1458720969738312, + "learning_rate": 7.265004431274236e-08, + "logits/chosen": -1.8062262535095215, + "logits/rejected": -1.7760688066482544, + "logps/chosen": -41.64423370361328, + "logps/rejected": -69.88961029052734, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.090489625930786, + "rewards/margins": 13.364202499389648, + "rewards/rejected": -15.454690933227539, + "step": 1043 + }, + { + "epoch": 6.1866666666666665, + "grad_norm": 0.15732193479335854, + "learning_rate": 7.219278290530909e-08, + "logits/chosen": -1.9366602897644043, + "logits/rejected": -1.9759104251861572, + "logps/chosen": -51.57427978515625, + "logps/rejected": -78.43927001953125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0921897888183594, + "rewards/margins": 12.398733139038086, + "rewards/rejected": -15.490922927856445, + "step": 1044 + }, + { + "epoch": 6.192592592592592, + "grad_norm": 0.162687196124052, + "learning_rate": 7.173672209219494e-08, + "logits/chosen": -1.7788739204406738, + "logits/rejected": -1.6164450645446777, + "logps/chosen": -45.710723876953125, + "logps/rejected": -83.01245880126953, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6430821418762207, + "rewards/margins": 12.03801441192627, + "rewards/rejected": -14.681096076965332, + "step": 1045 + }, + { + "epoch": 6.198518518518519, + "grad_norm": 0.12348007532130807, + "learning_rate": 7.128186495282507e-08, + "logits/chosen": -2.1181507110595703, + "logits/rejected": -2.0345942974090576, + "logps/chosen": -39.752044677734375, + "logps/rejected": -77.72023010253906, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7648587226867676, + "rewards/margins": 11.238019943237305, + "rewards/rejected": -14.00287914276123, + "step": 1046 + }, + { + "epoch": 6.204444444444444, + "grad_norm": 0.13581994327741484, + "learning_rate": 7.082821455849717e-08, + "logits/chosen": -1.4847235679626465, + "logits/rejected": -1.4669365882873535, + "logps/chosen": -53.27600860595703, + "logps/rejected": -86.11213684082031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.276714324951172, + "rewards/margins": 12.011651992797852, + "rewards/rejected": -17.288368225097656, + "step": 1047 + }, + { + "epoch": 6.21037037037037, + "grad_norm": 0.08876930704947257, + "learning_rate": 7.037577397236074e-08, + "logits/chosen": -1.8768885135650635, + "logits/rejected": -1.6755867004394531, + "logps/chosen": -52.622314453125, + "logps/rejected": -87.13565063476562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.662373065948486, + "rewards/margins": 13.155412673950195, + "rewards/rejected": -17.817785263061523, + "step": 1048 + }, + { + "epoch": 6.216296296296297, + "grad_norm": 0.15555493957770292, + "learning_rate": 6.992454624939636e-08, + "logits/chosen": -2.0798707008361816, + "logits/rejected": -1.913529872894287, + "logps/chosen": -46.629173278808594, + "logps/rejected": -91.40234375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7776923179626465, + "rewards/margins": 12.142569541931152, + "rewards/rejected": -15.920262336730957, + "step": 1049 + }, + { + "epoch": 6.222222222222222, + "grad_norm": 0.12171810468034984, + "learning_rate": 6.947453443639514e-08, + "logits/chosen": -1.5369203090667725, + "logits/rejected": -1.4309501647949219, + "logps/chosen": -45.19132995605469, + "logps/rejected": -82.57927703857422, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4305639266967773, + "rewards/margins": 12.945052146911621, + "rewards/rejected": -16.3756160736084, + "step": 1050 + }, + { + "epoch": 6.228148148148148, + "grad_norm": 0.18943800217365878, + "learning_rate": 6.902574157193794e-08, + "logits/chosen": -1.2610238790512085, + "logits/rejected": -1.3482708930969238, + "logps/chosen": -56.56427764892578, + "logps/rejected": -74.21857452392578, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.313958168029785, + "rewards/margins": 10.111957550048828, + "rewards/rejected": -15.42591667175293, + "step": 1051 + }, + { + "epoch": 6.234074074074074, + "grad_norm": 0.1220262778690326, + "learning_rate": 6.857817068637526e-08, + "logits/chosen": -1.7904787063598633, + "logits/rejected": -1.857863426208496, + "logps/chosen": -53.34080123901367, + "logps/rejected": -70.02799224853516, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.458836317062378, + "rewards/margins": 11.524408340454102, + "rewards/rejected": -12.983244895935059, + "step": 1052 + }, + { + "epoch": 6.24, + "grad_norm": 0.13902229219240764, + "learning_rate": 6.81318248018064e-08, + "logits/chosen": -1.5270187854766846, + "logits/rejected": -1.3275320529937744, + "logps/chosen": -49.507606506347656, + "logps/rejected": -101.7363052368164, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.40425443649292, + "rewards/margins": 15.39886474609375, + "rewards/rejected": -18.803119659423828, + "step": 1053 + }, + { + "epoch": 6.245925925925926, + "grad_norm": 0.1279638964163061, + "learning_rate": 6.7686706932059e-08, + "logits/chosen": -1.7549011707305908, + "logits/rejected": -1.5517879724502563, + "logps/chosen": -47.363651275634766, + "logps/rejected": -83.65386962890625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2582550048828125, + "rewards/margins": 12.157743453979492, + "rewards/rejected": -16.415998458862305, + "step": 1054 + }, + { + "epoch": 6.2518518518518515, + "grad_norm": 0.09007238556981374, + "learning_rate": 6.72428200826691e-08, + "logits/chosen": -2.5198426246643066, + "logits/rejected": -2.465240001678467, + "logps/chosen": -55.144351959228516, + "logps/rejected": -88.67027282714844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7151663303375244, + "rewards/margins": 13.719581604003906, + "rewards/rejected": -16.43474769592285, + "step": 1055 + }, + { + "epoch": 6.257777777777778, + "grad_norm": 0.1288368518732618, + "learning_rate": 6.680016725086052e-08, + "logits/chosen": -1.9952564239501953, + "logits/rejected": -1.8888041973114014, + "logps/chosen": -43.91956329345703, + "logps/rejected": -81.23291778564453, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3017802238464355, + "rewards/margins": 12.755228042602539, + "rewards/rejected": -16.057010650634766, + "step": 1056 + }, + { + "epoch": 6.263703703703704, + "grad_norm": 0.11925320073673232, + "learning_rate": 6.635875142552475e-08, + "logits/chosen": -1.750340223312378, + "logits/rejected": -1.4810287952423096, + "logps/chosen": -48.30830764770508, + "logps/rejected": -90.56777954101562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.03714656829834, + "rewards/margins": 14.28243637084961, + "rewards/rejected": -18.319583892822266, + "step": 1057 + }, + { + "epoch": 6.269629629629629, + "grad_norm": 0.14219106176572774, + "learning_rate": 6.591857558720071e-08, + "logits/chosen": -1.4469225406646729, + "logits/rejected": -1.4411016702651978, + "logps/chosen": -36.428924560546875, + "logps/rejected": -62.26043701171875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5755970478057861, + "rewards/margins": 11.159796714782715, + "rewards/rejected": -12.735394477844238, + "step": 1058 + }, + { + "epoch": 6.275555555555556, + "grad_norm": 0.14888832078955183, + "learning_rate": 6.547964270805467e-08, + "logits/chosen": -1.8655922412872314, + "logits/rejected": -1.5784927606582642, + "logps/chosen": -34.269927978515625, + "logps/rejected": -79.37550354003906, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8423409461975098, + "rewards/margins": 14.538299560546875, + "rewards/rejected": -16.380640029907227, + "step": 1059 + }, + { + "epoch": 6.281481481481482, + "grad_norm": 0.16797342298925552, + "learning_rate": 6.504195575186008e-08, + "logits/chosen": -1.7597219944000244, + "logits/rejected": -1.5347667932510376, + "logps/chosen": -45.811729431152344, + "logps/rejected": -86.42766571044922, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.779290199279785, + "rewards/margins": 11.868831634521484, + "rewards/rejected": -15.64812183380127, + "step": 1060 + }, + { + "epoch": 6.287407407407407, + "grad_norm": 0.17570328072046215, + "learning_rate": 6.460551767397784e-08, + "logits/chosen": -1.8049561977386475, + "logits/rejected": -1.6960246562957764, + "logps/chosen": -43.38131332397461, + "logps/rejected": -77.1976318359375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0591611862182617, + "rewards/margins": 13.242414474487305, + "rewards/rejected": -16.301576614379883, + "step": 1061 + }, + { + "epoch": 6.293333333333333, + "grad_norm": 0.12263604264249418, + "learning_rate": 6.417033142133593e-08, + "logits/chosen": -1.7794426679611206, + "logits/rejected": -1.6573376655578613, + "logps/chosen": -36.931121826171875, + "logps/rejected": -74.627685546875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.626795530319214, + "rewards/margins": 12.069509506225586, + "rewards/rejected": -14.696305274963379, + "step": 1062 + }, + { + "epoch": 6.29925925925926, + "grad_norm": 0.1182645145918132, + "learning_rate": 6.37363999324098e-08, + "logits/chosen": -1.6106207370758057, + "logits/rejected": -1.441240906715393, + "logps/chosen": -37.177764892578125, + "logps/rejected": -76.4002914428711, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1715197563171387, + "rewards/margins": 14.079158782958984, + "rewards/rejected": -16.25067901611328, + "step": 1063 + }, + { + "epoch": 6.305185185185185, + "grad_norm": 0.1829491566195646, + "learning_rate": 6.330372613720247e-08, + "logits/chosen": -1.360914945602417, + "logits/rejected": -1.3094801902770996, + "logps/chosen": -45.058780670166016, + "logps/rejected": -70.208984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.909440755844116, + "rewards/margins": 9.265682220458984, + "rewards/rejected": -13.175124168395996, + "step": 1064 + }, + { + "epoch": 6.311111111111111, + "grad_norm": 0.10702577184483547, + "learning_rate": 6.28723129572247e-08, + "logits/chosen": -1.6841546297073364, + "logits/rejected": -1.691527247428894, + "logps/chosen": -57.962867736816406, + "logps/rejected": -83.77704620361328, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.677487373352051, + "rewards/margins": 12.261930465698242, + "rewards/rejected": -15.939416885375977, + "step": 1065 + }, + { + "epoch": 6.3170370370370375, + "grad_norm": 0.10785794121226135, + "learning_rate": 6.244216330547533e-08, + "logits/chosen": -1.9304916858673096, + "logits/rejected": -1.8183112144470215, + "logps/chosen": -39.911956787109375, + "logps/rejected": -63.655853271484375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.665860891342163, + "rewards/margins": 12.657873153686523, + "rewards/rejected": -15.323734283447266, + "step": 1066 + }, + { + "epoch": 6.322962962962963, + "grad_norm": 0.1432749380231685, + "learning_rate": 6.201328008642159e-08, + "logits/chosen": -1.8575468063354492, + "logits/rejected": -1.846190333366394, + "logps/chosen": -42.935150146484375, + "logps/rejected": -63.82366943359375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.917294502258301, + "rewards/margins": 11.173959732055664, + "rewards/rejected": -14.091255187988281, + "step": 1067 + }, + { + "epoch": 6.328888888888889, + "grad_norm": 0.1765937601001302, + "learning_rate": 6.158566619597932e-08, + "logits/chosen": -1.4488952159881592, + "logits/rejected": -1.4074013233184814, + "logps/chosen": -36.881935119628906, + "logps/rejected": -72.05622863769531, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9991321563720703, + "rewards/margins": 12.601561546325684, + "rewards/rejected": -16.60069465637207, + "step": 1068 + }, + { + "epoch": 6.3348148148148145, + "grad_norm": 0.15153245583303798, + "learning_rate": 6.115932452149372e-08, + "logits/chosen": -1.8595564365386963, + "logits/rejected": -1.6728187799453735, + "logps/chosen": -38.90936279296875, + "logps/rejected": -69.89384460449219, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4666457176208496, + "rewards/margins": 11.830467224121094, + "rewards/rejected": -13.297114372253418, + "step": 1069 + }, + { + "epoch": 6.340740740740741, + "grad_norm": 0.11794101889037722, + "learning_rate": 6.07342579417196e-08, + "logits/chosen": -1.581072449684143, + "logits/rejected": -1.4845892190933228, + "logps/chosen": -43.72598648071289, + "logps/rejected": -79.68162536621094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1735429763793945, + "rewards/margins": 13.215681076049805, + "rewards/rejected": -16.389225006103516, + "step": 1070 + }, + { + "epoch": 6.346666666666667, + "grad_norm": 0.08378610393720609, + "learning_rate": 6.031046932680229e-08, + "logits/chosen": -2.0058701038360596, + "logits/rejected": -1.950735092163086, + "logps/chosen": -50.23748016357422, + "logps/rejected": -87.69313049316406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.4842119216918945, + "rewards/margins": 12.081066131591797, + "rewards/rejected": -16.565277099609375, + "step": 1071 + }, + { + "epoch": 6.352592592592592, + "grad_norm": 0.0721461908064281, + "learning_rate": 5.988796153825768e-08, + "logits/chosen": -1.5199872255325317, + "logits/rejected": -1.5319875478744507, + "logps/chosen": -61.35587692260742, + "logps/rejected": -80.61036682128906, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.392558574676514, + "rewards/margins": 11.08071231842041, + "rewards/rejected": -16.473270416259766, + "step": 1072 + }, + { + "epoch": 6.358518518518519, + "grad_norm": 0.1428585794653907, + "learning_rate": 5.9466737428953444e-08, + "logits/chosen": -2.0055718421936035, + "logits/rejected": -1.690445899963379, + "logps/chosen": -50.62556457519531, + "logps/rejected": -103.78714752197266, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.993103981018066, + "rewards/margins": 14.962833404541016, + "rewards/rejected": -19.955936431884766, + "step": 1073 + }, + { + "epoch": 6.364444444444445, + "grad_norm": 0.09808909586979048, + "learning_rate": 5.9046799843089464e-08, + "logits/chosen": -1.5556331872940063, + "logits/rejected": -1.5716986656188965, + "logps/chosen": -37.94038772583008, + "logps/rejected": -64.6926040649414, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3914966583251953, + "rewards/margins": 11.77543830871582, + "rewards/rejected": -14.166935920715332, + "step": 1074 + }, + { + "epoch": 6.37037037037037, + "grad_norm": 0.08266518492599137, + "learning_rate": 5.862815161617879e-08, + "logits/chosen": -1.463660717010498, + "logits/rejected": -1.4374220371246338, + "logps/chosen": -50.42168045043945, + "logps/rejected": -88.21818542480469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8865365982055664, + "rewards/margins": 14.44270133972168, + "rewards/rejected": -17.329238891601562, + "step": 1075 + }, + { + "epoch": 6.376296296296296, + "grad_norm": 0.09520254743607896, + "learning_rate": 5.8210795575028395e-08, + "logits/chosen": -1.9621179103851318, + "logits/rejected": -1.7960467338562012, + "logps/chosen": -48.25514221191406, + "logps/rejected": -96.09195709228516, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.190402507781982, + "rewards/margins": 13.114240646362305, + "rewards/rejected": -17.304643630981445, + "step": 1076 + }, + { + "epoch": 6.3822222222222225, + "grad_norm": 0.10568535832114431, + "learning_rate": 5.7794734537720156e-08, + "logits/chosen": -1.6859076023101807, + "logits/rejected": -1.7142068147659302, + "logps/chosen": -60.96441650390625, + "logps/rejected": -74.63279724121094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.792672157287598, + "rewards/margins": 9.826908111572266, + "rewards/rejected": -14.61958122253418, + "step": 1077 + }, + { + "epoch": 6.388148148148148, + "grad_norm": 0.12543603244981047, + "learning_rate": 5.7379971313591736e-08, + "logits/chosen": -1.7731202840805054, + "logits/rejected": -1.796823263168335, + "logps/chosen": -63.01026916503906, + "logps/rejected": -96.41645812988281, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.990747928619385, + "rewards/margins": 12.424217224121094, + "rewards/rejected": -18.41496467590332, + "step": 1078 + }, + { + "epoch": 6.394074074074074, + "grad_norm": 0.1373856939972385, + "learning_rate": 5.69665087032177e-08, + "logits/chosen": -1.725167989730835, + "logits/rejected": -1.743959903717041, + "logps/chosen": -42.21112060546875, + "logps/rejected": -65.99535369873047, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9228241443634033, + "rewards/margins": 9.872057914733887, + "rewards/rejected": -12.794881820678711, + "step": 1079 + }, + { + "epoch": 6.4, + "grad_norm": 0.07565936247852624, + "learning_rate": 5.6554349498390606e-08, + "logits/chosen": -1.7092314958572388, + "logits/rejected": -1.7479329109191895, + "logps/chosen": -42.521568298339844, + "logps/rejected": -76.67975616455078, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7571520805358887, + "rewards/margins": 11.863056182861328, + "rewards/rejected": -15.620209693908691, + "step": 1080 + }, + { + "epoch": 6.405925925925926, + "grad_norm": 0.11025606213310722, + "learning_rate": 5.614349648210212e-08, + "logits/chosen": -1.960831642150879, + "logits/rejected": -1.9402929544448853, + "logps/chosen": -46.01992416381836, + "logps/rejected": -71.65948486328125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4495725631713867, + "rewards/margins": 11.952836990356445, + "rewards/rejected": -14.402410507202148, + "step": 1081 + }, + { + "epoch": 6.411851851851852, + "grad_norm": 0.11391905861785986, + "learning_rate": 5.573395242852416e-08, + "logits/chosen": -1.9040777683258057, + "logits/rejected": -1.6277384757995605, + "logps/chosen": -45.58112335205078, + "logps/rejected": -93.3453598022461, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9349770545959473, + "rewards/margins": 14.115599632263184, + "rewards/rejected": -18.05057716369629, + "step": 1082 + }, + { + "epoch": 6.417777777777777, + "grad_norm": 0.15868471217100444, + "learning_rate": 5.532572010299033e-08, + "logits/chosen": -1.5244793891906738, + "logits/rejected": -1.5053770542144775, + "logps/chosen": -38.31887435913086, + "logps/rejected": -68.69171142578125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4086740016937256, + "rewards/margins": 10.506559371948242, + "rewards/rejected": -11.91523265838623, + "step": 1083 + }, + { + "epoch": 6.423703703703704, + "grad_norm": 0.11512703826874703, + "learning_rate": 5.4918802261977067e-08, + "logits/chosen": -1.4734156131744385, + "logits/rejected": -1.5725305080413818, + "logps/chosen": -47.85783386230469, + "logps/rejected": -72.31632995605469, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.557512044906616, + "rewards/margins": 10.896509170532227, + "rewards/rejected": -14.454021453857422, + "step": 1084 + }, + { + "epoch": 6.42962962962963, + "grad_norm": 0.13099659731085148, + "learning_rate": 5.451320165308518e-08, + "logits/chosen": -1.9137905836105347, + "logits/rejected": -1.7296216487884521, + "logps/chosen": -47.043785095214844, + "logps/rejected": -89.14530944824219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.782802581787109, + "rewards/margins": 12.57824420928955, + "rewards/rejected": -17.361045837402344, + "step": 1085 + }, + { + "epoch": 6.435555555555555, + "grad_norm": 0.15877214990532035, + "learning_rate": 5.410892101502118e-08, + "logits/chosen": -1.1672827005386353, + "logits/rejected": -1.379494309425354, + "logps/chosen": -50.326271057128906, + "logps/rejected": -83.04608154296875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.681979179382324, + "rewards/margins": 14.329689979553223, + "rewards/rejected": -19.011669158935547, + "step": 1086 + }, + { + "epoch": 6.441481481481482, + "grad_norm": 0.08272935062465174, + "learning_rate": 5.370596307757885e-08, + "logits/chosen": -1.2756975889205933, + "logits/rejected": -1.0763273239135742, + "logps/chosen": -39.87290954589844, + "logps/rejected": -84.71408081054688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9266140460968018, + "rewards/margins": 14.486490249633789, + "rewards/rejected": -17.413105010986328, + "step": 1087 + }, + { + "epoch": 6.4474074074074075, + "grad_norm": 0.14231261967741088, + "learning_rate": 5.330433056162084e-08, + "logits/chosen": -1.8586760759353638, + "logits/rejected": -1.7465415000915527, + "logps/chosen": -44.88026428222656, + "logps/rejected": -62.242008209228516, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.546351432800293, + "rewards/margins": 10.57691478729248, + "rewards/rejected": -13.123266220092773, + "step": 1088 + }, + { + "epoch": 6.453333333333333, + "grad_norm": 0.1266739271924691, + "learning_rate": 5.29040261790602e-08, + "logits/chosen": -1.321254849433899, + "logits/rejected": -1.3049745559692383, + "logps/chosen": -63.391944885253906, + "logps/rejected": -92.37260437011719, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.090176105499268, + "rewards/margins": 13.020833015441895, + "rewards/rejected": -17.11100959777832, + "step": 1089 + }, + { + "epoch": 6.459259259259259, + "grad_norm": 0.1378265215323224, + "learning_rate": 5.2505052632842187e-08, + "logits/chosen": -2.24951171875, + "logits/rejected": -2.298269033432007, + "logps/chosen": -40.54319381713867, + "logps/rejected": -66.77867889404297, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7393558025360107, + "rewards/margins": 11.914773941040039, + "rewards/rejected": -13.654130935668945, + "step": 1090 + }, + { + "epoch": 6.465185185185185, + "grad_norm": 0.06869704455843761, + "learning_rate": 5.210741261692586e-08, + "logits/chosen": -2.1489977836608887, + "logits/rejected": -2.0202672481536865, + "logps/chosen": -35.78162384033203, + "logps/rejected": -75.97494506835938, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.267564535140991, + "rewards/margins": 13.613672256469727, + "rewards/rejected": -15.881235122680664, + "step": 1091 + }, + { + "epoch": 6.471111111111111, + "grad_norm": 0.11844219734103974, + "learning_rate": 5.171110881626603e-08, + "logits/chosen": -1.2343411445617676, + "logits/rejected": -1.0903527736663818, + "logps/chosen": -46.51551055908203, + "logps/rejected": -68.22870635986328, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7705905437469482, + "rewards/margins": 10.409194946289062, + "rewards/rejected": -14.179784774780273, + "step": 1092 + }, + { + "epoch": 6.477037037037037, + "grad_norm": 0.0798285794405465, + "learning_rate": 5.1316143906795175e-08, + "logits/chosen": -1.5782904624938965, + "logits/rejected": -1.5185699462890625, + "logps/chosen": -50.782135009765625, + "logps/rejected": -86.63662719726562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.781415939331055, + "rewards/margins": 13.581457138061523, + "rewards/rejected": -18.362873077392578, + "step": 1093 + }, + { + "epoch": 6.482962962962963, + "grad_norm": 0.08253145895304921, + "learning_rate": 5.092252055540513e-08, + "logits/chosen": -2.0359816551208496, + "logits/rejected": -1.9892759323120117, + "logps/chosen": -48.8431396484375, + "logps/rejected": -75.51835632324219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.010666608810425, + "rewards/margins": 10.410385131835938, + "rewards/rejected": -13.421051979064941, + "step": 1094 + }, + { + "epoch": 6.488888888888889, + "grad_norm": 0.16430761634687271, + "learning_rate": 5.053024141992934e-08, + "logits/chosen": -1.6572614908218384, + "logits/rejected": -1.7257788181304932, + "logps/chosen": -35.166404724121094, + "logps/rejected": -56.80643844604492, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8248682022094727, + "rewards/margins": 10.291166305541992, + "rewards/rejected": -12.116033554077148, + "step": 1095 + }, + { + "epoch": 6.494814814814815, + "grad_norm": 0.11363606507794657, + "learning_rate": 5.013930914912476e-08, + "logits/chosen": -1.8092420101165771, + "logits/rejected": -1.643286943435669, + "logps/chosen": -36.295989990234375, + "logps/rejected": -69.26307678222656, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8994760513305664, + "rewards/margins": 11.408160209655762, + "rewards/rejected": -14.307637214660645, + "step": 1096 + }, + { + "epoch": 6.50074074074074, + "grad_norm": 0.1640644182662116, + "learning_rate": 4.97497263826539e-08, + "logits/chosen": -1.8906135559082031, + "logits/rejected": -1.6718571186065674, + "logps/chosen": -32.947410583496094, + "logps/rejected": -79.05558776855469, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.182114601135254, + "rewards/margins": 13.58568000793457, + "rewards/rejected": -14.767793655395508, + "step": 1097 + }, + { + "epoch": 6.506666666666667, + "grad_norm": 0.21762442747119778, + "learning_rate": 4.936149575106727e-08, + "logits/chosen": -2.068748950958252, + "logits/rejected": -1.930734395980835, + "logps/chosen": -56.75259780883789, + "logps/rejected": -74.71015930175781, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.925926923751831, + "rewards/margins": 10.583144187927246, + "rewards/rejected": -14.509071350097656, + "step": 1098 + }, + { + "epoch": 6.5125925925925925, + "grad_norm": 0.1661638343862068, + "learning_rate": 4.897461987578541e-08, + "logits/chosen": -1.9975709915161133, + "logits/rejected": -2.0516114234924316, + "logps/chosen": -34.44652557373047, + "logps/rejected": -62.443790435791016, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.581756353378296, + "rewards/margins": 11.710376739501953, + "rewards/rejected": -13.292133331298828, + "step": 1099 + }, + { + "epoch": 6.518518518518518, + "grad_norm": 0.11832971936961147, + "learning_rate": 4.8589101369081235e-08, + "logits/chosen": -2.3408203125, + "logits/rejected": -2.202461004257202, + "logps/chosen": -41.1694450378418, + "logps/rejected": -70.74188232421875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2778067588806152, + "rewards/margins": 10.766231536865234, + "rewards/rejected": -13.044036865234375, + "step": 1100 + }, + { + "epoch": 6.524444444444445, + "grad_norm": 0.11115353794278574, + "learning_rate": 4.8204942834062373e-08, + "logits/chosen": -1.5817288160324097, + "logits/rejected": -1.3780860900878906, + "logps/chosen": -32.763893127441406, + "logps/rejected": -64.86897277832031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.493917942047119, + "rewards/margins": 10.884139060974121, + "rewards/rejected": -13.378057479858398, + "step": 1101 + }, + { + "epoch": 6.53037037037037, + "grad_norm": 0.0986118294670484, + "learning_rate": 4.7822146864653744e-08, + "logits/chosen": -1.5795146226882935, + "logits/rejected": -1.5201057195663452, + "logps/chosen": -52.62881088256836, + "logps/rejected": -87.83228302001953, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7620527744293213, + "rewards/margins": 13.284611701965332, + "rewards/rejected": -17.04666519165039, + "step": 1102 + }, + { + "epoch": 6.536296296296296, + "grad_norm": 0.1151527660718973, + "learning_rate": 4.744071604557978e-08, + "logits/chosen": -1.3033947944641113, + "logits/rejected": -1.3502681255340576, + "logps/chosen": -41.56209945678711, + "logps/rejected": -65.2261734008789, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.61061954498291, + "rewards/margins": 9.524845123291016, + "rewards/rejected": -13.135465621948242, + "step": 1103 + }, + { + "epoch": 6.542222222222223, + "grad_norm": 0.11668221228478101, + "learning_rate": 4.706065295234718e-08, + "logits/chosen": -1.6934748888015747, + "logits/rejected": -1.5031492710113525, + "logps/chosen": -42.92441177368164, + "logps/rejected": -85.13064575195312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7842092514038086, + "rewards/margins": 12.543503761291504, + "rewards/rejected": -15.327713012695312, + "step": 1104 + }, + { + "epoch": 6.548148148148148, + "grad_norm": 0.08872367783103548, + "learning_rate": 4.668196015122736e-08, + "logits/chosen": -1.4549543857574463, + "logits/rejected": -1.4374938011169434, + "logps/chosen": -45.51918411254883, + "logps/rejected": -70.88013458251953, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.88723087310791, + "rewards/margins": 9.359696388244629, + "rewards/rejected": -13.246927261352539, + "step": 1105 + }, + { + "epoch": 6.554074074074074, + "grad_norm": 0.09173901658647882, + "learning_rate": 4.630464019923932e-08, + "logits/chosen": -1.9627599716186523, + "logits/rejected": -1.9463365077972412, + "logps/chosen": -40.94268798828125, + "logps/rejected": -73.16978454589844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.460191488265991, + "rewards/margins": 12.181102752685547, + "rewards/rejected": -14.641294479370117, + "step": 1106 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 0.08523040261229967, + "learning_rate": 4.5928695644132266e-08, + "logits/chosen": -1.5671050548553467, + "logits/rejected": -1.4865435361862183, + "logps/chosen": -39.919410705566406, + "logps/rejected": -78.70149230957031, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.508589506149292, + "rewards/margins": 13.942964553833008, + "rewards/rejected": -17.451553344726562, + "step": 1107 + }, + { + "epoch": 6.565925925925926, + "grad_norm": 0.11606576737889197, + "learning_rate": 4.5554129024368334e-08, + "logits/chosen": -1.3970017433166504, + "logits/rejected": -1.3437788486480713, + "logps/chosen": -45.93317413330078, + "logps/rejected": -86.33053588867188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.457387924194336, + "rewards/margins": 13.490986824035645, + "rewards/rejected": -17.948373794555664, + "step": 1108 + }, + { + "epoch": 6.571851851851852, + "grad_norm": 0.14023652410181175, + "learning_rate": 4.5180942869105594e-08, + "logits/chosen": -1.8171439170837402, + "logits/rejected": -1.9706523418426514, + "logps/chosen": -50.49400329589844, + "logps/rejected": -75.61033630371094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.006141662597656, + "rewards/margins": 11.371904373168945, + "rewards/rejected": -16.3780460357666, + "step": 1109 + }, + { + "epoch": 6.5777777777777775, + "grad_norm": 0.15347892294400567, + "learning_rate": 4.480913969818098e-08, + "logits/chosen": -1.5099637508392334, + "logits/rejected": -1.4124467372894287, + "logps/chosen": -46.44249725341797, + "logps/rejected": -85.35438537597656, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.218708038330078, + "rewards/margins": 13.186939239501953, + "rewards/rejected": -17.40564727783203, + "step": 1110 + }, + { + "epoch": 6.583703703703704, + "grad_norm": 0.13839792487158512, + "learning_rate": 4.4438722022092925e-08, + "logits/chosen": -1.5241343975067139, + "logits/rejected": -1.473794937133789, + "logps/chosen": -47.51785659790039, + "logps/rejected": -78.66014099121094, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.042856216430664, + "rewards/margins": 13.097108840942383, + "rewards/rejected": -17.139965057373047, + "step": 1111 + }, + { + "epoch": 6.58962962962963, + "grad_norm": 0.10521400647939427, + "learning_rate": 4.406969234198507e-08, + "logits/chosen": -1.8187470436096191, + "logits/rejected": -1.762420654296875, + "logps/chosen": -46.16997528076172, + "logps/rejected": -89.59684753417969, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4138479232788086, + "rewards/margins": 14.086780548095703, + "rewards/rejected": -17.500629425048828, + "step": 1112 + }, + { + "epoch": 6.595555555555555, + "grad_norm": 0.19039851900446791, + "learning_rate": 4.370205314962872e-08, + "logits/chosen": -1.5640621185302734, + "logits/rejected": -1.5517284870147705, + "logps/chosen": -49.66551971435547, + "logps/rejected": -68.79605865478516, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7969613075256348, + "rewards/margins": 9.614969253540039, + "rewards/rejected": -13.411931037902832, + "step": 1113 + }, + { + "epoch": 6.601481481481482, + "grad_norm": 0.11915859572952245, + "learning_rate": 4.333580692740643e-08, + "logits/chosen": -1.5478522777557373, + "logits/rejected": -1.3028783798217773, + "logps/chosen": -29.00416374206543, + "logps/rejected": -68.0338134765625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0459089279174805, + "rewards/margins": 11.805730819702148, + "rewards/rejected": -12.851640701293945, + "step": 1114 + }, + { + "epoch": 6.607407407407408, + "grad_norm": 0.11920592216417088, + "learning_rate": 4.2970956148295075e-08, + "logits/chosen": -1.2512449026107788, + "logits/rejected": -1.1777236461639404, + "logps/chosen": -33.25419235229492, + "logps/rejected": -63.37179946899414, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9790241718292236, + "rewards/margins": 10.882867813110352, + "rewards/rejected": -12.861892700195312, + "step": 1115 + }, + { + "epoch": 6.613333333333333, + "grad_norm": 0.13864941881061246, + "learning_rate": 4.260750327584911e-08, + "logits/chosen": -1.652343988418579, + "logits/rejected": -1.5353381633758545, + "logps/chosen": -45.20256042480469, + "logps/rejected": -73.66735076904297, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5702972412109375, + "rewards/margins": 11.877695083618164, + "rewards/rejected": -14.447990417480469, + "step": 1116 + }, + { + "epoch": 6.619259259259259, + "grad_norm": 0.11481299769727026, + "learning_rate": 4.2245450764184095e-08, + "logits/chosen": -1.8509702682495117, + "logits/rejected": -1.8344902992248535, + "logps/chosen": -48.48194122314453, + "logps/rejected": -85.86804962158203, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4326438903808594, + "rewards/margins": 13.968793869018555, + "rewards/rejected": -16.401439666748047, + "step": 1117 + }, + { + "epoch": 6.6251851851851855, + "grad_norm": 0.101719771379674, + "learning_rate": 4.188480105796005e-08, + "logits/chosen": -1.7616229057312012, + "logits/rejected": -1.6706197261810303, + "logps/chosen": -38.84326934814453, + "logps/rejected": -66.91342163085938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4328653812408447, + "rewards/margins": 11.19166374206543, + "rewards/rejected": -13.624527931213379, + "step": 1118 + }, + { + "epoch": 6.631111111111111, + "grad_norm": 0.11980446027636844, + "learning_rate": 4.1525556592364843e-08, + "logits/chosen": -2.0169639587402344, + "logits/rejected": -2.028402805328369, + "logps/chosen": -58.9628791809082, + "logps/rejected": -84.61741638183594, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.251021385192871, + "rewards/margins": 11.501065254211426, + "rewards/rejected": -16.752086639404297, + "step": 1119 + }, + { + "epoch": 6.637037037037037, + "grad_norm": 0.18917276649888526, + "learning_rate": 4.116771979309797e-08, + "logits/chosen": -1.4528882503509521, + "logits/rejected": -1.2737305164337158, + "logps/chosen": -33.34897994995117, + "logps/rejected": -87.13203430175781, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.971200466156006, + "rewards/margins": 14.857851028442383, + "rewards/rejected": -17.829051971435547, + "step": 1120 + }, + { + "epoch": 6.642962962962963, + "grad_norm": 0.10275853238339516, + "learning_rate": 4.081129307635389e-08, + "logits/chosen": -1.6249778270721436, + "logits/rejected": -1.575737476348877, + "logps/chosen": -39.56550598144531, + "logps/rejected": -67.03308868408203, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.602834701538086, + "rewards/margins": 10.713581085205078, + "rewards/rejected": -13.316415786743164, + "step": 1121 + }, + { + "epoch": 6.648888888888889, + "grad_norm": 0.1430084669140178, + "learning_rate": 4.045627884880606e-08, + "logits/chosen": -1.4841854572296143, + "logits/rejected": -1.1887649297714233, + "logps/chosen": -48.28791427612305, + "logps/rejected": -97.2742919921875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.086124897003174, + "rewards/margins": 15.608391761779785, + "rewards/rejected": -18.694517135620117, + "step": 1122 + }, + { + "epoch": 6.654814814814815, + "grad_norm": 0.09521747618211378, + "learning_rate": 4.010267950759025e-08, + "logits/chosen": -1.9114545583724976, + "logits/rejected": -1.8105525970458984, + "logps/chosen": -42.81127166748047, + "logps/rejected": -84.86613464355469, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7864162921905518, + "rewards/margins": 13.881184577941895, + "rewards/rejected": -15.667600631713867, + "step": 1123 + }, + { + "epoch": 6.66074074074074, + "grad_norm": 0.07782578972378952, + "learning_rate": 3.9750497440288935e-08, + "logits/chosen": -1.5309398174285889, + "logits/rejected": -1.6354602575302124, + "logps/chosen": -57.512237548828125, + "logps/rejected": -79.76363372802734, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.54788875579834, + "rewards/margins": 10.909296035766602, + "rewards/rejected": -16.457183837890625, + "step": 1124 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.13416727252612903, + "learning_rate": 3.9399735024914475e-08, + "logits/chosen": -1.5638493299484253, + "logits/rejected": -1.5976860523223877, + "logps/chosen": -40.705963134765625, + "logps/rejected": -60.32302474975586, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.148594379425049, + "rewards/margins": 9.42080307006836, + "rewards/rejected": -12.56939697265625, + "step": 1125 + }, + { + "epoch": 6.672592592592593, + "grad_norm": 0.10613156696033546, + "learning_rate": 3.905039462989365e-08, + "logits/chosen": -1.9716284275054932, + "logits/rejected": -1.9705533981323242, + "logps/chosen": -50.73835754394531, + "logps/rejected": -82.14369201660156, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9389829635620117, + "rewards/margins": 13.061878204345703, + "rewards/rejected": -17.00086212158203, + "step": 1126 + }, + { + "epoch": 6.678518518518518, + "grad_norm": 0.22781953538865385, + "learning_rate": 3.8702478614051345e-08, + "logits/chosen": -1.71630859375, + "logits/rejected": -1.6615371704101562, + "logps/chosen": -33.904579162597656, + "logps/rejected": -61.124977111816406, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4092910587787628, + "rewards/margins": 9.47354793548584, + "rewards/rejected": -9.88283920288086, + "step": 1127 + }, + { + "epoch": 6.684444444444445, + "grad_norm": 0.1070585196057158, + "learning_rate": 3.835598932659476e-08, + "logits/chosen": -1.8027238845825195, + "logits/rejected": -1.7290318012237549, + "logps/chosen": -49.52033996582031, + "logps/rejected": -89.80229949951172, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.248178482055664, + "rewards/margins": 14.71170425415039, + "rewards/rejected": -17.959882736206055, + "step": 1128 + }, + { + "epoch": 6.6903703703703705, + "grad_norm": 0.09133588650231188, + "learning_rate": 3.801092910709749e-08, + "logits/chosen": -1.7302844524383545, + "logits/rejected": -1.5114187002182007, + "logps/chosen": -43.30107498168945, + "logps/rejected": -77.09686279296875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.723008155822754, + "rewards/margins": 10.668754577636719, + "rewards/rejected": -13.391761779785156, + "step": 1129 + }, + { + "epoch": 6.696296296296296, + "grad_norm": 0.09223332224300108, + "learning_rate": 3.766730028548376e-08, + "logits/chosen": -1.7790238857269287, + "logits/rejected": -1.750356912612915, + "logps/chosen": -45.946449279785156, + "logps/rejected": -80.86420440673828, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.758995532989502, + "rewards/margins": 12.887953758239746, + "rewards/rejected": -16.646949768066406, + "step": 1130 + }, + { + "epoch": 6.702222222222222, + "grad_norm": 0.19653263190443007, + "learning_rate": 3.732510518201265e-08, + "logits/chosen": -1.8037636280059814, + "logits/rejected": -1.7017388343811035, + "logps/chosen": -56.00579833984375, + "logps/rejected": -79.54396057128906, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.373067855834961, + "rewards/margins": 11.808576583862305, + "rewards/rejected": -17.181644439697266, + "step": 1131 + }, + { + "epoch": 6.708148148148148, + "grad_norm": 0.1298054144861316, + "learning_rate": 3.698434610726245e-08, + "logits/chosen": -1.7327450513839722, + "logits/rejected": -1.633286952972412, + "logps/chosen": -41.85569381713867, + "logps/rejected": -84.49916076660156, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.524967670440674, + "rewards/margins": 14.326324462890625, + "rewards/rejected": -17.85129165649414, + "step": 1132 + }, + { + "epoch": 6.714074074074074, + "grad_norm": 0.09808405051715205, + "learning_rate": 3.6645025362115e-08, + "logits/chosen": -1.9530103206634521, + "logits/rejected": -1.9716355800628662, + "logps/chosen": -51.91473388671875, + "logps/rejected": -81.49636840820312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.951500415802002, + "rewards/margins": 12.983062744140625, + "rewards/rejected": -15.934563636779785, + "step": 1133 + }, + { + "epoch": 6.72, + "grad_norm": 0.08128562051403439, + "learning_rate": 3.630714523774042e-08, + "logits/chosen": -1.4830005168914795, + "logits/rejected": -1.3773740530014038, + "logps/chosen": -46.95196533203125, + "logps/rejected": -93.6692886352539, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.364588737487793, + "rewards/margins": 13.5150785446167, + "rewards/rejected": -17.879667282104492, + "step": 1134 + }, + { + "epoch": 6.725925925925926, + "grad_norm": 0.10695265097038231, + "learning_rate": 3.597070801558122e-08, + "logits/chosen": -1.9932217597961426, + "logits/rejected": -1.7008832693099976, + "logps/chosen": -39.68993377685547, + "logps/rejected": -88.78868103027344, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.271031379699707, + "rewards/margins": 15.981220245361328, + "rewards/rejected": -19.25225257873535, + "step": 1135 + }, + { + "epoch": 6.731851851851852, + "grad_norm": 0.1242820848290576, + "learning_rate": 3.563571596733722e-08, + "logits/chosen": -1.7718029022216797, + "logits/rejected": -1.7178691625595093, + "logps/chosen": -47.097042083740234, + "logps/rejected": -83.14295196533203, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.281190395355225, + "rewards/margins": 11.175911903381348, + "rewards/rejected": -15.45710277557373, + "step": 1136 + }, + { + "epoch": 6.737777777777778, + "grad_norm": 0.09366512262058838, + "learning_rate": 3.530217135495006e-08, + "logits/chosen": -2.007751703262329, + "logits/rejected": -1.9636316299438477, + "logps/chosen": -38.424827575683594, + "logps/rejected": -86.53182983398438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3459277153015137, + "rewards/margins": 14.052453994750977, + "rewards/rejected": -17.398380279541016, + "step": 1137 + }, + { + "epoch": 6.743703703703703, + "grad_norm": 0.16347283237886825, + "learning_rate": 3.4970076430588027e-08, + "logits/chosen": -2.208000659942627, + "logits/rejected": -2.010059118270874, + "logps/chosen": -32.67522430419922, + "logps/rejected": -93.86961364746094, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.409874200820923, + "rewards/margins": 14.899282455444336, + "rewards/rejected": -17.30915641784668, + "step": 1138 + }, + { + "epoch": 6.74962962962963, + "grad_norm": 0.06627283701796888, + "learning_rate": 3.463943343663065e-08, + "logits/chosen": -1.7873049974441528, + "logits/rejected": -1.7198983430862427, + "logps/chosen": -48.43455505371094, + "logps/rejected": -89.15277862548828, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.063439846038818, + "rewards/margins": 12.770740509033203, + "rewards/rejected": -16.834178924560547, + "step": 1139 + }, + { + "epoch": 6.7555555555555555, + "grad_norm": 0.12790584904297592, + "learning_rate": 3.4310244605653795e-08, + "logits/chosen": -1.747499942779541, + "logits/rejected": -1.7736481428146362, + "logps/chosen": -59.638973236083984, + "logps/rejected": -84.97341918945312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.622666835784912, + "rewards/margins": 12.510490417480469, + "rewards/rejected": -17.13315773010254, + "step": 1140 + }, + { + "epoch": 6.761481481481481, + "grad_norm": 0.054293900686124816, + "learning_rate": 3.3982512160414505e-08, + "logits/chosen": -1.5526971817016602, + "logits/rejected": -1.3642184734344482, + "logps/chosen": -51.45740509033203, + "logps/rejected": -93.18586730957031, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.02336311340332, + "rewards/margins": 13.009660720825195, + "rewards/rejected": -17.033023834228516, + "step": 1141 + }, + { + "epoch": 6.767407407407408, + "grad_norm": 0.11040010049183906, + "learning_rate": 3.365623831383599e-08, + "logits/chosen": -1.9688798189163208, + "logits/rejected": -1.9011162519454956, + "logps/chosen": -44.33605194091797, + "logps/rejected": -81.40846252441406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4609012603759766, + "rewards/margins": 13.169167518615723, + "rewards/rejected": -16.630069732666016, + "step": 1142 + }, + { + "epoch": 6.773333333333333, + "grad_norm": 0.17505663368553703, + "learning_rate": 3.3331425268992547e-08, + "logits/chosen": -1.7000912427902222, + "logits/rejected": -1.7651917934417725, + "logps/chosen": -37.774322509765625, + "logps/rejected": -68.81632995605469, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9970366954803467, + "rewards/margins": 11.976476669311523, + "rewards/rejected": -14.973514556884766, + "step": 1143 + }, + { + "epoch": 6.779259259259259, + "grad_norm": 0.16025890402331744, + "learning_rate": 3.3008075219095045e-08, + "logits/chosen": -2.3175556659698486, + "logits/rejected": -2.2146213054656982, + "logps/chosen": -54.18362808227539, + "logps/rejected": -82.90629577636719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.281198263168335, + "rewards/margins": 11.708627700805664, + "rewards/rejected": -13.989827156066895, + "step": 1144 + }, + { + "epoch": 6.785185185185185, + "grad_norm": 0.09533316032443495, + "learning_rate": 3.268619034747566e-08, + "logits/chosen": -1.9728548526763916, + "logits/rejected": -1.731310248374939, + "logps/chosen": -43.94758224487305, + "logps/rejected": -86.31636047363281, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.777540683746338, + "rewards/margins": 13.461470603942871, + "rewards/rejected": -18.239011764526367, + "step": 1145 + }, + { + "epoch": 6.791111111111111, + "grad_norm": 0.20000749435302137, + "learning_rate": 3.236577282757347e-08, + "logits/chosen": -1.510556936264038, + "logits/rejected": -1.6342051029205322, + "logps/chosen": -50.926918029785156, + "logps/rejected": -73.72340393066406, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.107477188110352, + "rewards/margins": 12.7825927734375, + "rewards/rejected": -16.89006996154785, + "step": 1146 + }, + { + "epoch": 6.797037037037037, + "grad_norm": 0.13788508775382483, + "learning_rate": 3.204682482291959e-08, + "logits/chosen": -1.755540370941162, + "logits/rejected": -1.726999044418335, + "logps/chosen": -41.485496520996094, + "logps/rejected": -68.89039611816406, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.579153060913086, + "rewards/margins": 10.574809074401855, + "rewards/rejected": -13.153963088989258, + "step": 1147 + }, + { + "epoch": 6.802962962962963, + "grad_norm": 0.08315953883685909, + "learning_rate": 3.172934848712272e-08, + "logits/chosen": -2.2260100841522217, + "logits/rejected": -1.9643924236297607, + "logps/chosen": -36.83528137207031, + "logps/rejected": -80.32170104980469, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6309359073638916, + "rewards/margins": 12.441807746887207, + "rewards/rejected": -15.072744369506836, + "step": 1148 + }, + { + "epoch": 6.808888888888889, + "grad_norm": 0.1377713974555495, + "learning_rate": 3.141334596385447e-08, + "logits/chosen": -2.1011102199554443, + "logits/rejected": -1.9738458395004272, + "logps/chosen": -42.32820129394531, + "logps/rejected": -70.49113464355469, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5573666095733643, + "rewards/margins": 11.54711627960205, + "rewards/rejected": -14.104482650756836, + "step": 1149 + }, + { + "epoch": 6.814814814814815, + "grad_norm": 0.09606738124748687, + "learning_rate": 3.109881938683492e-08, + "logits/chosen": -1.3309812545776367, + "logits/rejected": -1.147838830947876, + "logps/chosen": -32.87968444824219, + "logps/rejected": -77.53007507324219, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.162910223007202, + "rewards/margins": 12.752610206604004, + "rewards/rejected": -14.915519714355469, + "step": 1150 + }, + { + "epoch": 6.8207407407407405, + "grad_norm": 0.10629644897987281, + "learning_rate": 3.078577087981832e-08, + "logits/chosen": -1.8460427522659302, + "logits/rejected": -1.6246216297149658, + "logps/chosen": -46.028106689453125, + "logps/rejected": -93.64574432373047, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4755055904388428, + "rewards/margins": 14.272528648376465, + "rewards/rejected": -17.748035430908203, + "step": 1151 + }, + { + "epoch": 6.826666666666666, + "grad_norm": 0.138240279213544, + "learning_rate": 3.047420255657851e-08, + "logits/chosen": -1.4086066484451294, + "logits/rejected": -1.3227324485778809, + "logps/chosen": -47.98810577392578, + "logps/rejected": -78.81871032714844, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2726149559021, + "rewards/margins": 12.465730667114258, + "rewards/rejected": -16.738346099853516, + "step": 1152 + }, + { + "epoch": 6.832592592592593, + "grad_norm": 0.06213028998609326, + "learning_rate": 3.016411652089493e-08, + "logits/chosen": -1.773051381111145, + "logits/rejected": -1.5124402046203613, + "logps/chosen": -38.5433235168457, + "logps/rejected": -79.8031234741211, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6178457736968994, + "rewards/margins": 13.169754028320312, + "rewards/rejected": -15.787599563598633, + "step": 1153 + }, + { + "epoch": 6.838518518518518, + "grad_norm": 0.09071262739179312, + "learning_rate": 2.985551486653823e-08, + "logits/chosen": -1.7397191524505615, + "logits/rejected": -1.4591525793075562, + "logps/chosen": -43.423797607421875, + "logps/rejected": -88.40834045410156, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5840625762939453, + "rewards/margins": 14.329594612121582, + "rewards/rejected": -16.913658142089844, + "step": 1154 + }, + { + "epoch": 6.844444444444444, + "grad_norm": 0.1022541987842882, + "learning_rate": 2.954839967725617e-08, + "logits/chosen": -1.8566625118255615, + "logits/rejected": -1.992887258529663, + "logps/chosen": -53.398765563964844, + "logps/rejected": -72.7064208984375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7381906509399414, + "rewards/margins": 12.036535263061523, + "rewards/rejected": -15.774726867675781, + "step": 1155 + }, + { + "epoch": 6.850370370370371, + "grad_norm": 0.12377028777884179, + "learning_rate": 2.924277302675962e-08, + "logits/chosen": -1.9297981262207031, + "logits/rejected": -1.9528807401657104, + "logps/chosen": -46.666847229003906, + "logps/rejected": -66.36315155029297, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1120994091033936, + "rewards/margins": 10.05501937866211, + "rewards/rejected": -13.167118072509766, + "step": 1156 + }, + { + "epoch": 6.856296296296296, + "grad_norm": 0.14259671866915122, + "learning_rate": 2.893863697870841e-08, + "logits/chosen": -1.6172630786895752, + "logits/rejected": -1.6391055583953857, + "logps/chosen": -49.92350769042969, + "logps/rejected": -71.48528289794922, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.502070426940918, + "rewards/margins": 11.089262008666992, + "rewards/rejected": -15.591333389282227, + "step": 1157 + }, + { + "epoch": 6.862222222222222, + "grad_norm": 0.10511089653127045, + "learning_rate": 2.863599358669755e-08, + "logits/chosen": -1.3771047592163086, + "logits/rejected": -1.4124231338500977, + "logps/chosen": -46.10894012451172, + "logps/rejected": -82.56547546386719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.284719467163086, + "rewards/margins": 13.74131965637207, + "rewards/rejected": -16.026039123535156, + "step": 1158 + }, + { + "epoch": 6.868148148148148, + "grad_norm": 0.1426796176134692, + "learning_rate": 2.8334844894243287e-08, + "logits/chosen": -2.127091646194458, + "logits/rejected": -2.0703983306884766, + "logps/chosen": -73.11487579345703, + "logps/rejected": -92.82722473144531, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.963024139404297, + "rewards/margins": 11.033499717712402, + "rewards/rejected": -16.996522903442383, + "step": 1159 + }, + { + "epoch": 6.874074074074074, + "grad_norm": 0.10377975946996132, + "learning_rate": 2.803519293476936e-08, + "logits/chosen": -1.9476478099822998, + "logits/rejected": -2.002642869949341, + "logps/chosen": -51.1083984375, + "logps/rejected": -82.77229309082031, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.5188798904418945, + "rewards/margins": 14.644804000854492, + "rewards/rejected": -19.16368293762207, + "step": 1160 + }, + { + "epoch": 6.88, + "grad_norm": 0.14750986834387664, + "learning_rate": 2.7737039731593138e-08, + "logits/chosen": -1.6242280006408691, + "logits/rejected": -1.6870909929275513, + "logps/chosen": -59.50197982788086, + "logps/rejected": -80.40160369873047, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.18382453918457, + "rewards/margins": 10.354097366333008, + "rewards/rejected": -14.537919998168945, + "step": 1161 + }, + { + "epoch": 6.885925925925926, + "grad_norm": 0.15189056943975968, + "learning_rate": 2.7440387297912122e-08, + "logits/chosen": -1.9823570251464844, + "logits/rejected": -2.019186019897461, + "logps/chosen": -40.22568130493164, + "logps/rejected": -73.96324920654297, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.187943935394287, + "rewards/margins": 12.574700355529785, + "rewards/rejected": -14.762643814086914, + "step": 1162 + }, + { + "epoch": 6.891851851851852, + "grad_norm": 0.06879956370628074, + "learning_rate": 2.7145237636790276e-08, + "logits/chosen": -1.6803816556930542, + "logits/rejected": -1.5852973461151123, + "logps/chosen": -54.0800895690918, + "logps/rejected": -79.88774871826172, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.2677531242370605, + "rewards/margins": 10.459793090820312, + "rewards/rejected": -15.727546691894531, + "step": 1163 + }, + { + "epoch": 6.897777777777778, + "grad_norm": 0.0806033063593912, + "learning_rate": 2.685159274114443e-08, + "logits/chosen": -1.5133965015411377, + "logits/rejected": -1.4351083040237427, + "logps/chosen": -46.148075103759766, + "logps/rejected": -72.73712921142578, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.488842010498047, + "rewards/margins": 10.604263305664062, + "rewards/rejected": -15.09310531616211, + "step": 1164 + }, + { + "epoch": 6.9037037037037035, + "grad_norm": 0.08411857989859493, + "learning_rate": 2.6559454593731072e-08, + "logits/chosen": -1.4596953392028809, + "logits/rejected": -1.104498028755188, + "logps/chosen": -47.473480224609375, + "logps/rejected": -97.63139343261719, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.91493558883667, + "rewards/margins": 16.23548126220703, + "rewards/rejected": -21.150415420532227, + "step": 1165 + }, + { + "epoch": 6.90962962962963, + "grad_norm": 0.07035183659263981, + "learning_rate": 2.6268825167132636e-08, + "logits/chosen": -1.459460735321045, + "logits/rejected": -1.4765464067459106, + "logps/chosen": -45.411216735839844, + "logps/rejected": -77.99122619628906, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.320858955383301, + "rewards/margins": 11.635019302368164, + "rewards/rejected": -14.955877304077148, + "step": 1166 + }, + { + "epoch": 6.915555555555556, + "grad_norm": 0.15470041672532103, + "learning_rate": 2.5979706423744392e-08, + "logits/chosen": -1.671322226524353, + "logits/rejected": -1.7293105125427246, + "logps/chosen": -45.03539276123047, + "logps/rejected": -71.42250061035156, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.933180809020996, + "rewards/margins": 11.5748929977417, + "rewards/rejected": -15.508073806762695, + "step": 1167 + }, + { + "epoch": 6.921481481481481, + "grad_norm": 0.1191864531412148, + "learning_rate": 2.5692100315761023e-08, + "logits/chosen": -2.120941162109375, + "logits/rejected": -2.0219168663024902, + "logps/chosen": -71.7652587890625, + "logps/rejected": -108.49716186523438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.246131896972656, + "rewards/margins": 12.393702507019043, + "rewards/rejected": -18.639835357666016, + "step": 1168 + }, + { + "epoch": 6.927407407407408, + "grad_norm": 0.1348876028904682, + "learning_rate": 2.5406008785163717e-08, + "logits/chosen": -1.3151806592941284, + "logits/rejected": -1.417667269706726, + "logps/chosen": -53.786460876464844, + "logps/rejected": -87.53273010253906, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.237397193908691, + "rewards/margins": 12.491253852844238, + "rewards/rejected": -16.728652954101562, + "step": 1169 + }, + { + "epoch": 6.933333333333334, + "grad_norm": 0.07570071684837183, + "learning_rate": 2.512143376370682e-08, + "logits/chosen": -1.1454825401306152, + "logits/rejected": -1.2216933965682983, + "logps/chosen": -37.72205352783203, + "logps/rejected": -65.66807556152344, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8132212162017822, + "rewards/margins": 11.190628051757812, + "rewards/rejected": -13.003849029541016, + "step": 1170 + }, + { + "epoch": 6.939259259259259, + "grad_norm": 0.2254278351780427, + "learning_rate": 2.4838377172904907e-08, + "logits/chosen": -1.7721288204193115, + "logits/rejected": -1.6282906532287598, + "logps/chosen": -56.43745803833008, + "logps/rejected": -84.1759033203125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.852671146392822, + "rewards/margins": 11.335627555847168, + "rewards/rejected": -16.18829917907715, + "step": 1171 + }, + { + "epoch": 6.945185185185185, + "grad_norm": 0.048586698362349696, + "learning_rate": 2.455684092401969e-08, + "logits/chosen": -1.8530497550964355, + "logits/rejected": -1.7356008291244507, + "logps/chosen": -31.423507690429688, + "logps/rejected": -75.07907104492188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9417579174041748, + "rewards/margins": 14.43879222869873, + "rewards/rejected": -16.38054847717285, + "step": 1172 + }, + { + "epoch": 6.9511111111111115, + "grad_norm": 0.08382029726208863, + "learning_rate": 2.4276826918047277e-08, + "logits/chosen": -1.500767469406128, + "logits/rejected": -1.3265349864959717, + "logps/chosen": -54.78949737548828, + "logps/rejected": -90.59709167480469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.294247150421143, + "rewards/margins": 13.05902099609375, + "rewards/rejected": -17.353267669677734, + "step": 1173 + }, + { + "epoch": 6.957037037037037, + "grad_norm": 0.1279084717839541, + "learning_rate": 2.399833704570517e-08, + "logits/chosen": -1.5946778059005737, + "logits/rejected": -1.6028144359588623, + "logps/chosen": -35.326480865478516, + "logps/rejected": -65.81742095947266, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.336696982383728, + "rewards/margins": 11.668586730957031, + "rewards/rejected": -13.00528335571289, + "step": 1174 + }, + { + "epoch": 6.962962962962963, + "grad_norm": 0.16211559794676547, + "learning_rate": 2.372137318741968e-08, + "logits/chosen": -2.113690137863159, + "logits/rejected": -1.9204535484313965, + "logps/chosen": -61.13996505737305, + "logps/rejected": -97.43546295166016, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8684582710266113, + "rewards/margins": 13.243110656738281, + "rewards/rejected": -17.111570358276367, + "step": 1175 + }, + { + "epoch": 6.968888888888889, + "grad_norm": 0.19733756655662368, + "learning_rate": 2.3445937213313062e-08, + "logits/chosen": -1.8909013271331787, + "logits/rejected": -1.9597067832946777, + "logps/chosen": -68.76529693603516, + "logps/rejected": -95.87532043457031, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.259731292724609, + "rewards/margins": 13.89039421081543, + "rewards/rejected": -20.150123596191406, + "step": 1176 + }, + { + "epoch": 6.974814814814815, + "grad_norm": 0.12202352629812559, + "learning_rate": 2.3172030983190926e-08, + "logits/chosen": -1.5236730575561523, + "logits/rejected": -1.5534064769744873, + "logps/chosen": -35.21092224121094, + "logps/rejected": -61.23134994506836, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4225057363510132, + "rewards/margins": 10.738033294677734, + "rewards/rejected": -12.160539627075195, + "step": 1177 + }, + { + "epoch": 6.980740740740741, + "grad_norm": 0.16135077452071456, + "learning_rate": 2.2899656346529768e-08, + "logits/chosen": -1.932398796081543, + "logits/rejected": -2.1537177562713623, + "logps/chosen": -48.79818344116211, + "logps/rejected": -61.05226516723633, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.045696258544922, + "rewards/margins": 9.089506149291992, + "rewards/rejected": -14.135202407836914, + "step": 1178 + }, + { + "epoch": 6.986666666666666, + "grad_norm": 0.07643197243759331, + "learning_rate": 2.2628815142464342e-08, + "logits/chosen": -1.6719024181365967, + "logits/rejected": -1.4917726516723633, + "logps/chosen": -45.62458419799805, + "logps/rejected": -90.16924285888672, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.244566917419434, + "rewards/margins": 13.924651145935059, + "rewards/rejected": -18.169218063354492, + "step": 1179 + }, + { + "epoch": 6.992592592592593, + "grad_norm": 0.126597010412571, + "learning_rate": 2.2359509199775446e-08, + "logits/chosen": -1.7478291988372803, + "logits/rejected": -1.7482573986053467, + "logps/chosen": -52.008060455322266, + "logps/rejected": -87.12303161621094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8778417110443115, + "rewards/margins": 13.71127700805664, + "rewards/rejected": -16.58911895751953, + "step": 1180 + }, + { + "epoch": 6.998518518518519, + "grad_norm": 0.12244130358438209, + "learning_rate": 2.2091740336877358e-08, + "logits/chosen": -1.7030671834945679, + "logits/rejected": -1.458512783050537, + "logps/chosen": -53.036376953125, + "logps/rejected": -104.52532958984375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.178653717041016, + "rewards/margins": 13.309745788574219, + "rewards/rejected": -18.488399505615234, + "step": 1181 + }, + { + "epoch": 7.004444444444444, + "grad_norm": 0.10438348279258286, + "learning_rate": 2.1825510361805576e-08, + "logits/chosen": -1.7498127222061157, + "logits/rejected": -1.8160085678100586, + "logps/chosen": -35.32305145263672, + "logps/rejected": -67.66831970214844, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5238306522369385, + "rewards/margins": 12.23428726196289, + "rewards/rejected": -13.75811767578125, + "step": 1182 + }, + { + "epoch": 7.010370370370371, + "grad_norm": 0.10306756779932433, + "learning_rate": 2.156082107220486e-08, + "logits/chosen": -1.6162192821502686, + "logits/rejected": -1.6883609294891357, + "logps/chosen": -39.13275909423828, + "logps/rejected": -70.13416290283203, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.180786371231079, + "rewards/margins": 11.219168663024902, + "rewards/rejected": -14.399954795837402, + "step": 1183 + }, + { + "epoch": 7.0162962962962965, + "grad_norm": 0.15017006309609432, + "learning_rate": 2.129767425531673e-08, + "logits/chosen": -2.2658438682556152, + "logits/rejected": -2.1510543823242188, + "logps/chosen": -53.666160583496094, + "logps/rejected": -82.80941772460938, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.077384948730469, + "rewards/margins": 12.220909118652344, + "rewards/rejected": -16.298294067382812, + "step": 1184 + }, + { + "epoch": 7.022222222222222, + "grad_norm": 0.11969844631684366, + "learning_rate": 2.1036071687967783e-08, + "logits/chosen": -1.3385977745056152, + "logits/rejected": -1.4503644704818726, + "logps/chosen": -63.32877731323242, + "logps/rejected": -79.03787994384766, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -7.450685501098633, + "rewards/margins": 10.056939125061035, + "rewards/rejected": -17.50762367248535, + "step": 1185 + }, + { + "epoch": 7.028148148148148, + "grad_norm": 0.09289746206047685, + "learning_rate": 2.077601513655733e-08, + "logits/chosen": -1.334474802017212, + "logits/rejected": -1.4268428087234497, + "logps/chosen": -42.259456634521484, + "logps/rejected": -63.025047302246094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.244335174560547, + "rewards/margins": 10.316512107849121, + "rewards/rejected": -13.560847282409668, + "step": 1186 + }, + { + "epoch": 7.034074074074074, + "grad_norm": 0.0993388595523258, + "learning_rate": 2.0517506357045715e-08, + "logits/chosen": -1.8795228004455566, + "logits/rejected": -1.7030866146087646, + "logps/chosen": -50.661319732666016, + "logps/rejected": -95.91410827636719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.647890090942383, + "rewards/margins": 13.566815376281738, + "rewards/rejected": -18.214706420898438, + "step": 1187 + }, + { + "epoch": 7.04, + "grad_norm": 0.13225732135263005, + "learning_rate": 2.0260547094942348e-08, + "logits/chosen": -1.479757308959961, + "logits/rejected": -1.536267638206482, + "logps/chosen": -43.72503662109375, + "logps/rejected": -73.44718933105469, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7695412635803223, + "rewards/margins": 12.269949913024902, + "rewards/rejected": -16.03948974609375, + "step": 1188 + }, + { + "epoch": 7.045925925925926, + "grad_norm": 0.15398598189046675, + "learning_rate": 2.0005139085293942e-08, + "logits/chosen": -1.760892391204834, + "logits/rejected": -1.7211508750915527, + "logps/chosen": -59.702056884765625, + "logps/rejected": -85.88159942626953, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.470168590545654, + "rewards/margins": 11.876346588134766, + "rewards/rejected": -17.346511840820312, + "step": 1189 + }, + { + "epoch": 7.051851851851852, + "grad_norm": 0.1244116390199716, + "learning_rate": 1.9751284052672873e-08, + "logits/chosen": -1.8358004093170166, + "logits/rejected": -1.6322147846221924, + "logps/chosen": -49.1774787902832, + "logps/rejected": -78.3975830078125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.361578941345215, + "rewards/margins": 11.528911590576172, + "rewards/rejected": -15.890491485595703, + "step": 1190 + }, + { + "epoch": 7.057777777777778, + "grad_norm": 0.11892932478519877, + "learning_rate": 1.9498983711165345e-08, + "logits/chosen": -1.858366847038269, + "logits/rejected": -1.6905018091201782, + "logps/chosen": -42.442138671875, + "logps/rejected": -95.295166015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.855883836746216, + "rewards/margins": 16.18788719177246, + "rewards/rejected": -20.043773651123047, + "step": 1191 + }, + { + "epoch": 7.063703703703704, + "grad_norm": 0.1370620549733962, + "learning_rate": 1.9248239764360048e-08, + "logits/chosen": -1.8696699142456055, + "logits/rejected": -2.089273691177368, + "logps/chosen": -54.89391326904297, + "logps/rejected": -64.32718658447266, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9205586910247803, + "rewards/margins": 9.19931411743164, + "rewards/rejected": -11.119873046875, + "step": 1192 + }, + { + "epoch": 7.069629629629629, + "grad_norm": 0.1198645761373536, + "learning_rate": 1.899905390533649e-08, + "logits/chosen": -2.04950213432312, + "logits/rejected": -1.9079272747039795, + "logps/chosen": -45.07698059082031, + "logps/rejected": -74.14395904541016, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3388748168945312, + "rewards/margins": 11.454214096069336, + "rewards/rejected": -13.793089866638184, + "step": 1193 + }, + { + "epoch": 7.075555555555556, + "grad_norm": 0.10462021921279856, + "learning_rate": 1.8751427816653618e-08, + "logits/chosen": -1.4157465696334839, + "logits/rejected": -1.3236104249954224, + "logps/chosen": -38.29302215576172, + "logps/rejected": -66.2728271484375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.617685317993164, + "rewards/margins": 11.805870056152344, + "rewards/rejected": -15.423555374145508, + "step": 1194 + }, + { + "epoch": 7.0814814814814815, + "grad_norm": 0.10647414220192886, + "learning_rate": 1.8505363170338517e-08, + "logits/chosen": -1.7032594680786133, + "logits/rejected": -1.5659446716308594, + "logps/chosen": -52.43858337402344, + "logps/rejected": -85.01385498046875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4055228233337402, + "rewards/margins": 12.859024047851562, + "rewards/rejected": -16.264545440673828, + "step": 1195 + }, + { + "epoch": 7.087407407407407, + "grad_norm": 0.1217673476066229, + "learning_rate": 1.826086162787499e-08, + "logits/chosen": -1.2698402404785156, + "logits/rejected": -1.432144045829773, + "logps/chosen": -45.8774528503418, + "logps/rejected": -66.63996887207031, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.07426118850708, + "rewards/margins": 9.285086631774902, + "rewards/rejected": -12.35934829711914, + "step": 1196 + }, + { + "epoch": 7.093333333333334, + "grad_norm": 0.130593236991155, + "learning_rate": 1.8017924840192433e-08, + "logits/chosen": -1.5936301946640015, + "logits/rejected": -1.5533777475357056, + "logps/chosen": -41.87461853027344, + "logps/rejected": -65.9970932006836, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.350405216217041, + "rewards/margins": 11.087535858154297, + "rewards/rejected": -13.437941551208496, + "step": 1197 + }, + { + "epoch": 7.099259259259259, + "grad_norm": 0.08742307775397884, + "learning_rate": 1.7776554447654717e-08, + "logits/chosen": -1.670688271522522, + "logits/rejected": -1.7398487329483032, + "logps/chosen": -49.48249816894531, + "logps/rejected": -74.95970153808594, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.721832275390625, + "rewards/margins": 11.39289665222168, + "rewards/rejected": -16.114727020263672, + "step": 1198 + }, + { + "epoch": 7.105185185185185, + "grad_norm": 0.22187158357419212, + "learning_rate": 1.7536752080048955e-08, + "logits/chosen": -1.444218635559082, + "logits/rejected": -1.2818149328231812, + "logps/chosen": -54.006370544433594, + "logps/rejected": -89.37923431396484, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.9230194091796875, + "rewards/margins": 12.409586906433105, + "rewards/rejected": -17.332605361938477, + "step": 1199 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.11655345767008983, + "learning_rate": 1.7298519356574726e-08, + "logits/chosen": -1.6009925603866577, + "logits/rejected": -1.6573829650878906, + "logps/chosen": -44.316978454589844, + "logps/rejected": -74.49774169921875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8197593688964844, + "rewards/margins": 10.937904357910156, + "rewards/rejected": -14.75766372680664, + "step": 1200 + }, + { + "epoch": 7.117037037037037, + "grad_norm": 0.1087475095181626, + "learning_rate": 1.706185788583289e-08, + "logits/chosen": -1.8507537841796875, + "logits/rejected": -1.7265064716339111, + "logps/chosen": -45.44957733154297, + "logps/rejected": -79.27318572998047, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.685542583465576, + "rewards/margins": 13.47083854675293, + "rewards/rejected": -16.156381607055664, + "step": 1201 + }, + { + "epoch": 7.122962962962963, + "grad_norm": 0.1065517059466155, + "learning_rate": 1.6826769265815e-08, + "logits/chosen": -1.3087055683135986, + "logits/rejected": -1.2989039421081543, + "logps/chosen": -41.19493103027344, + "logps/rejected": -81.62377166748047, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.281460762023926, + "rewards/margins": 13.077204704284668, + "rewards/rejected": -16.358665466308594, + "step": 1202 + }, + { + "epoch": 7.128888888888889, + "grad_norm": 0.11318510157348385, + "learning_rate": 1.6593255083892228e-08, + "logits/chosen": -1.4445735216140747, + "logits/rejected": -1.2522766590118408, + "logps/chosen": -47.712623596191406, + "logps/rejected": -87.87376403808594, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7921104431152344, + "rewards/margins": 11.766212463378906, + "rewards/rejected": -15.55832290649414, + "step": 1203 + }, + { + "epoch": 7.134814814814815, + "grad_norm": 0.14086531041944936, + "learning_rate": 1.6361316916804896e-08, + "logits/chosen": -2.008540153503418, + "logits/rejected": -2.004145622253418, + "logps/chosen": -43.6939582824707, + "logps/rejected": -76.02693176269531, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.715010166168213, + "rewards/margins": 11.305937767028809, + "rewards/rejected": -14.020947456359863, + "step": 1204 + }, + { + "epoch": 7.140740740740741, + "grad_norm": 0.07673022638161378, + "learning_rate": 1.6130956330651646e-08, + "logits/chosen": -1.7630785703659058, + "logits/rejected": -1.8952760696411133, + "logps/chosen": -35.75289535522461, + "logps/rejected": -61.844886779785156, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7321906089782715, + "rewards/margins": 11.385841369628906, + "rewards/rejected": -14.118032455444336, + "step": 1205 + }, + { + "epoch": 7.1466666666666665, + "grad_norm": 0.12215685713176862, + "learning_rate": 1.5902174880878916e-08, + "logits/chosen": -1.8901073932647705, + "logits/rejected": -1.8296229839324951, + "logps/chosen": -38.18608856201172, + "logps/rejected": -79.75143432617188, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1476891040802, + "rewards/margins": 14.448156356811523, + "rewards/rejected": -16.595844268798828, + "step": 1206 + }, + { + "epoch": 7.152592592592592, + "grad_norm": 0.12354753151439266, + "learning_rate": 1.567497411227059e-08, + "logits/chosen": -2.1340646743774414, + "logits/rejected": -2.106337785720825, + "logps/chosen": -56.164154052734375, + "logps/rejected": -88.27549743652344, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.686497211456299, + "rewards/margins": 13.833208084106445, + "rewards/rejected": -19.51970672607422, + "step": 1207 + }, + { + "epoch": 7.158518518518519, + "grad_norm": 0.15837268371755486, + "learning_rate": 1.5449355558937337e-08, + "logits/chosen": -2.158918857574463, + "logits/rejected": -2.007082939147949, + "logps/chosen": -50.713157653808594, + "logps/rejected": -83.93782043457031, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5467233657836914, + "rewards/margins": 13.906265258789062, + "rewards/rejected": -16.452987670898438, + "step": 1208 + }, + { + "epoch": 7.164444444444444, + "grad_norm": 0.09623401985337293, + "learning_rate": 1.5225320744306408e-08, + "logits/chosen": -1.6897797584533691, + "logits/rejected": -1.607255220413208, + "logps/chosen": -40.003414154052734, + "logps/rejected": -81.5127182006836, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.148927688598633, + "rewards/margins": 12.95841121673584, + "rewards/rejected": -16.107337951660156, + "step": 1209 + }, + { + "epoch": 7.17037037037037, + "grad_norm": 0.1268425724222277, + "learning_rate": 1.5002871181111153e-08, + "logits/chosen": -1.3181800842285156, + "logits/rejected": -1.3308837413787842, + "logps/chosen": -44.879539489746094, + "logps/rejected": -76.36341857910156, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.692798614501953, + "rewards/margins": 12.881628036499023, + "rewards/rejected": -16.574426651000977, + "step": 1210 + }, + { + "epoch": 7.176296296296297, + "grad_norm": 0.14576067610000562, + "learning_rate": 1.4782008371381105e-08, + "logits/chosen": -1.5605270862579346, + "logits/rejected": -1.4438588619232178, + "logps/chosen": -46.77557373046875, + "logps/rejected": -87.78450012207031, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.457500457763672, + "rewards/margins": 12.943517684936523, + "rewards/rejected": -17.401016235351562, + "step": 1211 + }, + { + "epoch": 7.182222222222222, + "grad_norm": 0.0901720784546393, + "learning_rate": 1.4562733806431666e-08, + "logits/chosen": -2.1737942695617676, + "logits/rejected": -2.242323875427246, + "logps/chosen": -37.04216766357422, + "logps/rejected": -68.79451751708984, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2785842418670654, + "rewards/margins": 11.042994499206543, + "rewards/rejected": -12.321578025817871, + "step": 1212 + }, + { + "epoch": 7.188148148148148, + "grad_norm": 0.15482314701909852, + "learning_rate": 1.434504896685393e-08, + "logits/chosen": -1.648055076599121, + "logits/rejected": -1.6363328695297241, + "logps/chosen": -45.66598892211914, + "logps/rejected": -70.53289794921875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6662845611572266, + "rewards/margins": 12.108028411865234, + "rewards/rejected": -14.774312973022461, + "step": 1213 + }, + { + "epoch": 7.194074074074074, + "grad_norm": 0.11655454571606337, + "learning_rate": 1.4128955322504965e-08, + "logits/chosen": -1.491405725479126, + "logits/rejected": -1.4429785013198853, + "logps/chosen": -51.250797271728516, + "logps/rejected": -85.06803894042969, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8182194232940674, + "rewards/margins": 11.68270492553711, + "rewards/rejected": -15.500925064086914, + "step": 1214 + }, + { + "epoch": 7.2, + "grad_norm": 0.1574759664564603, + "learning_rate": 1.3914454332497604e-08, + "logits/chosen": -1.9412479400634766, + "logits/rejected": -1.8919508457183838, + "logps/chosen": -39.027462005615234, + "logps/rejected": -71.54948425292969, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6357011795043945, + "rewards/margins": 13.384864807128906, + "rewards/rejected": -16.020565032958984, + "step": 1215 + }, + { + "epoch": 7.205925925925926, + "grad_norm": 0.10315524246712969, + "learning_rate": 1.3701547445190836e-08, + "logits/chosen": -1.7386341094970703, + "logits/rejected": -1.432328701019287, + "logps/chosen": -57.30453109741211, + "logps/rejected": -104.63716125488281, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.707429885864258, + "rewards/margins": 13.92609977722168, + "rewards/rejected": -18.633529663085938, + "step": 1216 + }, + { + "epoch": 7.2118518518518515, + "grad_norm": 0.11981906949643559, + "learning_rate": 1.3490236098179813e-08, + "logits/chosen": -1.6856032609939575, + "logits/rejected": -1.7121961116790771, + "logps/chosen": -53.183937072753906, + "logps/rejected": -97.51420593261719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.14708137512207, + "rewards/margins": 13.861615180969238, + "rewards/rejected": -18.008697509765625, + "step": 1217 + }, + { + "epoch": 7.217777777777778, + "grad_norm": 0.1193832583649323, + "learning_rate": 1.3280521718286253e-08, + "logits/chosen": -1.4530967473983765, + "logits/rejected": -1.2759112119674683, + "logps/chosen": -45.5231819152832, + "logps/rejected": -75.6134262084961, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.124416828155518, + "rewards/margins": 11.91776180267334, + "rewards/rejected": -16.042179107666016, + "step": 1218 + }, + { + "epoch": 7.223703703703704, + "grad_norm": 0.09709503888659948, + "learning_rate": 1.3072405721548857e-08, + "logits/chosen": -1.8256022930145264, + "logits/rejected": -2.0024163722991943, + "logps/chosen": -61.09766387939453, + "logps/rejected": -71.4029312133789, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.887969493865967, + "rewards/margins": 9.966340065002441, + "rewards/rejected": -13.854310035705566, + "step": 1219 + }, + { + "epoch": 7.229629629629629, + "grad_norm": 0.10475361104381965, + "learning_rate": 1.2865889513213628e-08, + "logits/chosen": -1.619799256324768, + "logits/rejected": -1.7144474983215332, + "logps/chosen": -47.38188171386719, + "logps/rejected": -72.77843475341797, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.503984451293945, + "rewards/margins": 10.436614990234375, + "rewards/rejected": -14.94059944152832, + "step": 1220 + }, + { + "epoch": 7.235555555555556, + "grad_norm": 0.07688143241284169, + "learning_rate": 1.2660974487724407e-08, + "logits/chosen": -1.3199595212936401, + "logits/rejected": -1.292825698852539, + "logps/chosen": -42.49819564819336, + "logps/rejected": -77.67166137695312, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.343793869018555, + "rewards/margins": 13.139480590820312, + "rewards/rejected": -17.4832763671875, + "step": 1221 + }, + { + "epoch": 7.241481481481482, + "grad_norm": 0.060681308189254796, + "learning_rate": 1.2457662028713594e-08, + "logits/chosen": -1.419334888458252, + "logits/rejected": -1.3187925815582275, + "logps/chosen": -36.37110137939453, + "logps/rejected": -81.13549041748047, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.139266014099121, + "rewards/margins": 13.64179801940918, + "rewards/rejected": -16.781063079833984, + "step": 1222 + }, + { + "epoch": 7.247407407407407, + "grad_norm": 0.10166151212202382, + "learning_rate": 1.2255953508992612e-08, + "logits/chosen": -1.9275920391082764, + "logits/rejected": -1.8258066177368164, + "logps/chosen": -51.44398880004883, + "logps/rejected": -87.87835693359375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.75351095199585, + "rewards/margins": 12.301464080810547, + "rewards/rejected": -18.054975509643555, + "step": 1223 + }, + { + "epoch": 7.253333333333333, + "grad_norm": 0.07296420361158287, + "learning_rate": 1.205585029054279e-08, + "logits/chosen": -1.6533150672912598, + "logits/rejected": -1.7473328113555908, + "logps/chosen": -56.52097702026367, + "logps/rejected": -86.59286499023438, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.681492805480957, + "rewards/margins": 11.921660423278809, + "rewards/rejected": -17.603153228759766, + "step": 1224 + }, + { + "epoch": 7.2592592592592595, + "grad_norm": 0.0960882965233299, + "learning_rate": 1.1857353724505942e-08, + "logits/chosen": -1.789186716079712, + "logits/rejected": -1.7957122325897217, + "logps/chosen": -55.02898406982422, + "logps/rejected": -95.71049499511719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.103061199188232, + "rewards/margins": 14.242263793945312, + "rewards/rejected": -19.34532356262207, + "step": 1225 + }, + { + "epoch": 7.265185185185185, + "grad_norm": 0.11358951568845953, + "learning_rate": 1.1660465151175664e-08, + "logits/chosen": -2.1114211082458496, + "logits/rejected": -2.090874671936035, + "logps/chosen": -44.74166488647461, + "logps/rejected": -83.52067565917969, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4121861457824707, + "rewards/margins": 13.98027515411377, + "rewards/rejected": -17.3924617767334, + "step": 1226 + }, + { + "epoch": 7.271111111111111, + "grad_norm": 0.0780915485477315, + "learning_rate": 1.1465185899987794e-08, + "logits/chosen": -1.786757230758667, + "logits/rejected": -1.7711902856826782, + "logps/chosen": -44.79767608642578, + "logps/rejected": -80.92269134521484, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.684630870819092, + "rewards/margins": 12.207763671875, + "rewards/rejected": -16.89239501953125, + "step": 1227 + }, + { + "epoch": 7.277037037037037, + "grad_norm": 0.08736095702879869, + "learning_rate": 1.1271517289511783e-08, + "logits/chosen": -1.7297133207321167, + "logits/rejected": -1.6254792213439941, + "logps/chosen": -46.41914367675781, + "logps/rejected": -73.42701721191406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.2106242179870605, + "rewards/margins": 10.198822021484375, + "rewards/rejected": -14.409445762634277, + "step": 1228 + }, + { + "epoch": 7.282962962962963, + "grad_norm": 0.12460175164369738, + "learning_rate": 1.1079460627441666e-08, + "logits/chosen": -2.1245529651641846, + "logits/rejected": -1.9352598190307617, + "logps/chosen": -30.707019805908203, + "logps/rejected": -71.42001342773438, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4688324928283691, + "rewards/margins": 12.905763626098633, + "rewards/rejected": -14.374595642089844, + "step": 1229 + }, + { + "epoch": 7.288888888888889, + "grad_norm": 0.04378917663791941, + "learning_rate": 1.0889017210587215e-08, + "logits/chosen": -1.6909232139587402, + "logits/rejected": -1.1679630279541016, + "logps/chosen": -45.363487243652344, + "logps/rejected": -105.81968688964844, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5888261795043945, + "rewards/margins": 15.115147590637207, + "rewards/rejected": -18.7039737701416, + "step": 1230 + }, + { + "epoch": 7.294814814814814, + "grad_norm": 0.10066231851710405, + "learning_rate": 1.0700188324865189e-08, + "logits/chosen": -1.2599670886993408, + "logits/rejected": -1.2062746286392212, + "logps/chosen": -55.062339782714844, + "logps/rejected": -85.76885223388672, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.546828031539917, + "rewards/margins": 12.588035583496094, + "rewards/rejected": -16.134864807128906, + "step": 1231 + }, + { + "epoch": 7.300740740740741, + "grad_norm": 0.10055897394398317, + "learning_rate": 1.0512975245290685e-08, + "logits/chosen": -1.695151448249817, + "logits/rejected": -1.590423345565796, + "logps/chosen": -32.681095123291016, + "logps/rejected": -64.72657012939453, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.348266839981079, + "rewards/margins": 10.46805191040039, + "rewards/rejected": -12.81631851196289, + "step": 1232 + }, + { + "epoch": 7.306666666666667, + "grad_norm": 0.1545562358316079, + "learning_rate": 1.0327379235968548e-08, + "logits/chosen": -1.6161727905273438, + "logits/rejected": -1.5110998153686523, + "logps/chosen": -36.74101257324219, + "logps/rejected": -69.12883758544922, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4509549140930176, + "rewards/margins": 11.02175521850586, + "rewards/rejected": -13.472709655761719, + "step": 1233 + }, + { + "epoch": 7.312592592592592, + "grad_norm": 0.097607206610237, + "learning_rate": 1.0143401550084751e-08, + "logits/chosen": -2.2964179515838623, + "logits/rejected": -2.0990958213806152, + "logps/chosen": -41.50244140625, + "logps/rejected": -97.56422424316406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9923524856567383, + "rewards/margins": 13.67741584777832, + "rewards/rejected": -16.669769287109375, + "step": 1234 + }, + { + "epoch": 7.318518518518519, + "grad_norm": 0.18438840616271115, + "learning_rate": 9.961043429898036e-09, + "logits/chosen": -1.7009012699127197, + "logits/rejected": -1.5941734313964844, + "logps/chosen": -66.56523132324219, + "logps/rejected": -89.49272155761719, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.492288112640381, + "rewards/margins": 13.816864013671875, + "rewards/rejected": -18.309152603149414, + "step": 1235 + }, + { + "epoch": 7.3244444444444445, + "grad_norm": 0.10830083734526227, + "learning_rate": 9.780306106731418e-09, + "logits/chosen": -2.0649874210357666, + "logits/rejected": -1.8523389101028442, + "logps/chosen": -40.13294982910156, + "logps/rejected": -87.09971618652344, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.167768716812134, + "rewards/margins": 13.352956771850586, + "rewards/rejected": -16.52072525024414, + "step": 1236 + }, + { + "epoch": 7.33037037037037, + "grad_norm": 0.13555625022568338, + "learning_rate": 9.601190800963942e-09, + "logits/chosen": -1.7808589935302734, + "logits/rejected": -1.6981477737426758, + "logps/chosen": -38.0859489440918, + "logps/rejected": -69.84672546386719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0800421237945557, + "rewards/margins": 11.428643226623535, + "rewards/rejected": -14.508685111999512, + "step": 1237 + }, + { + "epoch": 7.336296296296297, + "grad_norm": 0.07701345679436172, + "learning_rate": 9.423698722022505e-09, + "logits/chosen": -1.7928366661071777, + "logits/rejected": -1.6887304782867432, + "logps/chosen": -53.604183197021484, + "logps/rejected": -96.15071105957031, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.998568534851074, + "rewards/margins": 13.373642921447754, + "rewards/rejected": -18.372211456298828, + "step": 1238 + }, + { + "epoch": 7.342222222222222, + "grad_norm": 0.09833084593667012, + "learning_rate": 9.247831068373458e-09, + "logits/chosen": -1.3438589572906494, + "logits/rejected": -1.445772409439087, + "logps/chosen": -50.54953384399414, + "logps/rejected": -79.28211975097656, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8045411109924316, + "rewards/margins": 12.622430801391602, + "rewards/rejected": -15.426971435546875, + "step": 1239 + }, + { + "epoch": 7.348148148148148, + "grad_norm": 0.07525388467886729, + "learning_rate": 9.073589027514789e-09, + "logits/chosen": -1.3987599611282349, + "logits/rejected": -1.130275011062622, + "logps/chosen": -42.32148361206055, + "logps/rejected": -93.08763122558594, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6124842166900635, + "rewards/margins": 16.635757446289062, + "rewards/rejected": -19.248241424560547, + "step": 1240 + }, + { + "epoch": 7.354074074074074, + "grad_norm": 0.06922858097247736, + "learning_rate": 8.900973775967963e-09, + "logits/chosen": -1.4953887462615967, + "logits/rejected": -1.475359320640564, + "logps/chosen": -38.44170379638672, + "logps/rejected": -63.54737091064453, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.280339241027832, + "rewards/margins": 10.804010391235352, + "rewards/rejected": -13.0843505859375, + "step": 1241 + }, + { + "epoch": 7.36, + "grad_norm": 0.09047335182590145, + "learning_rate": 8.729986479269924e-09, + "logits/chosen": -1.6588947772979736, + "logits/rejected": -1.534369945526123, + "logps/chosen": -51.98193359375, + "logps/rejected": -84.31365966796875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.556525945663452, + "rewards/margins": 14.048201560974121, + "rewards/rejected": -17.604726791381836, + "step": 1242 + }, + { + "epoch": 7.365925925925926, + "grad_norm": 0.12417063115745969, + "learning_rate": 8.56062829196541e-09, + "logits/chosen": -1.9344502687454224, + "logits/rejected": -1.9332109689712524, + "logps/chosen": -49.665287017822266, + "logps/rejected": -76.17633819580078, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.408128261566162, + "rewards/margins": 10.428698539733887, + "rewards/rejected": -12.83682632446289, + "step": 1243 + }, + { + "epoch": 7.371851851851852, + "grad_norm": 0.18621901326488574, + "learning_rate": 8.392900357598959e-09, + "logits/chosen": -1.5785934925079346, + "logits/rejected": -1.6058554649353027, + "logps/chosen": -56.491905212402344, + "logps/rejected": -87.51421356201172, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.567201614379883, + "rewards/margins": 13.711860656738281, + "rewards/rejected": -18.27906036376953, + "step": 1244 + }, + { + "epoch": 7.377777777777778, + "grad_norm": 0.2040690241681394, + "learning_rate": 8.2268038087073e-09, + "logits/chosen": -1.2561684846878052, + "logits/rejected": -1.3627725839614868, + "logps/chosen": -57.90671920776367, + "logps/rejected": -68.91231536865234, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6979713439941406, + "rewards/margins": 11.910882949829102, + "rewards/rejected": -15.608854293823242, + "step": 1245 + }, + { + "epoch": 7.383703703703704, + "grad_norm": 0.09974348633584786, + "learning_rate": 8.062339766811726e-09, + "logits/chosen": -1.4112184047698975, + "logits/rejected": -1.5091230869293213, + "logps/chosen": -54.87425231933594, + "logps/rejected": -84.4752197265625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7919604778289795, + "rewards/margins": 13.433491706848145, + "rewards/rejected": -17.225452423095703, + "step": 1246 + }, + { + "epoch": 7.3896296296296295, + "grad_norm": 0.09597782439298651, + "learning_rate": 7.899509342410376e-09, + "logits/chosen": -1.48615562915802, + "logits/rejected": -1.1188569068908691, + "logps/chosen": -43.99871063232422, + "logps/rejected": -87.73038482666016, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.9534430503845215, + "rewards/margins": 12.593347549438477, + "rewards/rejected": -16.546789169311523, + "step": 1247 + }, + { + "epoch": 7.395555555555555, + "grad_norm": 0.0992930007513707, + "learning_rate": 7.738313634970962e-09, + "logits/chosen": -2.0040321350097656, + "logits/rejected": -1.9670236110687256, + "logps/chosen": -46.382930755615234, + "logps/rejected": -87.54319763183594, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6569628715515137, + "rewards/margins": 15.009819030761719, + "rewards/rejected": -18.66678237915039, + "step": 1248 + }, + { + "epoch": 7.401481481481482, + "grad_norm": 0.10285352268854367, + "learning_rate": 7.578753732923132e-09, + "logits/chosen": -1.5322126150131226, + "logits/rejected": -1.5202000141143799, + "logps/chosen": -48.83302307128906, + "logps/rejected": -90.44819641113281, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2107291221618652, + "rewards/margins": 14.075639724731445, + "rewards/rejected": -17.28636932373047, + "step": 1249 + }, + { + "epoch": 7.407407407407407, + "grad_norm": 0.07384109176112451, + "learning_rate": 7.4208307136512385e-09, + "logits/chosen": -1.393030047416687, + "logits/rejected": -1.254082202911377, + "logps/chosen": -40.82709884643555, + "logps/rejected": -70.26875305175781, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.902104377746582, + "rewards/margins": 11.157198905944824, + "rewards/rejected": -15.059304237365723, + "step": 1250 + }, + { + "epoch": 7.413333333333333, + "grad_norm": 0.10033634481057024, + "learning_rate": 7.2645456434869965e-09, + "logits/chosen": -1.778923511505127, + "logits/rejected": -1.6090879440307617, + "logps/chosen": -52.271240234375, + "logps/rejected": -99.33088684082031, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.88856315612793, + "rewards/margins": 14.440168380737305, + "rewards/rejected": -19.328731536865234, + "step": 1251 + }, + { + "epoch": 7.41925925925926, + "grad_norm": 0.07932056686265246, + "learning_rate": 7.109899577702389e-09, + "logits/chosen": -1.5965080261230469, + "logits/rejected": -1.5556455850601196, + "logps/chosen": -38.09616470336914, + "logps/rejected": -73.47975158691406, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3132272958755493, + "rewards/margins": 12.046424865722656, + "rewards/rejected": -13.35965347290039, + "step": 1252 + }, + { + "epoch": 7.425185185185185, + "grad_norm": 0.08761752858525221, + "learning_rate": 6.956893560502358e-09, + "logits/chosen": -2.030911445617676, + "logits/rejected": -1.9399561882019043, + "logps/chosen": -42.745521545410156, + "logps/rejected": -75.47513580322266, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.092884063720703, + "rewards/margins": 10.458334922790527, + "rewards/rejected": -13.551218032836914, + "step": 1253 + }, + { + "epoch": 7.431111111111111, + "grad_norm": 0.10380101419190634, + "learning_rate": 6.805528625018014e-09, + "logits/chosen": -1.7056939601898193, + "logits/rejected": -1.5778566598892212, + "logps/chosen": -54.559730529785156, + "logps/rejected": -87.96572875976562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.20688533782959, + "rewards/margins": 12.745380401611328, + "rewards/rejected": -16.9522647857666, + "step": 1254 + }, + { + "epoch": 7.437037037037037, + "grad_norm": 0.10501305505929422, + "learning_rate": 6.655805793299413e-09, + "logits/chosen": -1.62470281124115, + "logits/rejected": -1.469839334487915, + "logps/chosen": -42.00901794433594, + "logps/rejected": -80.80410766601562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.080652952194214, + "rewards/margins": 12.35036849975586, + "rewards/rejected": -15.43101978302002, + "step": 1255 + }, + { + "epoch": 7.442962962962963, + "grad_norm": 0.10261502163857839, + "learning_rate": 6.5077260763087836e-09, + "logits/chosen": -1.2496845722198486, + "logits/rejected": -1.1165101528167725, + "logps/chosen": -43.556697845458984, + "logps/rejected": -72.50883483886719, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3539299964904785, + "rewards/margins": 10.830613136291504, + "rewards/rejected": -14.18454360961914, + "step": 1256 + }, + { + "epoch": 7.448888888888889, + "grad_norm": 0.13754126034863676, + "learning_rate": 6.361290473913705e-09, + "logits/chosen": -1.7846851348876953, + "logits/rejected": -1.6844943761825562, + "logps/chosen": -54.899391174316406, + "logps/rejected": -99.20624542236328, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.213879108428955, + "rewards/margins": 15.631208419799805, + "rewards/rejected": -19.8450870513916, + "step": 1257 + }, + { + "epoch": 7.454814814814815, + "grad_norm": 0.13332224892140737, + "learning_rate": 6.216499974880274e-09, + "logits/chosen": -1.4883358478546143, + "logits/rejected": -1.5634472370147705, + "logps/chosen": -45.47296142578125, + "logps/rejected": -67.79109191894531, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1466634273529053, + "rewards/margins": 12.485091209411621, + "rewards/rejected": -14.631753921508789, + "step": 1258 + }, + { + "epoch": 7.460740740740741, + "grad_norm": 0.10757164589821686, + "learning_rate": 6.073355556866527e-09, + "logits/chosen": -1.5689035654067993, + "logits/rejected": -1.6330454349517822, + "logps/chosen": -62.354061126708984, + "logps/rejected": -74.84004211425781, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7635915279388428, + "rewards/margins": 12.152324676513672, + "rewards/rejected": -15.915916442871094, + "step": 1259 + }, + { + "epoch": 7.466666666666667, + "grad_norm": 0.15137621474800703, + "learning_rate": 5.9318581864157555e-09, + "logits/chosen": -1.6206462383270264, + "logits/rejected": -1.5390565395355225, + "logps/chosen": -45.372283935546875, + "logps/rejected": -71.48526000976562, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.424093723297119, + "rewards/margins": 11.92187213897705, + "rewards/rejected": -15.345966339111328, + "step": 1260 + }, + { + "epoch": 7.4725925925925925, + "grad_norm": 0.06945252873508109, + "learning_rate": 5.792008818950034e-09, + "logits/chosen": -1.677442193031311, + "logits/rejected": -1.543940544128418, + "logps/chosen": -41.09063720703125, + "logps/rejected": -78.29434204101562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2955427169799805, + "rewards/margins": 12.228326797485352, + "rewards/rejected": -15.523869514465332, + "step": 1261 + }, + { + "epoch": 7.478518518518518, + "grad_norm": 0.09200967443221757, + "learning_rate": 5.653808398763726e-09, + "logits/chosen": -1.9046722650527954, + "logits/rejected": -1.8483989238739014, + "logps/chosen": -35.713706970214844, + "logps/rejected": -54.89629364013672, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9885141253471375, + "rewards/margins": 9.532726287841797, + "rewards/rejected": -10.521241188049316, + "step": 1262 + }, + { + "epoch": 7.484444444444445, + "grad_norm": 0.08851787133475601, + "learning_rate": 5.5172578590171606e-09, + "logits/chosen": -1.712750792503357, + "logits/rejected": -1.7361048460006714, + "logps/chosen": -33.406742095947266, + "logps/rejected": -64.78385925292969, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1024856567382812, + "rewards/margins": 11.14158821105957, + "rewards/rejected": -13.244073867797852, + "step": 1263 + }, + { + "epoch": 7.49037037037037, + "grad_norm": 0.13185333469961655, + "learning_rate": 5.382358121730296e-09, + "logits/chosen": -2.138230800628662, + "logits/rejected": -2.0349910259246826, + "logps/chosen": -38.78353500366211, + "logps/rejected": -73.98619842529297, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.396270751953125, + "rewards/margins": 11.905975341796875, + "rewards/rejected": -14.30224609375, + "step": 1264 + }, + { + "epoch": 7.496296296296296, + "grad_norm": 0.11830301808182496, + "learning_rate": 5.249110097776482e-09, + "logits/chosen": -1.7554690837860107, + "logits/rejected": -1.6527049541473389, + "logps/chosen": -55.28874969482422, + "logps/rejected": -85.10946655273438, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.7993483543396, + "rewards/margins": 12.124895095825195, + "rewards/rejected": -17.924243927001953, + "step": 1265 + }, + { + "epoch": 7.502222222222223, + "grad_norm": 0.10086200648031751, + "learning_rate": 5.117514686876378e-09, + "logits/chosen": -1.4983659982681274, + "logits/rejected": -1.3238434791564941, + "logps/chosen": -41.69336700439453, + "logps/rejected": -90.9068603515625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6861684322357178, + "rewards/margins": 14.651307106018066, + "rewards/rejected": -18.337474822998047, + "step": 1266 + }, + { + "epoch": 7.508148148148148, + "grad_norm": 0.11418057405054963, + "learning_rate": 4.987572777591764e-09, + "logits/chosen": -2.029273271560669, + "logits/rejected": -1.9411393404006958, + "logps/chosen": -50.297698974609375, + "logps/rejected": -85.19852447509766, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.682843208312988, + "rewards/margins": 13.862180709838867, + "rewards/rejected": -18.545024871826172, + "step": 1267 + }, + { + "epoch": 7.514074074074074, + "grad_norm": 0.110024960488623, + "learning_rate": 4.859285247319656e-09, + "logits/chosen": -1.9883079528808594, + "logits/rejected": -1.835797667503357, + "logps/chosen": -38.44123077392578, + "logps/rejected": -76.23625183105469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.574667453765869, + "rewards/margins": 12.788910865783691, + "rewards/rejected": -16.36357879638672, + "step": 1268 + }, + { + "epoch": 7.52, + "grad_norm": 0.07976217403911946, + "learning_rate": 4.732652962286282e-09, + "logits/chosen": -1.9681137800216675, + "logits/rejected": -1.8588993549346924, + "logps/chosen": -48.73500442504883, + "logps/rejected": -96.4697265625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.277359962463379, + "rewards/margins": 12.159460067749023, + "rewards/rejected": -17.436819076538086, + "step": 1269 + }, + { + "epoch": 7.525925925925926, + "grad_norm": 0.17881558205057357, + "learning_rate": 4.607676777541342e-09, + "logits/chosen": -1.4052648544311523, + "logits/rejected": -1.2790547609329224, + "logps/chosen": -48.86443328857422, + "logps/rejected": -83.3725357055664, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.212798118591309, + "rewards/margins": 12.557598114013672, + "rewards/rejected": -16.770395278930664, + "step": 1270 + }, + { + "epoch": 7.531851851851852, + "grad_norm": 0.13713918527473565, + "learning_rate": 4.4843575369521155e-09, + "logits/chosen": -1.8819085359573364, + "logits/rejected": -2.066300868988037, + "logps/chosen": -74.62556457519531, + "logps/rejected": -95.09159851074219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -6.8993940353393555, + "rewards/margins": 12.837356567382812, + "rewards/rejected": -19.736751556396484, + "step": 1271 + }, + { + "epoch": 7.5377777777777775, + "grad_norm": 0.11548976688435701, + "learning_rate": 4.362696073197863e-09, + "logits/chosen": -1.3876936435699463, + "logits/rejected": -1.496161937713623, + "logps/chosen": -46.12397003173828, + "logps/rejected": -64.80559539794922, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6518354415893555, + "rewards/margins": 10.367033004760742, + "rewards/rejected": -13.018869400024414, + "step": 1272 + }, + { + "epoch": 7.543703703703704, + "grad_norm": 0.14402006709619897, + "learning_rate": 4.242693207764159e-09, + "logits/chosen": -1.6995124816894531, + "logits/rejected": -1.7169837951660156, + "logps/chosen": -46.91732406616211, + "logps/rejected": -79.42771911621094, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7356224060058594, + "rewards/margins": 13.237366676330566, + "rewards/rejected": -15.972990036010742, + "step": 1273 + }, + { + "epoch": 7.54962962962963, + "grad_norm": 0.06793914185958956, + "learning_rate": 4.12434975093734e-09, + "logits/chosen": -1.6203457117080688, + "logits/rejected": -1.6125106811523438, + "logps/chosen": -48.54278564453125, + "logps/rejected": -76.17129516601562, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.657160758972168, + "rewards/margins": 12.374560356140137, + "rewards/rejected": -17.031721115112305, + "step": 1274 + }, + { + "epoch": 7.555555555555555, + "grad_norm": 0.08491480689935392, + "learning_rate": 4.007666501799012e-09, + "logits/chosen": -2.2714755535125732, + "logits/rejected": -2.1652722358703613, + "logps/chosen": -45.407203674316406, + "logps/rejected": -86.5791015625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.742861270904541, + "rewards/margins": 11.433135032653809, + "rewards/rejected": -15.175996780395508, + "step": 1275 + }, + { + "epoch": 7.561481481481481, + "grad_norm": 0.11529216841661874, + "learning_rate": 3.89264424822075e-09, + "logits/chosen": -1.641927719116211, + "logits/rejected": -1.4134973287582397, + "logps/chosen": -49.21048355102539, + "logps/rejected": -88.19728088378906, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.468847274780273, + "rewards/margins": 11.007088661193848, + "rewards/rejected": -15.475936889648438, + "step": 1276 + }, + { + "epoch": 7.567407407407408, + "grad_norm": 0.11468580187199377, + "learning_rate": 3.779283766858682e-09, + "logits/chosen": -1.4971985816955566, + "logits/rejected": -1.2270244359970093, + "logps/chosen": -28.876060485839844, + "logps/rejected": -70.2495346069336, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6484756469726562, + "rewards/margins": 12.464676856994629, + "rewards/rejected": -14.113153457641602, + "step": 1277 + }, + { + "epoch": 7.573333333333333, + "grad_norm": 0.10707890797785785, + "learning_rate": 3.667585823148217e-09, + "logits/chosen": -1.5404367446899414, + "logits/rejected": -1.5498936176300049, + "logps/chosen": -54.821353912353516, + "logps/rejected": -76.89654541015625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.003150939941406, + "rewards/margins": 10.68851089477539, + "rewards/rejected": -14.691661834716797, + "step": 1278 + }, + { + "epoch": 7.579259259259259, + "grad_norm": 0.13044904772149862, + "learning_rate": 3.5575511712990504e-09, + "logits/chosen": -1.5401300191879272, + "logits/rejected": -1.5082144737243652, + "logps/chosen": -49.69709396362305, + "logps/rejected": -83.05561065673828, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6358282566070557, + "rewards/margins": 11.022361755371094, + "rewards/rejected": -14.65818977355957, + "step": 1279 + }, + { + "epoch": 7.5851851851851855, + "grad_norm": 0.1139761611228426, + "learning_rate": 3.4491805542899155e-09, + "logits/chosen": -1.276806354522705, + "logits/rejected": -1.2018065452575684, + "logps/chosen": -38.74787139892578, + "logps/rejected": -74.66535186767578, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6435717344284058, + "rewards/margins": 12.292912483215332, + "rewards/rejected": -13.936485290527344, + "step": 1280 + }, + { + "epoch": 7.591111111111111, + "grad_norm": 0.11980572970276052, + "learning_rate": 3.342474703863507e-09, + "logits/chosen": -1.935957908630371, + "logits/rejected": -1.670924425125122, + "logps/chosen": -50.889198303222656, + "logps/rejected": -93.57077026367188, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.593352794647217, + "rewards/margins": 14.393285751342773, + "rewards/rejected": -17.986637115478516, + "step": 1281 + }, + { + "epoch": 7.597037037037037, + "grad_norm": 0.14396962997949114, + "learning_rate": 3.2374343405217884e-09, + "logits/chosen": -1.7795342206954956, + "logits/rejected": -1.6266406774520874, + "logps/chosen": -37.90673065185547, + "logps/rejected": -76.17749786376953, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9176108837127686, + "rewards/margins": 13.06861686706543, + "rewards/rejected": -15.986227035522461, + "step": 1282 + }, + { + "epoch": 7.6029629629629625, + "grad_norm": 0.050320004658963226, + "learning_rate": 3.1340601735209137e-09, + "logits/chosen": -1.5465812683105469, + "logits/rejected": -1.3979713916778564, + "logps/chosen": -44.11369323730469, + "logps/rejected": -81.26278686523438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2288765907287598, + "rewards/margins": 13.506202697753906, + "rewards/rejected": -16.735078811645508, + "step": 1283 + }, + { + "epoch": 7.608888888888889, + "grad_norm": 0.12064099276652479, + "learning_rate": 3.0323529008664807e-09, + "logits/chosen": -1.6967335939407349, + "logits/rejected": -1.5087066888809204, + "logps/chosen": -42.999183654785156, + "logps/rejected": -77.17369079589844, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.500814437866211, + "rewards/margins": 11.972524642944336, + "rewards/rejected": -14.473339080810547, + "step": 1284 + }, + { + "epoch": 7.614814814814815, + "grad_norm": 0.12569846714622554, + "learning_rate": 2.9323132093088954e-09, + "logits/chosen": -1.8397209644317627, + "logits/rejected": -1.7531307935714722, + "logps/chosen": -36.641639709472656, + "logps/rejected": -76.00559997558594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.290510654449463, + "rewards/margins": 14.336465835571289, + "rewards/rejected": -16.626977920532227, + "step": 1285 + }, + { + "epoch": 7.62074074074074, + "grad_norm": 0.16279004525534851, + "learning_rate": 2.833941774338655e-09, + "logits/chosen": -2.2504396438598633, + "logits/rejected": -2.192629337310791, + "logps/chosen": -41.64829635620117, + "logps/rejected": -79.78437805175781, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9262151718139648, + "rewards/margins": 14.325329780578613, + "rewards/rejected": -16.251544952392578, + "step": 1286 + }, + { + "epoch": 7.626666666666667, + "grad_norm": 0.12188734734718085, + "learning_rate": 2.7372392601817675e-09, + "logits/chosen": -1.862606406211853, + "logits/rejected": -1.7344017028808594, + "logps/chosen": -46.619632720947266, + "logps/rejected": -80.42242431640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7526557445526123, + "rewards/margins": 12.96417236328125, + "rewards/rejected": -15.716828346252441, + "step": 1287 + }, + { + "epoch": 7.632592592592593, + "grad_norm": 0.10275281860218967, + "learning_rate": 2.6422063197953926e-09, + "logits/chosen": -2.094736099243164, + "logits/rejected": -2.141115427017212, + "logps/chosen": -49.34331130981445, + "logps/rejected": -82.15042114257812, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.431687593460083, + "rewards/margins": 11.898085594177246, + "rewards/rejected": -15.32977294921875, + "step": 1288 + }, + { + "epoch": 7.638518518518518, + "grad_norm": 0.10108153760024327, + "learning_rate": 2.548843594863348e-09, + "logits/chosen": -1.8961472511291504, + "logits/rejected": -1.8754181861877441, + "logps/chosen": -51.4503173828125, + "logps/rejected": -77.76640319824219, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0304574966430664, + "rewards/margins": 11.970703125, + "rewards/rejected": -15.00115966796875, + "step": 1289 + }, + { + "epoch": 7.644444444444445, + "grad_norm": 0.11013184830599393, + "learning_rate": 2.4571517157916944e-09, + "logits/chosen": -1.6822834014892578, + "logits/rejected": -1.51578688621521, + "logps/chosen": -33.87409210205078, + "logps/rejected": -77.11097717285156, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.183983325958252, + "rewards/margins": 12.748117446899414, + "rewards/rejected": -14.932100296020508, + "step": 1290 + }, + { + "epoch": 7.6503703703703705, + "grad_norm": 0.2332749309815809, + "learning_rate": 2.3671313017046557e-09, + "logits/chosen": -1.6843562126159668, + "logits/rejected": -1.731232762336731, + "logps/chosen": -53.30952453613281, + "logps/rejected": -75.22252655029297, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.883240699768066, + "rewards/margins": 11.473252296447754, + "rewards/rejected": -16.35649299621582, + "step": 1291 + }, + { + "epoch": 7.656296296296296, + "grad_norm": 0.06386399382541402, + "learning_rate": 2.27878296044029e-09, + "logits/chosen": -1.9962579011917114, + "logits/rejected": -1.9769573211669922, + "logps/chosen": -50.019596099853516, + "logps/rejected": -79.72898864746094, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3184549808502197, + "rewards/margins": 11.753643989562988, + "rewards/rejected": -15.072098731994629, + "step": 1292 + }, + { + "epoch": 7.662222222222223, + "grad_norm": 0.1382007429150244, + "learning_rate": 2.1921072885464633e-09, + "logits/chosen": -1.746502161026001, + "logits/rejected": -1.880075216293335, + "logps/chosen": -46.33978271484375, + "logps/rejected": -69.8848648071289, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3827314376831055, + "rewards/margins": 10.580240249633789, + "rewards/rejected": -13.962972640991211, + "step": 1293 + }, + { + "epoch": 7.668148148148148, + "grad_norm": 0.08545353980119474, + "learning_rate": 2.1071048712768545e-09, + "logits/chosen": -2.118556499481201, + "logits/rejected": -2.007209062576294, + "logps/chosen": -39.88838195800781, + "logps/rejected": -72.66770935058594, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8593382835388184, + "rewards/margins": 11.902694702148438, + "rewards/rejected": -13.762033462524414, + "step": 1294 + }, + { + "epoch": 7.674074074074074, + "grad_norm": 0.10569469042372463, + "learning_rate": 2.0237762825868752e-09, + "logits/chosen": -2.210296154022217, + "logits/rejected": -2.1025829315185547, + "logps/chosen": -53.71271514892578, + "logps/rejected": -79.43670654296875, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.210070610046387, + "rewards/margins": 12.35400390625, + "rewards/rejected": -16.56407356262207, + "step": 1295 + }, + { + "epoch": 7.68, + "grad_norm": 0.12680804276707394, + "learning_rate": 1.9421220851298657e-09, + "logits/chosen": -1.7897592782974243, + "logits/rejected": -1.6870161294937134, + "logps/chosen": -45.054420471191406, + "logps/rejected": -87.12640380859375, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4378550052642822, + "rewards/margins": 13.260967254638672, + "rewards/rejected": -16.698823928833008, + "step": 1296 + }, + { + "epoch": 7.685925925925926, + "grad_norm": 0.1026870860090837, + "learning_rate": 1.8621428302533492e-09, + "logits/chosen": -1.8493924140930176, + "logits/rejected": -1.8512948751449585, + "logps/chosen": -46.09538650512695, + "logps/rejected": -74.38146209716797, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.512159824371338, + "rewards/margins": 11.365545272827148, + "rewards/rejected": -14.877706527709961, + "step": 1297 + }, + { + "epoch": 7.691851851851852, + "grad_norm": 0.12428573647645252, + "learning_rate": 1.7838390579952567e-09, + "logits/chosen": -1.5749092102050781, + "logits/rejected": -1.710465908050537, + "logps/chosen": -43.68793487548828, + "logps/rejected": -69.81632995605469, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.554255723953247, + "rewards/margins": 11.78613567352295, + "rewards/rejected": -15.340391159057617, + "step": 1298 + }, + { + "epoch": 7.697777777777778, + "grad_norm": 0.07377154770168977, + "learning_rate": 1.7072112970802633e-09, + "logits/chosen": -1.6239237785339355, + "logits/rejected": -1.5087684392929077, + "logps/chosen": -39.3017578125, + "logps/rejected": -78.0733642578125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.597720146179199, + "rewards/margins": 12.936697006225586, + "rewards/rejected": -15.534416198730469, + "step": 1299 + }, + { + "epoch": 7.703703703703704, + "grad_norm": 0.11840426071981136, + "learning_rate": 1.6322600649162354e-09, + "logits/chosen": -1.8614153861999512, + "logits/rejected": -1.8858567476272583, + "logps/chosen": -49.18476867675781, + "logps/rejected": -71.32328796386719, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.097779273986816, + "rewards/margins": 9.952096939086914, + "rewards/rejected": -14.04987621307373, + "step": 1300 + }, + { + "epoch": 7.70962962962963, + "grad_norm": 0.11361634619507371, + "learning_rate": 1.5589858675907618e-09, + "logits/chosen": -1.7827186584472656, + "logits/rejected": -1.7072844505310059, + "logps/chosen": -50.35566711425781, + "logps/rejected": -82.6529769897461, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8080339431762695, + "rewards/margins": 11.548542022705078, + "rewards/rejected": -15.356575965881348, + "step": 1301 + }, + { + "epoch": 7.7155555555555555, + "grad_norm": 0.10114625449764604, + "learning_rate": 1.4873891998677112e-09, + "logits/chosen": -1.7124638557434082, + "logits/rejected": -1.6303023099899292, + "logps/chosen": -39.04692077636719, + "logps/rejected": -69.08177947998047, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.496570110321045, + "rewards/margins": 10.149101257324219, + "rewards/rejected": -11.645671844482422, + "step": 1302 + }, + { + "epoch": 7.721481481481481, + "grad_norm": 0.10974877746783143, + "learning_rate": 1.4174705451838743e-09, + "logits/chosen": -1.6638457775115967, + "logits/rejected": -1.7446383237838745, + "logps/chosen": -44.485374450683594, + "logps/rejected": -66.89187622070312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.829464316368103, + "rewards/margins": 11.198625564575195, + "rewards/rejected": -13.02808952331543, + "step": 1303 + }, + { + "epoch": 7.727407407407408, + "grad_norm": 0.11233063444015381, + "learning_rate": 1.3492303756457158e-09, + "logits/chosen": -1.914198398590088, + "logits/rejected": -1.5145885944366455, + "logps/chosen": -45.77354431152344, + "logps/rejected": -98.0726089477539, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.005705833435059, + "rewards/margins": 17.062856674194336, + "rewards/rejected": -22.06856346130371, + "step": 1304 + }, + { + "epoch": 7.733333333333333, + "grad_norm": 0.11411994741058401, + "learning_rate": 1.2826691520262112e-09, + "logits/chosen": -1.5672929286956787, + "logits/rejected": -1.4095594882965088, + "logps/chosen": -43.227386474609375, + "logps/rejected": -78.2294692993164, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.725564956665039, + "rewards/margins": 10.642614364624023, + "rewards/rejected": -14.368179321289062, + "step": 1305 + }, + { + "epoch": 7.739259259259259, + "grad_norm": 0.11628123883143882, + "learning_rate": 1.2177873237617375e-09, + "logits/chosen": -1.3895998001098633, + "logits/rejected": -1.5497479438781738, + "logps/chosen": -59.86248016357422, + "logps/rejected": -70.28961181640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.182040691375732, + "rewards/margins": 10.678327560424805, + "rewards/rejected": -15.860368728637695, + "step": 1306 + }, + { + "epoch": 7.745185185185186, + "grad_norm": 0.10568108304021323, + "learning_rate": 1.1545853289489927e-09, + "logits/chosen": -1.4103091955184937, + "logits/rejected": -1.400766372680664, + "logps/chosen": -34.94217300415039, + "logps/rejected": -60.6082763671875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.777432441711426, + "rewards/margins": 9.586267471313477, + "rewards/rejected": -12.363698959350586, + "step": 1307 + }, + { + "epoch": 7.751111111111111, + "grad_norm": 0.12159343302298652, + "learning_rate": 1.0930635943420253e-09, + "logits/chosen": -2.34028697013855, + "logits/rejected": -2.267382860183716, + "logps/chosen": -37.190250396728516, + "logps/rejected": -84.93000030517578, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.365945339202881, + "rewards/margins": 14.706781387329102, + "rewards/rejected": -17.07272720336914, + "step": 1308 + }, + { + "epoch": 7.757037037037037, + "grad_norm": 0.11929320977244282, + "learning_rate": 1.0332225353494318e-09, + "logits/chosen": -1.680659294128418, + "logits/rejected": -1.76425039768219, + "logps/chosen": -55.115943908691406, + "logps/rejected": -80.5603256225586, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.972663164138794, + "rewards/margins": 12.271014213562012, + "rewards/rejected": -16.243677139282227, + "step": 1309 + }, + { + "epoch": 7.762962962962963, + "grad_norm": 0.09092136853386971, + "learning_rate": 9.750625560315528e-10, + "logits/chosen": -1.676912546157837, + "logits/rejected": -1.6585233211517334, + "logps/chosen": -52.220970153808594, + "logps/rejected": -79.3109130859375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.994405746459961, + "rewards/margins": 11.580153465270996, + "rewards/rejected": -14.57455825805664, + "step": 1310 + }, + { + "epoch": 7.768888888888889, + "grad_norm": 0.1932632252968447, + "learning_rate": 9.185840490975594e-10, + "logits/chosen": -1.5779668092727661, + "logits/rejected": -1.5990784168243408, + "logps/chosen": -47.03139114379883, + "logps/rejected": -77.99703216552734, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3039872646331787, + "rewards/margins": 13.618040084838867, + "rewards/rejected": -16.922027587890625, + "step": 1311 + }, + { + "epoch": 7.774814814814815, + "grad_norm": 0.11174616662096246, + "learning_rate": 8.637873959031206e-10, + "logits/chosen": -2.237205982208252, + "logits/rejected": -2.167008638381958, + "logps/chosen": -43.71097183227539, + "logps/rejected": -75.61814880371094, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.187469005584717, + "rewards/margins": 12.020849227905273, + "rewards/rejected": -15.208316802978516, + "step": 1312 + }, + { + "epoch": 7.7807407407407405, + "grad_norm": 0.10082024626851338, + "learning_rate": 8.106729664475176e-10, + "logits/chosen": -2.1557466983795166, + "logits/rejected": -2.0947141647338867, + "logps/chosen": -40.8330078125, + "logps/rejected": -75.10816955566406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.995957851409912, + "rewards/margins": 11.368391990661621, + "rewards/rejected": -15.364350318908691, + "step": 1313 + }, + { + "epoch": 7.786666666666667, + "grad_norm": 0.11218104181699615, + "learning_rate": 7.592411193713122e-10, + "logits/chosen": -1.7130632400512695, + "logits/rejected": -1.5563557147979736, + "logps/chosen": -56.98699188232422, + "logps/rejected": -95.31857299804688, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.602969646453857, + "rewards/margins": 11.59247875213623, + "rewards/rejected": -17.195449829101562, + "step": 1314 + }, + { + "epoch": 7.792592592592593, + "grad_norm": 0.09595333522956707, + "learning_rate": 7.094922019539318e-10, + "logits/chosen": -1.564188838005066, + "logits/rejected": -1.606329083442688, + "logps/chosen": -39.70273971557617, + "logps/rejected": -75.67445373535156, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5167977809906006, + "rewards/margins": 14.296806335449219, + "rewards/rejected": -17.813602447509766, + "step": 1315 + }, + { + "epoch": 7.798518518518518, + "grad_norm": 0.1047928513357108, + "learning_rate": 6.61426550111227e-10, + "logits/chosen": -1.9991806745529175, + "logits/rejected": -1.858351230621338, + "logps/chosen": -38.9638671875, + "logps/rejected": -89.70887756347656, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.846712112426758, + "rewards/margins": 14.311556816101074, + "rewards/rejected": -17.158267974853516, + "step": 1316 + }, + { + "epoch": 7.804444444444444, + "grad_norm": 0.12049712411179087, + "learning_rate": 6.150444883933348e-10, + "logits/chosen": -1.5024068355560303, + "logits/rejected": -1.3233418464660645, + "logps/chosen": -47.504905700683594, + "logps/rejected": -93.08120727539062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.704084634780884, + "rewards/margins": 13.947275161743164, + "rewards/rejected": -17.65135955810547, + "step": 1317 + }, + { + "epoch": 7.810370370370371, + "grad_norm": 0.12781150582163409, + "learning_rate": 5.703463299823186e-10, + "logits/chosen": -1.7797167301177979, + "logits/rejected": -1.7678923606872559, + "logps/chosen": -33.082733154296875, + "logps/rejected": -97.64846801757812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2870450019836426, + "rewards/margins": 17.191646575927734, + "rewards/rejected": -18.47869110107422, + "step": 1318 + }, + { + "epoch": 7.816296296296296, + "grad_norm": 0.10997039292160629, + "learning_rate": 5.27332376690226e-10, + "logits/chosen": -2.0640406608581543, + "logits/rejected": -1.9499751329421997, + "logps/chosen": -46.399940490722656, + "logps/rejected": -87.12672424316406, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.000057220458984, + "rewards/margins": 11.56611442565918, + "rewards/rejected": -15.566171646118164, + "step": 1319 + }, + { + "epoch": 7.822222222222222, + "grad_norm": 0.09734750549803473, + "learning_rate": 4.860029189569237e-10, + "logits/chosen": -2.239694595336914, + "logits/rejected": -2.2137787342071533, + "logps/chosen": -58.1463737487793, + "logps/rejected": -77.39590454101562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -5.256124973297119, + "rewards/margins": 11.735958099365234, + "rewards/rejected": -16.992082595825195, + "step": 1320 + }, + { + "epoch": 7.8281481481481485, + "grad_norm": 0.06290331703403879, + "learning_rate": 4.463582358482376e-10, + "logits/chosen": -2.029208183288574, + "logits/rejected": -1.9457252025604248, + "logps/chosen": -48.41543197631836, + "logps/rejected": -94.46034240722656, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4602203369140625, + "rewards/margins": 14.713550567626953, + "rewards/rejected": -18.173768997192383, + "step": 1321 + }, + { + "epoch": 7.834074074074074, + "grad_norm": 0.11405387953819877, + "learning_rate": 4.083985950539548e-10, + "logits/chosen": -1.8435865640640259, + "logits/rejected": -1.8563013076782227, + "logps/chosen": -52.38283920288086, + "logps/rejected": -86.04027557373047, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.0260491371154785, + "rewards/margins": 13.521184921264648, + "rewards/rejected": -17.54723358154297, + "step": 1322 + }, + { + "epoch": 7.84, + "grad_norm": 0.1428495263399808, + "learning_rate": 3.721242528861024e-10, + "logits/chosen": -1.6836150884628296, + "logits/rejected": -1.6538059711456299, + "logps/chosen": -46.02968978881836, + "logps/rejected": -71.76293182373047, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.687353134155273, + "rewards/margins": 11.388587951660156, + "rewards/rejected": -16.075942993164062, + "step": 1323 + }, + { + "epoch": 7.8459259259259255, + "grad_norm": 0.1021413462070928, + "learning_rate": 3.3753545427722687e-10, + "logits/chosen": -2.0874242782592773, + "logits/rejected": -2.1049461364746094, + "logps/chosen": -44.546688079833984, + "logps/rejected": -89.52922058105469, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.7562994956970215, + "rewards/margins": 12.259674072265625, + "rewards/rejected": -16.015974044799805, + "step": 1324 + }, + { + "epoch": 7.851851851851852, + "grad_norm": 0.11440705165713286, + "learning_rate": 3.0463243277864534e-10, + "logits/chosen": -1.6675009727478027, + "logits/rejected": -1.6661535501480103, + "logps/chosen": -41.84225082397461, + "logps/rejected": -66.14202117919922, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4963809251785278, + "rewards/margins": 11.167799949645996, + "rewards/rejected": -12.664179801940918, + "step": 1325 + }, + { + "epoch": 7.857777777777778, + "grad_norm": 0.08839074484858349, + "learning_rate": 2.734154105589748e-10, + "logits/chosen": -1.431457757949829, + "logits/rejected": -1.4145808219909668, + "logps/chosen": -34.363014221191406, + "logps/rejected": -62.02093505859375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3365049362182617, + "rewards/margins": 10.666013717651367, + "rewards/rejected": -13.002519607543945, + "step": 1326 + }, + { + "epoch": 7.863703703703703, + "grad_norm": 0.10791454724929499, + "learning_rate": 2.4388459840257724e-10, + "logits/chosen": -1.8096765279769897, + "logits/rejected": -1.8453236818313599, + "logps/chosen": -40.75598907470703, + "logps/rejected": -75.48899841308594, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.80953311920166, + "rewards/margins": 13.202881813049316, + "rewards/rejected": -17.012414932250977, + "step": 1327 + }, + { + "epoch": 7.86962962962963, + "grad_norm": 0.11257458760022891, + "learning_rate": 2.1604019570811704e-10, + "logits/chosen": -1.8961286544799805, + "logits/rejected": -1.9870917797088623, + "logps/chosen": -49.68988037109375, + "logps/rejected": -73.82746887207031, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7470507621765137, + "rewards/margins": 12.514230728149414, + "rewards/rejected": -15.261281967163086, + "step": 1328 + }, + { + "epoch": 7.875555555555556, + "grad_norm": 0.08900179041357273, + "learning_rate": 1.8988239048725595e-10, + "logits/chosen": -1.9694173336029053, + "logits/rejected": -1.8216187953948975, + "logps/chosen": -44.448753356933594, + "logps/rejected": -81.052490234375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6496639251708984, + "rewards/margins": 12.538106918334961, + "rewards/rejected": -16.18777084350586, + "step": 1329 + }, + { + "epoch": 7.881481481481481, + "grad_norm": 0.09927287718802277, + "learning_rate": 1.6541135936343208e-10, + "logits/chosen": -2.2331199645996094, + "logits/rejected": -1.9718306064605713, + "logps/chosen": -51.08818054199219, + "logps/rejected": -108.66032409667969, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2654128074645996, + "rewards/margins": 15.549224853515625, + "rewards/rejected": -18.81463623046875, + "step": 1330 + }, + { + "epoch": 7.887407407407407, + "grad_norm": 0.13440981318068518, + "learning_rate": 1.426272675704998e-10, + "logits/chosen": -1.9113953113555908, + "logits/rejected": -1.9644521474838257, + "logps/chosen": -47.6470947265625, + "logps/rejected": -83.68902587890625, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3995003700256348, + "rewards/margins": 13.84328556060791, + "rewards/rejected": -17.24278450012207, + "step": 1331 + }, + { + "epoch": 7.8933333333333335, + "grad_norm": 0.11681222659218105, + "learning_rate": 1.2153026895178608e-10, + "logits/chosen": -1.6768038272857666, + "logits/rejected": -1.722691297531128, + "logps/chosen": -52.901611328125, + "logps/rejected": -76.805908203125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.354005813598633, + "rewards/margins": 12.054499626159668, + "rewards/rejected": -15.4085054397583, + "step": 1332 + }, + { + "epoch": 7.899259259259259, + "grad_norm": 0.08841688627616534, + "learning_rate": 1.0212050595895249e-10, + "logits/chosen": -1.667407751083374, + "logits/rejected": -1.7305935621261597, + "logps/chosen": -52.599090576171875, + "logps/rejected": -74.85166931152344, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8365044593811035, + "rewards/margins": 11.444159507751465, + "rewards/rejected": -14.280664443969727, + "step": 1333 + }, + { + "epoch": 7.905185185185185, + "grad_norm": 0.096242284043103, + "learning_rate": 8.439810965113481e-11, + "logits/chosen": -1.8078327178955078, + "logits/rejected": -1.6372575759887695, + "logps/chosen": -40.54448318481445, + "logps/rejected": -73.397705078125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.039003372192383, + "rewards/margins": 11.167409896850586, + "rewards/rejected": -15.206413269042969, + "step": 1334 + }, + { + "epoch": 7.911111111111111, + "grad_norm": 0.10789774171873755, + "learning_rate": 6.836319969388827e-11, + "logits/chosen": -1.5104821920394897, + "logits/rejected": -1.3770867586135864, + "logps/chosen": -46.491546630859375, + "logps/rejected": -78.11680603027344, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.605240821838379, + "rewards/margins": 10.936412811279297, + "rewards/rejected": -14.541654586791992, + "step": 1335 + }, + { + "epoch": 7.917037037037037, + "grad_norm": 0.08868861034370847, + "learning_rate": 5.4015884358549204e-11, + "logits/chosen": -1.4172427654266357, + "logits/rejected": -1.3312875032424927, + "logps/chosen": -46.777320861816406, + "logps/rejected": -77.1638412475586, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3355093002319336, + "rewards/margins": 13.356868743896484, + "rewards/rejected": -16.692378997802734, + "step": 1336 + }, + { + "epoch": 7.922962962962963, + "grad_norm": 0.09206868646615148, + "learning_rate": 4.135626052143015e-11, + "logits/chosen": -1.7921714782714844, + "logits/rejected": -1.6932836771011353, + "logps/chosen": -43.38299560546875, + "logps/rejected": -83.28189086914062, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.84717059135437, + "rewards/margins": 13.680866241455078, + "rewards/rejected": -17.52803611755371, + "step": 1337 + }, + { + "epoch": 7.928888888888888, + "grad_norm": 0.20766800780823566, + "learning_rate": 3.0384413663125944e-11, + "logits/chosen": -1.8923168182373047, + "logits/rejected": -2.0263829231262207, + "logps/chosen": -42.76374435424805, + "logps/rejected": -70.45816040039062, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6587953567504883, + "rewards/margins": 10.849041938781738, + "rewards/rejected": -13.507837295532227, + "step": 1338 + }, + { + "epoch": 7.934814814814815, + "grad_norm": 0.10849471922864087, + "learning_rate": 2.110041786804184e-11, + "logits/chosen": -1.8717421293258667, + "logits/rejected": -1.8520114421844482, + "logps/chosen": -54.066280364990234, + "logps/rejected": -92.87806701660156, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.783078193664551, + "rewards/margins": 13.9727783203125, + "rewards/rejected": -18.755857467651367, + "step": 1339 + }, + { + "epoch": 7.940740740740741, + "grad_norm": 0.11149416835250378, + "learning_rate": 1.350433582381072e-11, + "logits/chosen": -1.572990894317627, + "logits/rejected": -1.0800707340240479, + "logps/chosen": -39.92055892944336, + "logps/rejected": -71.92617797851562, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5409345626831055, + "rewards/margins": 10.899103164672852, + "rewards/rejected": -13.44003677368164, + "step": 1340 + }, + { + "epoch": 7.946666666666666, + "grad_norm": 0.11266971237546647, + "learning_rate": 7.596218820876688e-12, + "logits/chosen": -1.4236558675765991, + "logits/rejected": -1.7003998756408691, + "logps/chosen": -57.53947448730469, + "logps/rejected": -71.41385650634766, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.154088020324707, + "rewards/margins": 10.86117935180664, + "rewards/rejected": -14.015266418457031, + "step": 1341 + }, + { + "epoch": 7.952592592592593, + "grad_norm": 0.10920237224370884, + "learning_rate": 3.376106752134289e-12, + "logits/chosen": -2.0125412940979004, + "logits/rejected": -1.873422622680664, + "logps/chosen": -31.185543060302734, + "logps/rejected": -70.14773559570312, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9257241487503052, + "rewards/margins": 13.334260940551758, + "rewards/rejected": -15.259984970092773, + "step": 1342 + }, + { + "epoch": 7.9585185185185185, + "grad_norm": 0.09535432194912208, + "learning_rate": 8.440281127897186e-13, + "logits/chosen": -2.0392391681671143, + "logits/rejected": -1.8588032722473145, + "logps/chosen": -50.48688507080078, + "logps/rejected": -101.44157409667969, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.4265379905700684, + "rewards/margins": 17.10596466064453, + "rewards/rejected": -20.532503128051758, + "step": 1343 + }, + { + "epoch": 7.964444444444444, + "grad_norm": 0.1169578512158143, + "learning_rate": 0.0, + "logits/chosen": -1.6152704954147339, + "logits/rejected": -1.6888506412506104, + "logps/chosen": -48.29261779785156, + "logps/rejected": -78.50033569335938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6884405612945557, + "rewards/margins": 12.385543823242188, + "rewards/rejected": -16.073984146118164, + "step": 1344 + }, + { + "epoch": 7.964444444444444, + "step": 1344, + "total_flos": 0.0, + "train_loss": 0.11349717956385402, + "train_runtime": 13006.5105, + "train_samples_per_second": 6.64, + "train_steps_per_second": 0.103 + } + ], + "logging_steps": 1, + "max_steps": 1344, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}