diff --git "a/trainer_state.json" "b/trainer_state.json" deleted file mode 100644--- "a/trainer_state.json" +++ /dev/null @@ -1,7557 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 501, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001996007984031936, - "grad_norm": 8.273702760320834, - "learning_rate": 9.803921568627451e-09, - "logits/chosen": -15.345624923706055, - "logits/rejected": -15.43127727508545, - "logps/chosen": -309.0523376464844, - "logps/rejected": -315.7975769042969, - "loss": 0.6931, - "rewards/accuracies": 0.0, - "rewards/chosen": 0.0, - "rewards/margins": 0.0, - "rewards/rejected": 0.0, - "step": 1 - }, - { - "epoch": 0.003992015968063872, - "grad_norm": 9.403401939105745, - "learning_rate": 1.9607843137254902e-08, - "logits/chosen": -15.736159324645996, - "logits/rejected": -15.511228561401367, - "logps/chosen": -276.1156311035156, - "logps/rejected": -319.82891845703125, - "loss": 0.6931, - "rewards/accuracies": 0.0, - "rewards/chosen": 0.0, - "rewards/margins": 0.0, - "rewards/rejected": 0.0, - "step": 2 - }, - { - "epoch": 0.005988023952095809, - "grad_norm": 7.417684586141953, - "learning_rate": 2.941176470588235e-08, - "logits/chosen": -14.069502830505371, - "logits/rejected": -14.952973365783691, - "logps/chosen": -327.70660400390625, - "logps/rejected": -324.401611328125, - "loss": 0.6935, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.0006795646040700376, - "rewards/margins": 0.00012883666204288602, - "rewards/rejected": 0.0005507277674041688, - "step": 3 - }, - { - "epoch": 0.007984031936127744, - "grad_norm": 8.870177799116133, - "learning_rate": 3.9215686274509804e-08, - "logits/chosen": -14.754829406738281, - "logits/rejected": -14.156275749206543, - "logps/chosen": -405.5284423828125, - "logps/rejected": -507.5711669921875, - "loss": 0.6927, - "rewards/accuracies": 0.5, - "rewards/chosen": -0.00030017876997590065, - "rewards/margins": -0.00034380401484668255, - "rewards/rejected": 4.362566687632352e-05, - "step": 4 - }, - { - "epoch": 0.00998003992015968, - "grad_norm": 8.607915464157758, - "learning_rate": 4.901960784313725e-08, - "logits/chosen": -15.998005867004395, - "logits/rejected": -15.419865608215332, - "logps/chosen": -334.4444580078125, - "logps/rejected": -347.6990051269531, - "loss": 0.6925, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.003209929447621107, - "rewards/margins": 0.006283964961767197, - "rewards/rejected": -0.0030740355141460896, - "step": 5 - }, - { - "epoch": 0.011976047904191617, - "grad_norm": 7.828043502488399, - "learning_rate": 5.88235294117647e-08, - "logits/chosen": -15.476319313049316, - "logits/rejected": -15.254171371459961, - "logps/chosen": -315.6751708984375, - "logps/rejected": -321.661376953125, - "loss": 0.6931, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.004277873318642378, - "rewards/margins": 0.0027148674707859755, - "rewards/rejected": 0.0015630058478564024, - "step": 6 - }, - { - "epoch": 0.013972055888223553, - "grad_norm": 8.411076253189734, - "learning_rate": 6.862745098039216e-08, - "logits/chosen": -15.430569648742676, - "logits/rejected": -15.730286598205566, - "logps/chosen": -329.1186218261719, - "logps/rejected": -333.16375732421875, - "loss": 0.693, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.0016522119985893369, - "rewards/margins": 0.0027996539138257504, - "rewards/rejected": -0.004451866261661053, - 
"step": 7 - }, - { - "epoch": 0.015968063872255488, - "grad_norm": 8.071033025816464, - "learning_rate": 7.843137254901961e-08, - "logits/chosen": -14.63811206817627, - "logits/rejected": -15.146449089050293, - "logps/chosen": -418.5869445800781, - "logps/rejected": -409.2070007324219, - "loss": 0.6926, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.0014343691291287541, - "rewards/margins": 0.0001833915594033897, - "rewards/rejected": -0.001617760630324483, - "step": 8 - }, - { - "epoch": 0.017964071856287425, - "grad_norm": 8.296615844679023, - "learning_rate": 8.823529411764706e-08, - "logits/chosen": -16.07230567932129, - "logits/rejected": -15.337064743041992, - "logps/chosen": -500.545166015625, - "logps/rejected": -516.0908813476562, - "loss": 0.693, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.000825481372885406, - "rewards/margins": -0.00035296427085995674, - "rewards/rejected": 0.0011784456437453628, - "step": 9 - }, - { - "epoch": 0.01996007984031936, - "grad_norm": 7.947789076143285, - "learning_rate": 9.80392156862745e-08, - "logits/chosen": -15.586200714111328, - "logits/rejected": -15.603667259216309, - "logps/chosen": -385.4709777832031, - "logps/rejected": -413.04498291015625, - "loss": 0.6927, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.0015043115708976984, - "rewards/margins": -0.0028634597547352314, - "rewards/rejected": 0.001359148183837533, - "step": 10 - }, - { - "epoch": 0.021956087824351298, - "grad_norm": 8.347411190779997, - "learning_rate": 1.0784313725490195e-07, - "logits/chosen": -14.763092994689941, - "logits/rejected": -14.29030990600586, - "logps/chosen": -450.0237731933594, - "logps/rejected": -507.61370849609375, - "loss": 0.6935, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.0023990964982658625, - "rewards/margins": -0.003815202508121729, - "rewards/rejected": 0.00141610624268651, - "step": 11 - }, - { - "epoch": 0.023952095808383235, - "grad_norm": 9.064316975676492, - "learning_rate": 1.176470588235294e-07, - "logits/chosen": -15.072341918945312, - "logits/rejected": -14.772862434387207, - "logps/chosen": -335.04266357421875, - "logps/rejected": -371.4832763671875, - "loss": 0.6931, - "rewards/accuracies": 0.25, - "rewards/chosen": -0.0030230379197746515, - "rewards/margins": -0.005290627479553223, - "rewards/rejected": 0.0022675893269479275, - "step": 12 - }, - { - "epoch": 0.02594810379241517, - "grad_norm": 7.725863389166651, - "learning_rate": 1.2745098039215685e-07, - "logits/chosen": -16.316631317138672, - "logits/rejected": -16.06757164001465, - "logps/chosen": -419.7452087402344, - "logps/rejected": -390.98773193359375, - "loss": 0.6936, - "rewards/accuracies": 0.3125, - "rewards/chosen": -0.001233673538081348, - "rewards/margins": -0.0001909491838887334, - "rewards/rejected": -0.0010427236557006836, - "step": 13 - }, - { - "epoch": 0.027944111776447105, - "grad_norm": 8.42732610297517, - "learning_rate": 1.3725490196078432e-07, - "logits/chosen": -15.35225772857666, - "logits/rejected": -15.612798690795898, - "logps/chosen": -328.98046875, - "logps/rejected": -415.5600280761719, - "loss": 0.6934, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.001978826941922307, - "rewards/margins": 0.00040232675382867455, - "rewards/rejected": 0.001576499780640006, - "step": 14 - }, - { - "epoch": 0.029940119760479042, - "grad_norm": 7.731931977749619, - "learning_rate": 1.4705882352941175e-07, - "logits/chosen": -16.70879554748535, - "logits/rejected": -16.097858428955078, - "logps/chosen": -316.15423583984375, - 
"logps/rejected": -323.2774353027344, - "loss": 0.6928, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.003133001271635294, - "rewards/margins": -0.0018952846294268966, - "rewards/rejected": -0.0012377167586237192, - "step": 15 - }, - { - "epoch": 0.031936127744510975, - "grad_norm": 7.9669032254132315, - "learning_rate": 1.5686274509803921e-07, - "logits/chosen": -13.231587409973145, - "logits/rejected": -13.3031005859375, - "logps/chosen": -337.7759704589844, - "logps/rejected": -322.94390869140625, - "loss": 0.6933, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0010202788980677724, - "rewards/margins": 0.0009024335886351764, - "rewards/rejected": 0.00011784554226323962, - "step": 16 - }, - { - "epoch": 0.033932135728542916, - "grad_norm": 7.6237992365862075, - "learning_rate": 1.6666666666666665e-07, - "logits/chosen": -14.528105735778809, - "logits/rejected": -14.738598823547363, - "logps/chosen": -241.29954528808594, - "logps/rejected": -247.58921813964844, - "loss": 0.6919, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0020066355355083942, - "rewards/margins": 0.0035532282199710608, - "rewards/rejected": -0.0015465925680473447, - "step": 17 - }, - { - "epoch": 0.03592814371257485, - "grad_norm": 11.366099973012018, - "learning_rate": 1.764705882352941e-07, - "logits/chosen": -15.85158920288086, - "logits/rejected": -15.455657958984375, - "logps/chosen": -346.03350830078125, - "logps/rejected": -324.2177734375, - "loss": 0.6933, - "rewards/accuracies": 0.375, - "rewards/chosen": -0.0006118249148130417, - "rewards/margins": -0.0007706499891355634, - "rewards/rejected": 0.00015882489969953895, - "step": 18 - }, - { - "epoch": 0.03792415169660679, - "grad_norm": 7.942061772626445, - "learning_rate": 1.8627450980392158e-07, - "logits/chosen": -14.716241836547852, - "logits/rejected": -14.732061386108398, - "logps/chosen": -283.9400939941406, - "logps/rejected": -269.4703674316406, - "loss": 0.6934, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.00019351489027030766, - "rewards/margins": -0.0023846100084483624, - "rewards/rejected": 0.002191095147281885, - "step": 19 - }, - { - "epoch": 0.03992015968063872, - "grad_norm": 8.146329480525193, - "learning_rate": 1.96078431372549e-07, - "logits/chosen": -13.448728561401367, - "logits/rejected": -13.313655853271484, - "logps/chosen": -375.7843933105469, - "logps/rejected": -317.9215393066406, - "loss": 0.6928, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.0009911824017763138, - "rewards/margins": -0.0024417974054813385, - "rewards/rejected": 0.001450614770874381, - "step": 20 - }, - { - "epoch": 0.041916167664670656, - "grad_norm": 8.55196124746064, - "learning_rate": 2.0588235294117645e-07, - "logits/chosen": -15.470209121704102, - "logits/rejected": -15.217321395874023, - "logps/chosen": -299.0631103515625, - "logps/rejected": -327.6717529296875, - "loss": 0.6924, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.0023876859340816736, - "rewards/margins": -0.0039983270689845085, - "rewards/rejected": 0.006386012304574251, - "step": 21 - }, - { - "epoch": 0.043912175648702596, - "grad_norm": 8.466171545750429, - "learning_rate": 2.156862745098039e-07, - "logits/chosen": -16.77044677734375, - "logits/rejected": -16.241891860961914, - "logps/chosen": -277.0028076171875, - "logps/rejected": -275.6781005859375, - "loss": 0.6915, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.006396574899554253, - "rewards/margins": 0.0012766977306455374, - "rewards/rejected": 0.005119876936078072, - "step": 22 - }, - { - 
"epoch": 0.04590818363273453, - "grad_norm": 8.402876138940963, - "learning_rate": 2.2549019607843137e-07, - "logits/chosen": -15.568875312805176, - "logits/rejected": -15.177936553955078, - "logps/chosen": -430.4210510253906, - "logps/rejected": -396.56219482421875, - "loss": 0.6921, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0003438423154875636, - "rewards/margins": 0.0015764283016324043, - "rewards/rejected": -0.001232585753314197, - "step": 23 - }, - { - "epoch": 0.04790419161676647, - "grad_norm": 12.52829981591039, - "learning_rate": 2.352941176470588e-07, - "logits/chosen": -14.164735794067383, - "logits/rejected": -14.398335456848145, - "logps/chosen": -396.947265625, - "logps/rejected": -385.9594421386719, - "loss": 0.6912, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.0060534426011145115, - "rewards/margins": 0.0018496987177059054, - "rewards/rejected": 0.004203743766993284, - "step": 24 - }, - { - "epoch": 0.0499001996007984, - "grad_norm": 8.318983318997933, - "learning_rate": 2.4509803921568627e-07, - "logits/chosen": -15.94021987915039, - "logits/rejected": -15.565300941467285, - "logps/chosen": -312.4967346191406, - "logps/rejected": -291.0953369140625, - "loss": 0.6917, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.008275885134935379, - "rewards/margins": 0.006061806343495846, - "rewards/rejected": 0.0022140787914395332, - "step": 25 - }, - { - "epoch": 0.05189620758483034, - "grad_norm": 8.103398791916263, - "learning_rate": 2.549019607843137e-07, - "logits/chosen": -14.540434837341309, - "logits/rejected": -14.473610877990723, - "logps/chosen": -353.2845153808594, - "logps/rejected": -374.34490966796875, - "loss": 0.6915, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.006215238478034735, - "rewards/margins": -4.699244163930416e-05, - "rewards/rejected": 0.006262229755520821, - "step": 26 - }, - { - "epoch": 0.05389221556886228, - "grad_norm": 7.722965287876503, - "learning_rate": 2.6470588235294114e-07, - "logits/chosen": -16.8541202545166, - "logits/rejected": -16.772695541381836, - "logps/chosen": -333.2151184082031, - "logps/rejected": -381.6868591308594, - "loss": 0.6918, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.011300897225737572, - "rewards/margins": 0.00862040463835001, - "rewards/rejected": 0.002680492587387562, - "step": 27 - }, - { - "epoch": 0.05588822355289421, - "grad_norm": 8.21241123134308, - "learning_rate": 2.7450980392156863e-07, - "logits/chosen": -14.656830787658691, - "logits/rejected": -15.223196983337402, - "logps/chosen": -384.7855529785156, - "logps/rejected": -390.5248718261719, - "loss": 0.6912, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.005614032968878746, - "rewards/margins": 0.0015372277703136206, - "rewards/rejected": 0.004076804965734482, - "step": 28 - }, - { - "epoch": 0.05788423153692615, - "grad_norm": 8.15008749253195, - "learning_rate": 2.8431372549019607e-07, - "logits/chosen": -15.4053955078125, - "logits/rejected": -15.084259986877441, - "logps/chosen": -397.54937744140625, - "logps/rejected": -373.31109619140625, - "loss": 0.6908, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.009957370348274708, - "rewards/margins": 0.0023600957356393337, - "rewards/rejected": 0.007597275078296661, - "step": 29 - }, - { - "epoch": 0.059880239520958084, - "grad_norm": 8.125634119871668, - "learning_rate": 2.941176470588235e-07, - "logits/chosen": -14.482078552246094, - "logits/rejected": -14.186015129089355, - "logps/chosen": -271.9766845703125, - "logps/rejected": -284.2981262207031, - 
"loss": 0.6909, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.011308508925139904, - "rewards/margins": 0.004111303482204676, - "rewards/rejected": 0.007197204511612654, - "step": 30 - }, - { - "epoch": 0.06187624750499002, - "grad_norm": 7.748089103692524, - "learning_rate": 3.0392156862745094e-07, - "logits/chosen": -15.912099838256836, - "logits/rejected": -15.93221664428711, - "logps/chosen": -304.4377136230469, - "logps/rejected": -315.7114562988281, - "loss": 0.6899, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.02117222733795643, - "rewards/margins": 0.006583967246115208, - "rewards/rejected": 0.01458826009184122, - "step": 31 - }, - { - "epoch": 0.06387225548902195, - "grad_norm": 8.060112893776262, - "learning_rate": 3.1372549019607843e-07, - "logits/chosen": -15.711540222167969, - "logits/rejected": -15.569220542907715, - "logps/chosen": -403.3907470703125, - "logps/rejected": -386.66754150390625, - "loss": 0.6901, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.0031930492259562016, - "rewards/margins": -0.0004316616104915738, - "rewards/rejected": 0.0036247102543711662, - "step": 32 - }, - { - "epoch": 0.0658682634730539, - "grad_norm": 8.131844895571634, - "learning_rate": 3.2352941176470586e-07, - "logits/chosen": -14.937063217163086, - "logits/rejected": -15.226661682128906, - "logps/chosen": -452.5379638671875, - "logps/rejected": -454.41009521484375, - "loss": 0.6883, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.016051730141043663, - "rewards/margins": 0.009525422938168049, - "rewards/rejected": 0.006526308599859476, - "step": 33 - }, - { - "epoch": 0.06786427145708583, - "grad_norm": 8.419849559119973, - "learning_rate": 3.333333333333333e-07, - "logits/chosen": -15.381107330322266, - "logits/rejected": -15.284709930419922, - "logps/chosen": -401.2351989746094, - "logps/rejected": -390.9241027832031, - "loss": 0.6886, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.024615010246634483, - "rewards/margins": 0.018835801631212234, - "rewards/rejected": 0.005779208615422249, - "step": 34 - }, - { - "epoch": 0.06986027944111776, - "grad_norm": 8.437180869369922, - "learning_rate": 3.431372549019608e-07, - "logits/chosen": -15.466768264770508, - "logits/rejected": -15.167000770568848, - "logps/chosen": -352.6540832519531, - "logps/rejected": -326.68682861328125, - "loss": 0.6875, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.02680317685008049, - "rewards/margins": 0.01718745194375515, - "rewards/rejected": 0.00961572676897049, - "step": 35 - }, - { - "epoch": 0.0718562874251497, - "grad_norm": 7.982258318061745, - "learning_rate": 3.529411764705882e-07, - "logits/chosen": -14.632706642150879, - "logits/rejected": -14.941289901733398, - "logps/chosen": -332.2666015625, - "logps/rejected": -354.9202575683594, - "loss": 0.6871, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.03227221965789795, - "rewards/margins": 0.024501098319888115, - "rewards/rejected": 0.007771119941025972, - "step": 36 - }, - { - "epoch": 0.07385229540918163, - "grad_norm": 8.683716423285436, - "learning_rate": 3.6274509803921566e-07, - "logits/chosen": -16.10747528076172, - "logits/rejected": -15.692405700683594, - "logps/chosen": -350.07916259765625, - "logps/rejected": -328.48040771484375, - "loss": 0.6879, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.026277001947164536, - "rewards/margins": 0.0028891037218272686, - "rewards/rejected": 0.023387901484966278, - "step": 37 - }, - { - "epoch": 0.07584830339321358, - "grad_norm": 8.263903944711704, - 
"learning_rate": 3.7254901960784315e-07, - "logits/chosen": -16.594873428344727, - "logits/rejected": -16.079730987548828, - "logps/chosen": -308.6328125, - "logps/rejected": -305.5195617675781, - "loss": 0.6857, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.03257057070732117, - "rewards/margins": 0.01241181418299675, - "rewards/rejected": 0.020158756524324417, - "step": 38 - }, - { - "epoch": 0.07784431137724551, - "grad_norm": 8.511057638069643, - "learning_rate": 3.8235294117647053e-07, - "logits/chosen": -14.873018264770508, - "logits/rejected": -14.647686004638672, - "logps/chosen": -309.0533447265625, - "logps/rejected": -321.43011474609375, - "loss": 0.6828, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.036281879991292953, - "rewards/margins": 0.025011887773871422, - "rewards/rejected": 0.011269993148744106, - "step": 39 - }, - { - "epoch": 0.07984031936127745, - "grad_norm": 8.338517669067725, - "learning_rate": 3.92156862745098e-07, - "logits/chosen": -15.299338340759277, - "logits/rejected": -15.163370132446289, - "logps/chosen": -302.93212890625, - "logps/rejected": -345.40545654296875, - "loss": 0.685, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.044032178819179535, - "rewards/margins": 0.024382634088397026, - "rewards/rejected": 0.01964954286813736, - "step": 40 - }, - { - "epoch": 0.08183632734530938, - "grad_norm": 8.07172349335598, - "learning_rate": 4.019607843137255e-07, - "logits/chosen": -15.369956016540527, - "logits/rejected": -15.433903694152832, - "logps/chosen": -409.83984375, - "logps/rejected": -412.54180908203125, - "loss": 0.6853, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.035033755004405975, - "rewards/margins": 0.01923045516014099, - "rewards/rejected": 0.015803297981619835, - "step": 41 - }, - { - "epoch": 0.08383233532934131, - "grad_norm": 7.920156576030204, - "learning_rate": 4.117647058823529e-07, - "logits/chosen": -14.02873420715332, - "logits/rejected": -13.986605644226074, - "logps/chosen": -354.6033935546875, - "logps/rejected": -359.84014892578125, - "loss": 0.6864, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.034912168979644775, - "rewards/margins": 0.009367440827190876, - "rewards/rejected": 0.025544727221131325, - "step": 42 - }, - { - "epoch": 0.08582834331337326, - "grad_norm": 8.232720229409079, - "learning_rate": 4.215686274509804e-07, - "logits/chosen": -15.009214401245117, - "logits/rejected": -15.307815551757812, - "logps/chosen": -380.186767578125, - "logps/rejected": -415.4902648925781, - "loss": 0.6858, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.018880976364016533, - "rewards/margins": 0.010581063106656075, - "rewards/rejected": 0.008299913257360458, - "step": 43 - }, - { - "epoch": 0.08782435129740519, - "grad_norm": 8.533444238200468, - "learning_rate": 4.313725490196078e-07, - "logits/chosen": -14.22335433959961, - "logits/rejected": -14.953871726989746, - "logps/chosen": -347.45574951171875, - "logps/rejected": -389.9525451660156, - "loss": 0.6835, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.01024315319955349, - "rewards/margins": -0.00465776864439249, - "rewards/rejected": 0.014900922775268555, - "step": 44 - }, - { - "epoch": 0.08982035928143713, - "grad_norm": 8.038118181406828, - "learning_rate": 4.4117647058823526e-07, - "logits/chosen": -14.101947784423828, - "logits/rejected": -13.89334774017334, - "logps/chosen": -317.0509033203125, - "logps/rejected": -323.69183349609375, - "loss": 0.6817, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.026917992159724236, - 
"rewards/margins": 0.008457997813820839, - "rewards/rejected": 0.018459992483258247, - "step": 45 - }, - { - "epoch": 0.09181636726546906, - "grad_norm": 8.149720896485583, - "learning_rate": 4.5098039215686274e-07, - "logits/chosen": -14.710037231445312, - "logits/rejected": -14.176923751831055, - "logps/chosen": -445.82379150390625, - "logps/rejected": -488.3534851074219, - "loss": 0.6809, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.023077696561813354, - "rewards/margins": 0.04250966012477875, - "rewards/rejected": -0.019431961700320244, - "step": 46 - }, - { - "epoch": 0.09381237524950099, - "grad_norm": 7.992010994151138, - "learning_rate": 4.6078431372549013e-07, - "logits/chosen": -15.482625961303711, - "logits/rejected": -14.013190269470215, - "logps/chosen": -367.063232421875, - "logps/rejected": -342.14471435546875, - "loss": 0.6774, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.0394100584089756, - "rewards/margins": 0.02275776118040085, - "rewards/rejected": 0.016652297228574753, - "step": 47 - }, - { - "epoch": 0.09580838323353294, - "grad_norm": 7.756878381863422, - "learning_rate": 4.705882352941176e-07, - "logits/chosen": -15.568811416625977, - "logits/rejected": -15.423860549926758, - "logps/chosen": -433.0845031738281, - "logps/rejected": -439.37396240234375, - "loss": 0.6776, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0388474203646183, - "rewards/margins": 0.03113992139697075, - "rewards/rejected": 0.007707500830292702, - "step": 48 - }, - { - "epoch": 0.09780439121756487, - "grad_norm": 8.493689270125792, - "learning_rate": 4.803921568627451e-07, - "logits/chosen": -14.434328079223633, - "logits/rejected": -14.237007141113281, - "logps/chosen": -375.27874755859375, - "logps/rejected": -351.34783935546875, - "loss": 0.6807, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.031233904883265495, - "rewards/margins": 0.011667889542877674, - "rewards/rejected": 0.019566014409065247, - "step": 49 - }, - { - "epoch": 0.0998003992015968, - "grad_norm": 7.9896349499421735, - "learning_rate": 4.901960784313725e-07, - "logits/chosen": -15.668423652648926, - "logits/rejected": -15.05298900604248, - "logps/chosen": -307.7395935058594, - "logps/rejected": -320.8984375, - "loss": 0.6748, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.06957457959651947, - "rewards/margins": 0.025473352521657944, - "rewards/rejected": 0.044101230800151825, - "step": 50 - }, - { - "epoch": 0.10179640718562874, - "grad_norm": 8.021742888341015, - "learning_rate": 5e-07, - "logits/chosen": -14.430184364318848, - "logits/rejected": -14.525110244750977, - "logps/chosen": -258.0986022949219, - "logps/rejected": -283.91607666015625, - "loss": 0.6743, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.04039158299565315, - "rewards/margins": 0.033271849155426025, - "rewards/rejected": 0.007119735702872276, - "step": 51 - }, - { - "epoch": 0.10379241516966067, - "grad_norm": 8.431429641964243, - "learning_rate": 4.999939076763486e-07, - "logits/chosen": -15.636438369750977, - "logits/rejected": -15.311294555664062, - "logps/chosen": -344.984619140625, - "logps/rejected": -323.0438232421875, - "loss": 0.6688, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.09328603744506836, - "rewards/margins": 0.04766825586557388, - "rewards/rejected": 0.04561777785420418, - "step": 52 - }, - { - "epoch": 0.10578842315369262, - "grad_norm": 8.486118751253972, - "learning_rate": 4.99975631002326e-07, - "logits/chosen": -15.692873001098633, - "logits/rejected": -15.621683120727539, - 
"logps/chosen": -294.3231201171875, - "logps/rejected": -324.92559814453125, - "loss": 0.6684, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.08665186166763306, - "rewards/margins": 0.0698593482375145, - "rewards/rejected": 0.016792509704828262, - "step": 53 - }, - { - "epoch": 0.10778443113772455, - "grad_norm": 8.584995513046005, - "learning_rate": 4.999451708687113e-07, - "logits/chosen": -13.536327362060547, - "logits/rejected": -14.043939590454102, - "logps/chosen": -316.51129150390625, - "logps/rejected": -340.39251708984375, - "loss": 0.6677, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.04377901926636696, - "rewards/margins": 0.07115273177623749, - "rewards/rejected": -0.027373716235160828, - "step": 54 - }, - { - "epoch": 0.10978043912175649, - "grad_norm": 8.410801379788628, - "learning_rate": 4.999025287600885e-07, - "logits/chosen": -15.176254272460938, - "logits/rejected": -14.865735054016113, - "logps/chosen": -347.4294128417969, - "logps/rejected": -362.15496826171875, - "loss": 0.6672, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.0652116909623146, - "rewards/margins": 0.05398234352469444, - "rewards/rejected": 0.01122935302555561, - "step": 55 - }, - { - "epoch": 0.11177644710578842, - "grad_norm": 9.191123619800617, - "learning_rate": 4.998477067547739e-07, - "logits/chosen": -14.366271018981934, - "logits/rejected": -13.491727828979492, - "logps/chosen": -294.74420166015625, - "logps/rejected": -313.5377502441406, - "loss": 0.6695, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.08104420453310013, - "rewards/margins": 0.06518866121768951, - "rewards/rejected": 0.015855543315410614, - "step": 56 - }, - { - "epoch": 0.11377245508982035, - "grad_norm": 8.36989537276007, - "learning_rate": 4.997807075247145e-07, - "logits/chosen": -15.414962768554688, - "logits/rejected": -15.15987777709961, - "logps/chosen": -329.4682312011719, - "logps/rejected": -355.2568054199219, - "loss": 0.6699, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.05671892687678337, - "rewards/margins": 0.05990144982933998, - "rewards/rejected": -0.0031825248152017593, - "step": 57 - }, - { - "epoch": 0.1157684630738523, - "grad_norm": 7.963033814703697, - "learning_rate": 4.997015343353585e-07, - "logits/chosen": -15.05243968963623, - "logits/rejected": -15.26134967803955, - "logps/chosen": -407.4922790527344, - "logps/rejected": -400.91033935546875, - "loss": 0.6667, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.044123757630586624, - "rewards/margins": 0.07534614205360413, - "rewards/rejected": -0.03122239001095295, - "step": 58 - }, - { - "epoch": 0.11776447105788423, - "grad_norm": 7.537729687084787, - "learning_rate": 4.996101910454953e-07, - "logits/chosen": -14.969869613647461, - "logits/rejected": -14.01164436340332, - "logps/chosen": -338.7287292480469, - "logps/rejected": -342.40234375, - "loss": 0.6676, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.02124447375535965, - "rewards/margins": 0.0819094106554985, - "rewards/rejected": -0.060664933174848557, - "step": 59 - }, - { - "epoch": 0.11976047904191617, - "grad_norm": 9.279313627982006, - "learning_rate": 4.995066821070679e-07, - "logits/chosen": -13.303523063659668, - "logits/rejected": -14.182500839233398, - "logps/chosen": -362.7261047363281, - "logps/rejected": -333.9490051269531, - "loss": 0.6709, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.003293365240097046, - "rewards/margins": 0.09314459562301636, - "rewards/rejected": -0.08985123038291931, - "step": 60 - }, - { - "epoch": 
0.1217564870259481, - "grad_norm": 8.252068285364153, - "learning_rate": 4.99391012564956e-07, - "logits/chosen": -17.23371124267578, - "logits/rejected": -16.242280960083008, - "logps/chosen": -367.806396484375, - "logps/rejected": -337.6153259277344, - "loss": 0.6547, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.03489462658762932, - "rewards/margins": 0.1051090657711029, - "rewards/rejected": -0.07021445780992508, - "step": 61 - }, - { - "epoch": 0.12375249500998003, - "grad_norm": 8.15636533439587, - "learning_rate": 4.9926318805673e-07, - "logits/chosen": -15.824554443359375, - "logits/rejected": -15.663161277770996, - "logps/chosen": -282.3473815917969, - "logps/rejected": -311.18994140625, - "loss": 0.6597, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.09248015284538269, - "rewards/margins": 0.09078236669301987, - "rewards/rejected": 0.0016977828927338123, - "step": 62 - }, - { - "epoch": 0.12574850299401197, - "grad_norm": 8.910924596109384, - "learning_rate": 4.991232148123761e-07, - "logits/chosen": -16.677001953125, - "logits/rejected": -16.39942741394043, - "logps/chosen": -460.5334167480469, - "logps/rejected": -422.064453125, - "loss": 0.6681, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.06818778812885284, - "rewards/margins": 0.01459517702460289, - "rewards/rejected": -0.08278295397758484, - "step": 63 - }, - { - "epoch": 0.1277445109780439, - "grad_norm": 8.580401141003072, - "learning_rate": 4.989710996539925e-07, - "logits/chosen": -15.317991256713867, - "logits/rejected": -15.26541519165039, - "logps/chosen": -424.9542541503906, - "logps/rejected": -399.70989990234375, - "loss": 0.6564, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.08943880349397659, - "rewards/margins": 0.0476187989115715, - "rewards/rejected": -0.1370576024055481, - "step": 64 - }, - { - "epoch": 0.12974051896207583, - "grad_norm": 8.429082566678625, - "learning_rate": 4.988068499954577e-07, - "logits/chosen": -16.077041625976562, - "logits/rejected": -15.868209838867188, - "logps/chosen": -316.7906494140625, - "logps/rejected": -335.0377197265625, - "loss": 0.6581, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.016306515783071518, - "rewards/margins": 0.07164555788040161, - "rewards/rejected": -0.08795207738876343, - "step": 65 - }, - { - "epoch": 0.1317365269461078, - "grad_norm": 8.133889266001447, - "learning_rate": 4.986304738420683e-07, - "logits/chosen": -15.085855484008789, - "logits/rejected": -14.784677505493164, - "logps/chosen": -300.3466796875, - "logps/rejected": -314.3912353515625, - "loss": 0.6578, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.04595138877630234, - "rewards/margins": 0.06701233983039856, - "rewards/rejected": -0.021060939878225327, - "step": 66 - }, - { - "epoch": 0.13373253493013973, - "grad_norm": 8.721591720129195, - "learning_rate": 4.984419797901491e-07, - "logits/chosen": -15.171606063842773, - "logits/rejected": -15.278976440429688, - "logps/chosen": -465.72027587890625, - "logps/rejected": -483.4525146484375, - "loss": 0.6438, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.07418551295995712, - "rewards/margins": 0.11256247758865356, - "rewards/rejected": -0.18674799799919128, - "step": 67 - }, - { - "epoch": 0.13572854291417166, - "grad_norm": 8.689476143706786, - "learning_rate": 4.982413770266342e-07, - "logits/chosen": -16.235498428344727, - "logits/rejected": -15.38123893737793, - "logps/chosen": -363.73419189453125, - "logps/rejected": -350.5956726074219, - "loss": 0.6554, - "rewards/accuracies": 0.6875, - 
"rewards/chosen": -0.09112317860126495, - "rewards/margins": 0.10634519904851913, - "rewards/rejected": -0.19746838510036469, - "step": 68 - }, - { - "epoch": 0.1377245508982036, - "grad_norm": 9.320906157232027, - "learning_rate": 4.980286753286194e-07, - "logits/chosen": -14.806767463684082, - "logits/rejected": -15.064220428466797, - "logps/chosen": -229.83612060546875, - "logps/rejected": -257.00848388671875, - "loss": 0.649, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.08728949725627899, - "rewards/margins": 0.05588501691818237, - "rewards/rejected": -0.14317449927330017, - "step": 69 - }, - { - "epoch": 0.13972055888223553, - "grad_norm": 8.45599582084097, - "learning_rate": 4.978038850628853e-07, - "logits/chosen": -15.640983581542969, - "logits/rejected": -15.810898780822754, - "logps/chosen": -403.06884765625, - "logps/rejected": -411.408203125, - "loss": 0.6461, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.07357379794120789, - "rewards/margins": 0.10018520057201385, - "rewards/rejected": -0.17375899851322174, - "step": 70 - }, - { - "epoch": 0.14171656686626746, - "grad_norm": 8.397438393746963, - "learning_rate": 4.975670171853925e-07, - "logits/chosen": -15.794336318969727, - "logits/rejected": -15.847979545593262, - "logps/chosen": -379.279296875, - "logps/rejected": -363.3298645019531, - "loss": 0.6501, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.12070687115192413, - "rewards/margins": 0.0922447144985199, - "rewards/rejected": -0.21295160055160522, - "step": 71 - }, - { - "epoch": 0.1437125748502994, - "grad_norm": 8.604521034039575, - "learning_rate": 4.973180832407471e-07, - "logits/chosen": -14.604567527770996, - "logits/rejected": -14.695852279663086, - "logps/chosen": -345.739990234375, - "logps/rejected": -436.0314636230469, - "loss": 0.6454, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.07075143605470657, - "rewards/margins": 0.16379314661026, - "rewards/rejected": -0.23454459011554718, - "step": 72 - }, - { - "epoch": 0.14570858283433133, - "grad_norm": 8.431953705553173, - "learning_rate": 4.970570953616382e-07, - "logits/chosen": -14.503868103027344, - "logits/rejected": -15.155458450317383, - "logps/chosen": -326.4110107421875, - "logps/rejected": -384.87322998046875, - "loss": 0.648, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.15915606915950775, - "rewards/margins": 0.1893479824066162, - "rewards/rejected": -0.34850406646728516, - "step": 73 - }, - { - "epoch": 0.14770459081836326, - "grad_norm": 8.881932037242414, - "learning_rate": 4.96784066268247e-07, - "logits/chosen": -13.779004096984863, - "logits/rejected": -13.429590225219727, - "logps/chosen": -291.79296875, - "logps/rejected": -296.4447937011719, - "loss": 0.643, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.17777323722839355, - "rewards/margins": 0.02108706906437874, - "rewards/rejected": -0.1988603174686432, - "step": 74 - }, - { - "epoch": 0.1497005988023952, - "grad_norm": 8.314122527140741, - "learning_rate": 4.964990092676262e-07, - "logits/chosen": -17.725143432617188, - "logits/rejected": -17.505762100219727, - "logps/chosen": -341.1331787109375, - "logps/rejected": -350.88116455078125, - "loss": 0.6369, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.1543130725622177, - "rewards/margins": 0.12310568988323212, - "rewards/rejected": -0.27741876244544983, - "step": 75 - }, - { - "epoch": 0.15169660678642716, - "grad_norm": 8.48077117653257, - "learning_rate": 4.96201938253052e-07, - "logits/chosen": -16.73549461364746, - 
"logits/rejected": -16.362672805786133, - "logps/chosen": -395.6476135253906, - "logps/rejected": -469.1103515625, - "loss": 0.6252, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.19351297616958618, - "rewards/margins": 0.18316538631916046, - "rewards/rejected": -0.37667837738990784, - "step": 76 - }, - { - "epoch": 0.1536926147704591, - "grad_norm": 8.393604155470431, - "learning_rate": 4.958928677033465e-07, - "logits/chosen": -15.707889556884766, - "logits/rejected": -15.537942886352539, - "logps/chosen": -281.0392761230469, - "logps/rejected": -319.1485900878906, - "loss": 0.629, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.06948637962341309, - "rewards/margins": 0.21688398718833923, - "rewards/rejected": -0.28637033700942993, - "step": 77 - }, - { - "epoch": 0.15568862275449102, - "grad_norm": 9.067089889668104, - "learning_rate": 4.955718126821722e-07, - "logits/chosen": -16.561952590942383, - "logits/rejected": -15.844078063964844, - "logps/chosen": -364.40185546875, - "logps/rejected": -347.0330505371094, - "loss": 0.628, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.16794858872890472, - "rewards/margins": 0.05846191197633743, - "rewards/rejected": -0.22641049325466156, - "step": 78 - }, - { - "epoch": 0.15768463073852296, - "grad_norm": 8.940066268140832, - "learning_rate": 4.952387888372978e-07, - "logits/chosen": -15.177964210510254, - "logits/rejected": -15.126307487487793, - "logps/chosen": -411.2934265136719, - "logps/rejected": -388.7569274902344, - "loss": 0.6447, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2629951238632202, - "rewards/margins": 0.11192844063043594, - "rewards/rejected": -0.37492355704307556, - "step": 79 - }, - { - "epoch": 0.1596806387225549, - "grad_norm": 8.869063942219517, - "learning_rate": 4.94893812399836e-07, - "logits/chosen": -15.601551055908203, - "logits/rejected": -16.042139053344727, - "logps/chosen": -344.1158752441406, - "logps/rejected": -439.0846862792969, - "loss": 0.6223, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.15164640545845032, - "rewards/margins": 0.31529542803764343, - "rewards/rejected": -0.46694183349609375, - "step": 80 - }, - { - "epoch": 0.16167664670658682, - "grad_norm": 9.779541683117705, - "learning_rate": 4.945369001834514e-07, - "logits/chosen": -16.39380645751953, - "logits/rejected": -15.366002082824707, - "logps/chosen": -427.91278076171875, - "logps/rejected": -421.6066589355469, - "loss": 0.6294, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.22397363185882568, - "rewards/margins": 0.07985258847475052, - "rewards/rejected": -0.3038262128829956, - "step": 81 - }, - { - "epoch": 0.16367265469061876, - "grad_norm": 9.681655379739604, - "learning_rate": 4.941680695835419e-07, - "logits/chosen": -16.988998413085938, - "logits/rejected": -16.312742233276367, - "logps/chosen": -392.65618896484375, - "logps/rejected": -409.4969787597656, - "loss": 0.6472, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.3562784492969513, - "rewards/margins": 0.05219919979572296, - "rewards/rejected": -0.40847766399383545, - "step": 82 - }, - { - "epoch": 0.1656686626746507, - "grad_norm": 9.018644132426164, - "learning_rate": 4.937873385763907e-07, - "logits/chosen": -18.049556732177734, - "logits/rejected": -16.927623748779297, - "logps/chosen": -329.4657287597656, - "logps/rejected": -301.9615783691406, - "loss": 0.617, - "rewards/accuracies": 0.4375, - "rewards/chosen": -0.3110809922218323, - "rewards/margins": -0.030765339732170105, - "rewards/rejected": -0.280315637588501, - 
"step": 83 - }, - { - "epoch": 0.16766467065868262, - "grad_norm": 9.744502049109531, - "learning_rate": 4.9339472571829e-07, - "logits/chosen": -16.83614730834961, - "logits/rejected": -15.979333877563477, - "logps/chosen": -302.4591979980469, - "logps/rejected": -308.2034606933594, - "loss": 0.6372, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.029211895540356636, - "rewards/margins": 0.14211447536945343, - "rewards/rejected": -0.17132636904716492, - "step": 84 - }, - { - "epoch": 0.16966067864271456, - "grad_norm": 10.062959151859104, - "learning_rate": 4.929902501446366e-07, - "logits/chosen": -16.791458129882812, - "logits/rejected": -16.60369300842285, - "logps/chosen": -301.0384521484375, - "logps/rejected": -344.17340087890625, - "loss": 0.6229, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.21959218382835388, - "rewards/margins": 0.10680700838565826, - "rewards/rejected": -0.32639920711517334, - "step": 85 - }, - { - "epoch": 0.17165668662674652, - "grad_norm": 8.899211999721798, - "learning_rate": 4.925739315689991e-07, - "logits/chosen": -17.578712463378906, - "logits/rejected": -17.100988388061523, - "logps/chosen": -392.40399169921875, - "logps/rejected": -408.375244140625, - "loss": 0.5926, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.3149687647819519, - "rewards/margins": 0.22621186077594757, - "rewards/rejected": -0.5411806106567383, - "step": 86 - }, - { - "epoch": 0.17365269461077845, - "grad_norm": 9.058158084220052, - "learning_rate": 4.921457902821578e-07, - "logits/chosen": -18.370519638061523, - "logits/rejected": -18.149328231811523, - "logps/chosen": -506.34832763671875, - "logps/rejected": -465.40228271484375, - "loss": 0.6228, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5572055578231812, - "rewards/margins": 0.24167928099632263, - "rewards/rejected": -0.7988848686218262, - "step": 87 - }, - { - "epoch": 0.17564870259481039, - "grad_norm": 9.924143893065036, - "learning_rate": 4.917058471511148e-07, - "logits/chosen": -17.789403915405273, - "logits/rejected": -17.73256492614746, - "logps/chosen": -473.0888977050781, - "logps/rejected": -505.4393310546875, - "loss": 0.622, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.562219500541687, - "rewards/margins": 0.14176039397716522, - "rewards/rejected": -0.7039799690246582, - "step": 88 - }, - { - "epoch": 0.17764471057884232, - "grad_norm": 9.455855889835503, - "learning_rate": 4.912541236180778e-07, - "logits/chosen": -15.555730819702148, - "logits/rejected": -16.45915985107422, - "logps/chosen": -359.06304931640625, - "logps/rejected": -448.340087890625, - "loss": 0.6132, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.1561770737171173, - "rewards/margins": 0.43770644068717957, - "rewards/rejected": -0.5938835144042969, - "step": 89 - }, - { - "epoch": 0.17964071856287425, - "grad_norm": 9.198885333259362, - "learning_rate": 4.907906416994145e-07, - "logits/chosen": -16.712560653686523, - "logits/rejected": -15.981279373168945, - "logps/chosen": -379.13067626953125, - "logps/rejected": -461.3275451660156, - "loss": 0.621, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.15404972434043884, - "rewards/margins": 0.31564387679100037, - "rewards/rejected": -0.4696936309337616, - "step": 90 - }, - { - "epoch": 0.18163672654690619, - "grad_norm": 9.490922115703798, - "learning_rate": 4.903154239845797e-07, - "logits/chosen": -16.743335723876953, - "logits/rejected": -16.130474090576172, - "logps/chosen": -375.4258728027344, - "logps/rejected": -382.15667724609375, - 
"loss": 0.6204, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.46173471212387085, - "rewards/margins": 0.16935935616493225, - "rewards/rejected": -0.6310940980911255, - "step": 91 - }, - { - "epoch": 0.18363273453093812, - "grad_norm": 9.888840866130433, - "learning_rate": 4.898284936350143e-07, - "logits/chosen": -15.539291381835938, - "logits/rejected": -15.749015808105469, - "logps/chosen": -375.1903991699219, - "logps/rejected": -397.3068542480469, - "loss": 0.6055, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.3286465108394623, - "rewards/margins": 0.2511574327945709, - "rewards/rejected": -0.5798039436340332, - "step": 92 - }, - { - "epoch": 0.18562874251497005, - "grad_norm": 9.534547072466589, - "learning_rate": 4.893298743830167e-07, - "logits/chosen": -17.950407028198242, - "logits/rejected": -17.723539352416992, - "logps/chosen": -570.85693359375, - "logps/rejected": -546.0053100585938, - "loss": 0.6082, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6157274842262268, - "rewards/margins": 0.40511518716812134, - "rewards/rejected": -1.0208425521850586, - "step": 93 - }, - { - "epoch": 0.18762475049900199, - "grad_norm": 9.722881859804074, - "learning_rate": 4.888195905305859e-07, - "logits/chosen": -17.128742218017578, - "logits/rejected": -16.86199951171875, - "logps/chosen": -364.7801513671875, - "logps/rejected": -412.03985595703125, - "loss": 0.6212, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.35856327414512634, - "rewards/margins": 0.058156758546829224, - "rewards/rejected": -0.4167200028896332, - "step": 94 - }, - { - "epoch": 0.18962075848303392, - "grad_norm": 9.687485216578887, - "learning_rate": 4.882976669482367e-07, - "logits/chosen": -16.509841918945312, - "logits/rejected": -17.33835220336914, - "logps/chosen": -401.19586181640625, - "logps/rejected": -437.3283386230469, - "loss": 0.6115, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.44053059816360474, - "rewards/margins": 0.5870952606201172, - "rewards/rejected": -1.0276257991790771, - "step": 95 - }, - { - "epoch": 0.19161676646706588, - "grad_norm": 9.298831520670507, - "learning_rate": 4.877641290737883e-07, - "logits/chosen": -16.204208374023438, - "logits/rejected": -16.041362762451172, - "logps/chosen": -377.23699951171875, - "logps/rejected": -422.1312561035156, - "loss": 0.594, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.2721441388130188, - "rewards/margins": 0.31349363923072815, - "rewards/rejected": -0.5856378078460693, - "step": 96 - }, - { - "epoch": 0.1936127744510978, - "grad_norm": 11.267444543343759, - "learning_rate": 4.872190029111241e-07, - "logits/chosen": -17.558420181274414, - "logits/rejected": -17.034278869628906, - "logps/chosen": -502.9958190917969, - "logps/rejected": -536.4631958007812, - "loss": 0.593, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.6253632307052612, - "rewards/margins": 0.29862356185913086, - "rewards/rejected": -0.9239866733551025, - "step": 97 - }, - { - "epoch": 0.19560878243512975, - "grad_norm": 9.47144132073207, - "learning_rate": 4.866623150289241e-07, - "logits/chosen": -17.21906852722168, - "logits/rejected": -16.40659523010254, - "logps/chosen": -308.1177062988281, - "logps/rejected": -365.7485656738281, - "loss": 0.5896, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.20060190558433533, - "rewards/margins": 0.40862858295440674, - "rewards/rejected": -0.6092304587364197, - "step": 98 - }, - { - "epoch": 0.19760479041916168, - "grad_norm": 11.742965565854453, - "learning_rate": 
4.860940925593702e-07, - "logits/chosen": -18.529356002807617, - "logits/rejected": -18.07916259765625, - "logps/chosen": -302.5202331542969, - "logps/rejected": -296.69964599609375, - "loss": 0.566, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.18639332056045532, - "rewards/margins": 0.23985722661018372, - "rewards/rejected": -0.42625054717063904, - "step": 99 - }, - { - "epoch": 0.1996007984031936, - "grad_norm": 9.838131303202186, - "learning_rate": 4.855143631968242e-07, - "logits/chosen": -16.533798217773438, - "logits/rejected": -17.409114837646484, - "logps/chosen": -452.28173828125, - "logps/rejected": -523.7857055664062, - "loss": 0.576, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.42187410593032837, - "rewards/margins": 0.3787775933742523, - "rewards/rejected": -0.8006517887115479, - "step": 100 - }, - { - "epoch": 0.20159680638722555, - "grad_norm": 9.784365965262033, - "learning_rate": 4.849231551964771e-07, - "logits/chosen": -17.89838218688965, - "logits/rejected": -17.24924087524414, - "logps/chosen": -389.7870178222656, - "logps/rejected": -441.84857177734375, - "loss": 0.6049, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5550696849822998, - "rewards/margins": 0.31255972385406494, - "rewards/rejected": -0.86762934923172, - "step": 101 - }, - { - "epoch": 0.20359281437125748, - "grad_norm": 10.254231166703345, - "learning_rate": 4.843204973729728e-07, - "logits/chosen": -17.67897605895996, - "logits/rejected": -16.87425994873047, - "logps/chosen": -350.6794128417969, - "logps/rejected": -377.69073486328125, - "loss": 0.5967, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4413577914237976, - "rewards/margins": 0.24204562604427338, - "rewards/rejected": -0.6834034323692322, - "step": 102 - }, - { - "epoch": 0.2055888223552894, - "grad_norm": 10.177731859208695, - "learning_rate": 4.837064190990036e-07, - "logits/chosen": -17.95716094970703, - "logits/rejected": -18.56848907470703, - "logps/chosen": -363.04754638671875, - "logps/rejected": -426.90582275390625, - "loss": 0.5939, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.4591674506664276, - "rewards/margins": 0.3569161593914032, - "rewards/rejected": -0.8160836100578308, - "step": 103 - }, - { - "epoch": 0.20758483033932135, - "grad_norm": 11.665978228628967, - "learning_rate": 4.830809503038781e-07, - "logits/chosen": -18.042129516601562, - "logits/rejected": -18.134384155273438, - "logps/chosen": -371.7164611816406, - "logps/rejected": -375.3335876464844, - "loss": 0.6234, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.6777195334434509, - "rewards/margins": 0.03881584107875824, - "rewards/rejected": -0.7165352702140808, - "step": 104 - }, - { - "epoch": 0.20958083832335328, - "grad_norm": 10.307421231401023, - "learning_rate": 4.824441214720628e-07, - "logits/chosen": -17.154827117919922, - "logits/rejected": -16.737272262573242, - "logps/chosen": -415.67706298828125, - "logps/rejected": -465.8774719238281, - "loss": 0.5871, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7462588548660278, - "rewards/margins": 0.246952623128891, - "rewards/rejected": -0.9932115077972412, - "step": 105 - }, - { - "epoch": 0.21157684630738524, - "grad_norm": 10.785145631272416, - "learning_rate": 4.817959636416969e-07, - "logits/chosen": -16.967071533203125, - "logits/rejected": -17.13130760192871, - "logps/chosen": -345.673095703125, - "logps/rejected": -378.43780517578125, - "loss": 0.5532, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4800865054130554, - "rewards/margins": 
0.3331327438354492, - "rewards/rejected": -0.8132193684577942, - "step": 106 - }, - { - "epoch": 0.21357285429141717, - "grad_norm": 10.599087532304829, - "learning_rate": 4.811365084030783e-07, - "logits/chosen": -15.638040542602539, - "logits/rejected": -16.557126998901367, - "logps/chosen": -444.3518371582031, - "logps/rejected": -541.2994995117188, - "loss": 0.5506, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.5941789150238037, - "rewards/margins": 0.6498495936393738, - "rewards/rejected": -1.2440284490585327, - "step": 107 - }, - { - "epoch": 0.2155688622754491, - "grad_norm": 11.345714476429055, - "learning_rate": 4.804657878971251e-07, - "logits/chosen": -17.96299934387207, - "logits/rejected": -18.472097396850586, - "logps/chosen": -431.6180114746094, - "logps/rejected": -497.6167297363281, - "loss": 0.5714, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.7720993757247925, - "rewards/margins": 0.20353072881698608, - "rewards/rejected": -0.9756300449371338, - "step": 108 - }, - { - "epoch": 0.21756487025948104, - "grad_norm": 10.98517018534807, - "learning_rate": 4.797838348138086e-07, - "logits/chosen": -16.523500442504883, - "logits/rejected": -16.56661605834961, - "logps/chosen": -433.0921325683594, - "logps/rejected": -492.59112548828125, - "loss": 0.5571, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8152545690536499, - "rewards/margins": 0.39452555775642395, - "rewards/rejected": -1.2097800970077515, - "step": 109 - }, - { - "epoch": 0.21956087824351297, - "grad_norm": 10.837354651005112, - "learning_rate": 4.790906823905599e-07, - "logits/chosen": -17.36256980895996, - "logits/rejected": -16.92915153503418, - "logps/chosen": -410.4229431152344, - "logps/rejected": -412.46435546875, - "loss": 0.5787, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5949603319168091, - "rewards/margins": 0.22644329071044922, - "rewards/rejected": -0.8214036226272583, - "step": 110 - }, - { - "epoch": 0.2215568862275449, - "grad_norm": 10.320787241206938, - "learning_rate": 4.783863644106502e-07, - "logits/chosen": -16.421478271484375, - "logits/rejected": -16.64776039123535, - "logps/chosen": -483.7657165527344, - "logps/rejected": -481.812744140625, - "loss": 0.5564, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6031550765037537, - "rewards/margins": 0.19912101328372955, - "rewards/rejected": -0.8022760152816772, - "step": 111 - }, - { - "epoch": 0.22355289421157684, - "grad_norm": 11.107038600485746, - "learning_rate": 4.776709152015442e-07, - "logits/chosen": -18.22406005859375, - "logits/rejected": -18.18329429626465, - "logps/chosen": -334.51348876953125, - "logps/rejected": -383.4963684082031, - "loss": 0.598, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.5061721205711365, - "rewards/margins": 0.23278102278709412, - "rewards/rejected": -0.7389531135559082, - "step": 112 - }, - { - "epoch": 0.22554890219560877, - "grad_norm": 10.862434019726457, - "learning_rate": 4.769443696332272e-07, - "logits/chosen": -16.520822525024414, - "logits/rejected": -17.432579040527344, - "logps/chosen": -359.88568115234375, - "logps/rejected": -433.949462890625, - "loss": 0.5686, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.4676639437675476, - "rewards/margins": 0.6773457527160645, - "rewards/rejected": -1.1450097560882568, - "step": 113 - }, - { - "epoch": 0.2275449101796407, - "grad_norm": 10.633885886185016, - "learning_rate": 4.762067631165049e-07, - "logits/chosen": -16.13068962097168, - "logits/rejected": -16.44469451904297, - "logps/chosen": 
-433.1568298339844, - "logps/rejected": -488.2666931152344, - "loss": 0.5095, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.5838625431060791, - "rewards/margins": 0.51032954454422, - "rewards/rejected": -1.0941921472549438, - "step": 114 - }, - { - "epoch": 0.22954091816367264, - "grad_norm": 11.311189673193253, - "learning_rate": 4.7545813160127845e-07, - "logits/chosen": -16.38141632080078, - "logits/rejected": -17.378694534301758, - "logps/chosen": -542.2471313476562, - "logps/rejected": -644.93115234375, - "loss": 0.5264, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.7417050004005432, - "rewards/margins": 1.1188050508499146, - "rewards/rejected": -1.860509991645813, - "step": 115 - }, - { - "epoch": 0.2315369261477046, - "grad_norm": 11.760352337564447, - "learning_rate": 4.746985115747917e-07, - "logits/chosen": -17.631946563720703, - "logits/rejected": -18.053573608398438, - "logps/chosen": -447.10943603515625, - "logps/rejected": -478.54083251953125, - "loss": 0.5718, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8603832125663757, - "rewards/margins": 0.30809980630874634, - "rewards/rejected": -1.168483018875122, - "step": 116 - }, - { - "epoch": 0.23353293413173654, - "grad_norm": 13.092392132476672, - "learning_rate": 4.739279400598532e-07, - "logits/chosen": -17.92958641052246, - "logits/rejected": -17.64777946472168, - "logps/chosen": -566.686279296875, - "logps/rejected": -626.3263549804688, - "loss": 0.5499, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8065224885940552, - "rewards/margins": 0.33996978402137756, - "rewards/rejected": -1.1464921236038208, - "step": 117 - }, - { - "epoch": 0.23552894211576847, - "grad_norm": 11.695233651191389, - "learning_rate": 4.731464546130314e-07, - "logits/chosen": -18.524166107177734, - "logits/rejected": -17.927783966064453, - "logps/chosen": -480.1812744140625, - "logps/rejected": -493.7731628417969, - "loss": 0.5755, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9772423505783081, - "rewards/margins": 0.1880900263786316, - "rewards/rejected": -1.1653324365615845, - "step": 118 - }, - { - "epoch": 0.2375249500998004, - "grad_norm": 14.981531373880904, - "learning_rate": 4.7235409332282436e-07, - "logits/chosen": -18.190269470214844, - "logits/rejected": -18.184982299804688, - "logps/chosen": -418.0079345703125, - "logps/rejected": -410.58599853515625, - "loss": 0.5681, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8476977944374084, - "rewards/margins": 0.24571648240089417, - "rewards/rejected": -1.093414306640625, - "step": 119 - }, - { - "epoch": 0.23952095808383234, - "grad_norm": 12.8327990641794, - "learning_rate": 4.7155089480780365e-07, - "logits/chosen": -17.30999183654785, - "logits/rejected": -17.51451301574707, - "logps/chosen": -473.12420654296875, - "logps/rejected": -552.434814453125, - "loss": 0.5624, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9688943028450012, - "rewards/margins": 0.5823672413825989, - "rewards/rejected": -1.5512614250183105, - "step": 120 - }, - { - "epoch": 0.24151696606786427, - "grad_norm": 12.280307782693178, - "learning_rate": 4.707368982147317e-07, - "logits/chosen": -17.097469329833984, - "logits/rejected": -16.843915939331055, - "logps/chosen": -452.419189453125, - "logps/rejected": -520.343505859375, - "loss": 0.505, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.6861757040023804, - "rewards/margins": 0.5018269419670105, - "rewards/rejected": -1.188002586364746, - "step": 121 - }, - { - "epoch": 0.2435129740518962, - 
"grad_norm": 12.97976538848426, - "learning_rate": 4.6991214321665414e-07, - "logits/chosen": -17.63509750366211, - "logits/rejected": -17.495311737060547, - "logps/chosen": -431.20465087890625, - "logps/rejected": -465.6344299316406, - "loss": 0.5491, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7622246742248535, - "rewards/margins": 0.2962070107460022, - "rewards/rejected": -1.0584317445755005, - "step": 122 - }, - { - "epoch": 0.24550898203592814, - "grad_norm": 11.496195773328525, - "learning_rate": 4.6907667001096585e-07, - "logits/chosen": -17.77838897705078, - "logits/rejected": -18.234045028686523, - "logps/chosen": -474.1757507324219, - "logps/rejected": -698.40478515625, - "loss": 0.5397, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8501319289207458, - "rewards/margins": 1.1107265949249268, - "rewards/rejected": -1.9608584642410278, - "step": 123 - }, - { - "epoch": 0.24750499001996007, - "grad_norm": 12.44934209996877, - "learning_rate": 4.6823051931745237e-07, - "logits/chosen": -18.704727172851562, - "logits/rejected": -18.47835350036621, - "logps/chosen": -354.03076171875, - "logps/rejected": -454.5083312988281, - "loss": 0.5607, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.5832939147949219, - "rewards/margins": 0.8197604417800903, - "rewards/rejected": -1.4030543565750122, - "step": 124 - }, - { - "epoch": 0.249500998003992, - "grad_norm": 11.46676444609218, - "learning_rate": 4.6737373237630473e-07, - "logits/chosen": -17.123502731323242, - "logits/rejected": -17.59217071533203, - "logps/chosen": -421.08380126953125, - "logps/rejected": -530.366455078125, - "loss": 0.5211, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.0707979202270508, - "rewards/margins": 0.6613157391548157, - "rewards/rejected": -1.7321135997772217, - "step": 125 - }, - { - "epoch": 0.25149700598802394, - "grad_norm": 12.183293436350857, - "learning_rate": 4.6650635094610966e-07, - "logits/chosen": -18.38361358642578, - "logits/rejected": -18.456607818603516, - "logps/chosen": -464.5298156738281, - "logps/rejected": -517.3018798828125, - "loss": 0.5149, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.0340361595153809, - "rewards/margins": 0.34383463859558105, - "rewards/rejected": -1.3778706789016724, - "step": 126 - }, - { - "epoch": 0.25349301397205587, - "grad_norm": 11.999072355533364, - "learning_rate": 4.6562841730181435e-07, - "logits/chosen": -19.123811721801758, - "logits/rejected": -18.296850204467773, - "logps/chosen": -476.5393371582031, - "logps/rejected": -521.8916015625, - "loss": 0.5389, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.2099838256835938, - "rewards/margins": 0.3222024738788605, - "rewards/rejected": -1.532186508178711, - "step": 127 - }, - { - "epoch": 0.2554890219560878, - "grad_norm": 12.045125886996352, - "learning_rate": 4.647399742326661e-07, - "logits/chosen": -17.34141731262207, - "logits/rejected": -17.46158218383789, - "logps/chosen": -414.46832275390625, - "logps/rejected": -451.8553466796875, - "loss": 0.5338, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9311251640319824, - "rewards/margins": 0.434777170419693, - "rewards/rejected": -1.365902304649353, - "step": 128 - }, - { - "epoch": 0.25748502994011974, - "grad_norm": 11.977860936951346, - "learning_rate": 4.6384106504012665e-07, - "logits/chosen": -17.99986457824707, - "logits/rejected": -17.715444564819336, - "logps/chosen": -358.08837890625, - "logps/rejected": -396.7147521972656, - "loss": 0.4943, - "rewards/accuracies": 0.75, - "rewards/chosen": 
-0.7580198049545288, - "rewards/margins": 0.34733161330223083, - "rewards/rejected": -1.105351448059082, - "step": 129 - }, - { - "epoch": 0.25948103792415167, - "grad_norm": 11.874157690404008, - "learning_rate": 4.6293173353576186e-07, - "logits/chosen": -19.319414138793945, - "logits/rejected": -19.246450424194336, - "logps/chosen": -460.84991455078125, - "logps/rejected": -561.8507690429688, - "loss": 0.5235, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8761596083641052, - "rewards/margins": 0.9677722454071045, - "rewards/rejected": -1.843931794166565, - "step": 130 - }, - { - "epoch": 0.26147704590818366, - "grad_norm": 11.642857128737496, - "learning_rate": 4.6201202403910643e-07, - "logits/chosen": -18.393766403198242, - "logits/rejected": -18.55428695678711, - "logps/chosen": -434.67999267578125, - "logps/rejected": -488.51824951171875, - "loss": 0.5162, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.6767091751098633, - "rewards/margins": 0.5004065632820129, - "rewards/rejected": -1.1771156787872314, - "step": 131 - }, - { - "epoch": 0.2634730538922156, - "grad_norm": 11.491974226815117, - "learning_rate": 4.6108198137550377e-07, - "logits/chosen": -19.24776268005371, - "logits/rejected": -18.658472061157227, - "logps/chosen": -440.7890625, - "logps/rejected": -503.10595703125, - "loss": 0.4921, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9644001722335815, - "rewards/margins": 0.6125253438949585, - "rewards/rejected": -1.57692551612854, - "step": 132 - }, - { - "epoch": 0.2654690618762475, - "grad_norm": 12.899140291726132, - "learning_rate": 4.6014165087392105e-07, - "logits/chosen": -19.86013412475586, - "logits/rejected": -19.67055320739746, - "logps/chosen": -401.9028625488281, - "logps/rejected": -435.5128479003906, - "loss": 0.5094, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9691441655158997, - "rewards/margins": 0.3254424035549164, - "rewards/rejected": -1.294586420059204, - "step": 133 - }, - { - "epoch": 0.26746506986027946, - "grad_norm": 16.439092593676218, - "learning_rate": 4.591910783647404e-07, - "logits/chosen": -18.951297760009766, - "logits/rejected": -19.075408935546875, - "logps/chosen": -498.13165283203125, - "logps/rejected": -586.1142578125, - "loss": 0.5117, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.8596202731132507, - "rewards/margins": 0.8309850096702576, - "rewards/rejected": -1.6906054019927979, - "step": 134 - }, - { - "epoch": 0.2694610778443114, - "grad_norm": 16.957511057498678, - "learning_rate": 4.582303101775248e-07, - "logits/chosen": -18.295085906982422, - "logits/rejected": -17.897132873535156, - "logps/chosen": -457.41119384765625, - "logps/rejected": -517.1851196289062, - "loss": 0.5318, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.8927091956138611, - "rewards/margins": 0.5656099319458008, - "rewards/rejected": -1.458319067955017, - "step": 135 - }, - { - "epoch": 0.2714570858283433, - "grad_norm": 12.506389333337408, - "learning_rate": 4.572593931387604e-07, - "logits/chosen": -18.31269073486328, - "logits/rejected": -18.61883544921875, - "logps/chosen": -440.43206787109375, - "logps/rejected": -573.1978759765625, - "loss": 0.4875, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9546418190002441, - "rewards/margins": 0.8689519762992859, - "rewards/rejected": -1.8235938549041748, - "step": 136 - }, - { - "epoch": 0.27345309381237526, - "grad_norm": 14.756504766621427, - "learning_rate": 4.5627837456957374e-07, - "logits/chosen": -17.93488121032715, - "logits/rejected": 
-18.0272274017334, - "logps/chosen": -484.2279052734375, - "logps/rejected": -520.9596557617188, - "loss": 0.5817, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9371182322502136, - "rewards/margins": 0.3456663191318512, - "rewards/rejected": -1.2827844619750977, - "step": 137 - }, - { - "epoch": 0.2754491017964072, - "grad_norm": 13.948355377700631, - "learning_rate": 4.55287302283426e-07, - "logits/chosen": -18.253450393676758, - "logits/rejected": -17.966075897216797, - "logps/chosen": -420.6814880371094, - "logps/rejected": -508.517333984375, - "loss": 0.5273, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.033136010169983, - "rewards/margins": 0.7523062825202942, - "rewards/rejected": -1.7854422330856323, - "step": 138 - }, - { - "epoch": 0.2774451097804391, - "grad_norm": 12.827093040680634, - "learning_rate": 4.542862245837821e-07, - "logits/chosen": -18.383729934692383, - "logits/rejected": -18.069950103759766, - "logps/chosen": -398.27105712890625, - "logps/rejected": -538.0073852539062, - "loss": 0.4845, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.9848905205726624, - "rewards/margins": 0.9015528559684753, - "rewards/rejected": -1.8864431381225586, - "step": 139 - }, - { - "epoch": 0.27944111776447106, - "grad_norm": 16.782793626489322, - "learning_rate": 4.5327519026175686e-07, - "logits/chosen": -19.082073211669922, - "logits/rejected": -18.443805694580078, - "logps/chosen": -403.085693359375, - "logps/rejected": -444.4017639160156, - "loss": 0.5388, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9073647260665894, - "rewards/margins": 0.35679662227630615, - "rewards/rejected": -1.264161467552185, - "step": 140 - }, - { - "epoch": 0.281437125748503, - "grad_norm": 11.997391966133737, - "learning_rate": 4.5225424859373684e-07, - "logits/chosen": -18.931427001953125, - "logits/rejected": -18.6968936920166, - "logps/chosen": -479.6912841796875, - "logps/rejected": -543.6511840820312, - "loss": 0.5032, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.1790649890899658, - "rewards/margins": 0.5665948987007141, - "rewards/rejected": -1.7456599473953247, - "step": 141 - }, - { - "epoch": 0.2834331337325349, - "grad_norm": 16.73284107773462, - "learning_rate": 4.512234493389785e-07, - "logits/chosen": -18.50774383544922, - "logits/rejected": -17.77086067199707, - "logps/chosen": -400.50604248046875, - "logps/rejected": -467.09259033203125, - "loss": 0.5364, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9760237336158752, - "rewards/margins": 0.4449756145477295, - "rewards/rejected": -1.42099928855896, - "step": 142 - }, - { - "epoch": 0.28542914171656686, - "grad_norm": 14.572499779841467, - "learning_rate": 4.501828427371833e-07, - "logits/chosen": -17.65532684326172, - "logits/rejected": -18.490184783935547, - "logps/chosen": -442.3606872558594, - "logps/rejected": -481.2702941894531, - "loss": 0.5458, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.434018850326538, - "rewards/margins": 0.32890772819519043, - "rewards/rejected": -1.7629268169403076, - "step": 143 - }, - { - "epoch": 0.2874251497005988, - "grad_norm": 12.077196392810203, - "learning_rate": 4.4913247950604903e-07, - "logits/chosen": -17.562110900878906, - "logits/rejected": -18.58519744873047, - "logps/chosen": -514.4218139648438, - "logps/rejected": -602.9424438476562, - "loss": 0.5105, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.461451768875122, - "rewards/margins": 0.6788410544395447, - "rewards/rejected": -2.1402928829193115, - "step": 144 - }, - { - "epoch": 
0.2894211576846307, - "grad_norm": 12.508567044518502, - "learning_rate": 4.4807241083879764e-07, - "logits/chosen": -17.865825653076172, - "logits/rejected": -17.985769271850586, - "logps/chosen": -434.7288513183594, - "logps/rejected": -482.5139465332031, - "loss": 0.5639, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4301695823669434, - "rewards/margins": 0.34268102049827576, - "rewards/rejected": -1.772850751876831, - "step": 145 - }, - { - "epoch": 0.29141716566866266, - "grad_norm": 14.199498491697744, - "learning_rate": 4.470026884016804e-07, - "logits/chosen": -19.102584838867188, - "logits/rejected": -19.021583557128906, - "logps/chosen": -458.40802001953125, - "logps/rejected": -512.4900512695312, - "loss": 0.524, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1264287233352661, - "rewards/margins": 0.5926605463027954, - "rewards/rejected": -1.7190892696380615, - "step": 146 - }, - { - "epoch": 0.2934131736526946, - "grad_norm": 11.094319848631054, - "learning_rate": 4.459233643314599e-07, - "logits/chosen": -17.470544815063477, - "logits/rejected": -17.389942169189453, - "logps/chosen": -371.8180236816406, - "logps/rejected": -449.5108947753906, - "loss": 0.4906, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0187515020370483, - "rewards/margins": 0.49795636534690857, - "rewards/rejected": -1.5167080163955688, - "step": 147 - }, - { - "epoch": 0.2954091816367265, - "grad_norm": 11.94286172302262, - "learning_rate": 4.4483449123286855e-07, - "logits/chosen": -17.679344177246094, - "logits/rejected": -18.112321853637695, - "logps/chosen": -568.5883178710938, - "logps/rejected": -588.3480834960938, - "loss": 0.5598, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4919590950012207, - "rewards/margins": 0.22017580270767212, - "rewards/rejected": -1.7121349573135376, - "step": 148 - }, - { - "epoch": 0.29740518962075846, - "grad_norm": 13.921819284966194, - "learning_rate": 4.437361221760449e-07, - "logits/chosen": -19.270662307739258, - "logits/rejected": -19.816879272460938, - "logps/chosen": -478.0981140136719, - "logps/rejected": -570.726318359375, - "loss": 0.4884, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8832843899726868, - "rewards/margins": 0.8125098347663879, - "rewards/rejected": -1.6957943439483643, - "step": 149 - }, - { - "epoch": 0.2994011976047904, - "grad_norm": 13.963951037655912, - "learning_rate": 4.426283106939473e-07, - "logits/chosen": -18.801692962646484, - "logits/rejected": -18.507511138916016, - "logps/chosen": -404.40924072265625, - "logps/rejected": -461.0819091796875, - "loss": 0.5169, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9502855539321899, - "rewards/margins": 0.5388966798782349, - "rewards/rejected": -1.4891822338104248, - "step": 150 - }, - { - "epoch": 0.3013972055888224, - "grad_norm": 11.751403000253367, - "learning_rate": 4.415111107797445e-07, - "logits/chosen": -17.666833877563477, - "logits/rejected": -18.496967315673828, - "logps/chosen": -437.1878967285156, - "logps/rejected": -508.55517578125, - "loss": 0.4824, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1933588981628418, - "rewards/margins": 0.5103777050971985, - "rewards/rejected": -1.703736662864685, - "step": 151 - }, - { - "epoch": 0.3033932135728543, - "grad_norm": 15.664706001428366, - "learning_rate": 4.403845768841842e-07, - "logits/chosen": -19.15441131591797, - "logits/rejected": -19.0245418548584, - "logps/chosen": -517.4494018554688, - "logps/rejected": -583.9824829101562, - "loss": 0.5042, - "rewards/accuracies": 
0.875, - "rewards/chosen": -1.1839749813079834, - "rewards/margins": 0.7368112206459045, - "rewards/rejected": -1.9207862615585327, - "step": 152 - }, - { - "epoch": 0.30538922155688625, - "grad_norm": 12.964369532636645, - "learning_rate": 4.392487639129391e-07, - "logits/chosen": -17.844913482666016, - "logits/rejected": -18.007299423217773, - "logps/chosen": -430.0899353027344, - "logps/rejected": -520.009033203125, - "loss": 0.4851, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.9588636755943298, - "rewards/margins": 0.9035049080848694, - "rewards/rejected": -1.8623687028884888, - "step": 153 - }, - { - "epoch": 0.3073852295409182, - "grad_norm": 12.352390828674043, - "learning_rate": 4.3810372722393106e-07, - "logits/chosen": -18.35409927368164, - "logits/rejected": -18.485759735107422, - "logps/chosen": -433.47747802734375, - "logps/rejected": -456.7944030761719, - "loss": 0.508, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0077626705169678, - "rewards/margins": 0.3574880361557007, - "rewards/rejected": -1.3652507066726685, - "step": 154 - }, - { - "epoch": 0.3093812375249501, - "grad_norm": 12.60101053011718, - "learning_rate": 4.36949522624633e-07, - "logits/chosen": -18.486366271972656, - "logits/rejected": -18.509090423583984, - "logps/chosen": -511.802734375, - "logps/rejected": -631.6337890625, - "loss": 0.483, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.3301135301589966, - "rewards/margins": 0.8965414762496948, - "rewards/rejected": -2.2266550064086914, - "step": 155 - }, - { - "epoch": 0.31137724550898205, - "grad_norm": 12.076316766290963, - "learning_rate": 4.357862063693485e-07, - "logits/chosen": -18.479555130004883, - "logits/rejected": -18.734378814697266, - "logps/chosen": -412.6598815917969, - "logps/rejected": -505.7449951171875, - "loss": 0.4914, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.813165545463562, - "rewards/margins": 0.7614642381668091, - "rewards/rejected": -1.574629783630371, - "step": 156 - }, - { - "epoch": 0.313373253493014, - "grad_norm": 14.618551602047281, - "learning_rate": 4.34613835156471e-07, - "logits/chosen": -18.515804290771484, - "logits/rejected": -18.076066970825195, - "logps/chosen": -518.2728271484375, - "logps/rejected": -609.0603637695312, - "loss": 0.475, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4048892259597778, - "rewards/margins": 0.7887292504310608, - "rewards/rejected": -2.1936185359954834, - "step": 157 - }, - { - "epoch": 0.3153692614770459, - "grad_norm": 14.08049532394846, - "learning_rate": 4.3343246612571905e-07, - "logits/chosen": -18.047651290893555, - "logits/rejected": -18.692232131958008, - "logps/chosen": -412.463623046875, - "logps/rejected": -513.392822265625, - "loss": 0.4961, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.0158097743988037, - "rewards/margins": 0.7373302578926086, - "rewards/rejected": -1.753139853477478, - "step": 158 - }, - { - "epoch": 0.31736526946107785, - "grad_norm": 12.549764922498149, - "learning_rate": 4.3224215685535287e-07, - "logits/chosen": -18.217510223388672, - "logits/rejected": -18.150440216064453, - "logps/chosen": -432.1754455566406, - "logps/rejected": -474.58477783203125, - "loss": 0.4799, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.9655523896217346, - "rewards/margins": 0.6841700673103333, - "rewards/rejected": -1.6497223377227783, - "step": 159 - }, - { - "epoch": 0.3193612774451098, - "grad_norm": 19.934375633780295, - "learning_rate": 4.310429653593669e-07, - "logits/chosen": -19.979171752929688, - 
"logits/rejected": -20.37800407409668, - "logps/chosen": -465.45416259765625, - "logps/rejected": -549.4246826171875, - "loss": 0.5618, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0206208229064941, - "rewards/margins": 0.7802413702011108, - "rewards/rejected": -1.800862193107605, - "step": 160 - }, - { - "epoch": 0.3213572854291417, - "grad_norm": 14.01791810960553, - "learning_rate": 4.2983495008466273e-07, - "logits/chosen": -19.8514404296875, - "logits/rejected": -19.740142822265625, - "logps/chosen": -557.4789428710938, - "logps/rejected": -551.2369995117188, - "loss": 0.4997, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4763171672821045, - "rewards/margins": 0.34967875480651855, - "rewards/rejected": -1.825995922088623, - "step": 161 - }, - { - "epoch": 0.32335329341317365, - "grad_norm": 12.51564581989701, - "learning_rate": 4.286181699082008e-07, - "logits/chosen": -19.269309997558594, - "logits/rejected": -19.268407821655273, - "logps/chosen": -471.971923828125, - "logps/rejected": -514.5819702148438, - "loss": 0.4595, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.1184983253479004, - "rewards/margins": 0.3826027512550354, - "rewards/rejected": -1.5011011362075806, - "step": 162 - }, - { - "epoch": 0.3253493013972056, - "grad_norm": 15.542357689555216, - "learning_rate": 4.273926841341302e-07, - "logits/chosen": -20.759143829345703, - "logits/rejected": -20.3962345123291, - "logps/chosen": -425.91998291015625, - "logps/rejected": -551.6376953125, - "loss": 0.4915, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8230355978012085, - "rewards/margins": 1.2822701930999756, - "rewards/rejected": -2.1053059101104736, - "step": 163 - }, - { - "epoch": 0.3273453093812375, - "grad_norm": 11.931760390211124, - "learning_rate": 4.2615855249089867e-07, - "logits/chosen": -19.185287475585938, - "logits/rejected": -19.267969131469727, - "logps/chosen": -475.9891662597656, - "logps/rejected": -579.8348388671875, - "loss": 0.5064, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2812844514846802, - "rewards/margins": 0.9683108329772949, - "rewards/rejected": -2.2495951652526855, - "step": 164 - }, - { - "epoch": 0.32934131736526945, - "grad_norm": 14.325662785483841, - "learning_rate": 4.249158351283413e-07, - "logits/chosen": -18.518449783325195, - "logits/rejected": -18.800861358642578, - "logps/chosen": -423.35260009765625, - "logps/rejected": -591.2802734375, - "loss": 0.4822, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.156442642211914, - "rewards/margins": 1.339384913444519, - "rewards/rejected": -2.4958276748657227, - "step": 165 - }, - { - "epoch": 0.3313373253493014, - "grad_norm": 13.209121169799278, - "learning_rate": 4.236645926147493e-07, - "logits/chosen": -18.23113250732422, - "logits/rejected": -17.86396598815918, - "logps/chosen": -509.241943359375, - "logps/rejected": -584.48193359375, - "loss": 0.5113, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3406342267990112, - "rewards/margins": 0.4660688638687134, - "rewards/rejected": -1.8067032098770142, - "step": 166 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 12.679678337512888, - "learning_rate": 4.224048859339174e-07, - "logits/chosen": -20.044212341308594, - "logits/rejected": -19.84770965576172, - "logps/chosen": -474.1540222167969, - "logps/rejected": -576.6797485351562, - "loss": 0.4782, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5084277391433716, - "rewards/margins": 0.8898066878318787, - "rewards/rejected": -2.3982343673706055, - "step": 167 - }, - { - 
"epoch": 0.33532934131736525, - "grad_norm": 12.803928092289635, - "learning_rate": 4.2113677648217216e-07, - "logits/chosen": -19.095563888549805, - "logits/rejected": -18.801067352294922, - "logps/chosen": -410.49737548828125, - "logps/rejected": -484.6048278808594, - "loss": 0.4629, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.1591637134552002, - "rewards/margins": 0.666135847568512, - "rewards/rejected": -1.8252995014190674, - "step": 168 - }, - { - "epoch": 0.3373253493013972, - "grad_norm": 14.174173721204978, - "learning_rate": 4.1986032606537916e-07, - "logits/chosen": -16.58905792236328, - "logits/rejected": -16.84238624572754, - "logps/chosen": -569.0613403320312, - "logps/rejected": -614.7280883789062, - "loss": 0.5008, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4528391361236572, - "rewards/margins": 0.6195281744003296, - "rewards/rejected": -2.0723674297332764, - "step": 169 - }, - { - "epoch": 0.3393213572854291, - "grad_norm": 13.080053722751183, - "learning_rate": 4.1857559689593083e-07, - "logits/chosen": -18.27602767944336, - "logits/rejected": -17.57232666015625, - "logps/chosen": -453.475830078125, - "logps/rejected": -515.5191040039062, - "loss": 0.4821, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.3552886247634888, - "rewards/margins": 0.832838773727417, - "rewards/rejected": -2.1881275177001953, - "step": 170 - }, - { - "epoch": 0.3413173652694611, - "grad_norm": 12.364971785532722, - "learning_rate": 4.172826515897145e-07, - "logits/chosen": -18.919944763183594, - "logits/rejected": -19.389076232910156, - "logps/chosen": -388.2543029785156, - "logps/rejected": -470.0887451171875, - "loss": 0.4563, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9409610033035278, - "rewards/margins": 0.7529804706573486, - "rewards/rejected": -1.6939414739608765, - "step": 171 - }, - { - "epoch": 0.34331337325349304, - "grad_norm": 13.246390983471949, - "learning_rate": 4.1598155316306037e-07, - "logits/chosen": -18.05150032043457, - "logits/rejected": -18.103609085083008, - "logps/chosen": -473.530029296875, - "logps/rejected": -540.929443359375, - "loss": 0.4981, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.117943525314331, - "rewards/margins": 0.5308763384819031, - "rewards/rejected": -1.6488198041915894, - "step": 172 - }, - { - "epoch": 0.34530938123752497, - "grad_norm": 13.04681058875418, - "learning_rate": 4.146723650296701e-07, - "logits/chosen": -20.524463653564453, - "logits/rejected": -20.365554809570312, - "logps/chosen": -426.28448486328125, - "logps/rejected": -516.8076782226562, - "loss": 0.5333, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.146423101425171, - "rewards/margins": 0.9261961579322815, - "rewards/rejected": -2.0726191997528076, - "step": 173 - }, - { - "epoch": 0.3473053892215569, - "grad_norm": 13.558902213534159, - "learning_rate": 4.133551509975264e-07, - "logits/chosen": -18.120737075805664, - "logits/rejected": -17.530546188354492, - "logps/chosen": -374.5181884765625, - "logps/rejected": -488.8734130859375, - "loss": 0.478, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8048744201660156, - "rewards/margins": 1.0980815887451172, - "rewards/rejected": -1.9029561281204224, - "step": 174 - }, - { - "epoch": 0.34930139720558884, - "grad_norm": 12.399640912045163, - "learning_rate": 4.120299752657827e-07, - "logits/chosen": -20.241302490234375, - "logits/rejected": -19.89463233947754, - "logps/chosen": -435.4852294921875, - "logps/rejected": -600.3646240234375, - "loss": 0.4833, - 
"rewards/accuracies": 0.875, - "rewards/chosen": -1.0055052042007446, - "rewards/margins": 1.095568299293518, - "rewards/rejected": -2.1010735034942627, - "step": 175 - }, - { - "epoch": 0.35129740518962077, - "grad_norm": 18.67991630316, - "learning_rate": 4.106969024216348e-07, - "logits/chosen": -20.45185661315918, - "logits/rejected": -20.16585922241211, - "logps/chosen": -495.82159423828125, - "logps/rejected": -615.032958984375, - "loss": 0.5435, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4768478870391846, - "rewards/margins": 0.9564065933227539, - "rewards/rejected": -2.4332542419433594, - "step": 176 - }, - { - "epoch": 0.3532934131736527, - "grad_norm": 14.433114513707654, - "learning_rate": 4.0935599743717244e-07, - "logits/chosen": -19.229232788085938, - "logits/rejected": -19.95961570739746, - "logps/chosen": -481.91278076171875, - "logps/rejected": -587.878662109375, - "loss": 0.478, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3701449632644653, - "rewards/margins": 1.1796942949295044, - "rewards/rejected": -2.549839496612549, - "step": 177 - }, - { - "epoch": 0.35528942115768464, - "grad_norm": 13.188342082066123, - "learning_rate": 4.080073256662127e-07, - "logits/chosen": -18.213958740234375, - "logits/rejected": -19.102828979492188, - "logps/chosen": -580.2965087890625, - "logps/rejected": -703.9386596679688, - "loss": 0.4952, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4226490259170532, - "rewards/margins": 1.1172631978988647, - "rewards/rejected": -2.539912223815918, - "step": 178 - }, - { - "epoch": 0.35728542914171657, - "grad_norm": 13.727220401466234, - "learning_rate": 4.066509528411151e-07, - "logits/chosen": -19.163558959960938, - "logits/rejected": -20.03252601623535, - "logps/chosen": -416.621826171875, - "logps/rejected": -556.1578979492188, - "loss": 0.4664, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9182752370834351, - "rewards/margins": 1.0827980041503906, - "rewards/rejected": -2.001073122024536, - "step": 179 - }, - { - "epoch": 0.3592814371257485, - "grad_norm": 13.429269056852185, - "learning_rate": 4.0528694506957754e-07, - "logits/chosen": -19.609607696533203, - "logits/rejected": -19.0802059173584, - "logps/chosen": -463.4999694824219, - "logps/rejected": -571.7774047851562, - "loss": 0.4641, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1410191059112549, - "rewards/margins": 0.9015007019042969, - "rewards/rejected": -2.0425198078155518, - "step": 180 - }, - { - "epoch": 0.36127744510978044, - "grad_norm": 13.035911390029865, - "learning_rate": 4.039153688314145e-07, - "logits/chosen": -19.555572509765625, - "logits/rejected": -19.886554718017578, - "logps/chosen": -429.10003662109375, - "logps/rejected": -526.60888671875, - "loss": 0.4332, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4029077291488647, - "rewards/margins": 0.8520881533622742, - "rewards/rejected": -2.254995822906494, - "step": 181 - }, - { - "epoch": 0.36327345309381237, - "grad_norm": 15.508960229044732, - "learning_rate": 4.025362909753169e-07, - "logits/chosen": -18.01523208618164, - "logits/rejected": -18.14191246032715, - "logps/chosen": -425.21063232421875, - "logps/rejected": -510.28546142578125, - "loss": 0.4844, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3480522632598877, - "rewards/margins": 0.6659371852874756, - "rewards/rejected": -2.0139894485473633, - "step": 182 - }, - { - "epoch": 0.3652694610778443, - "grad_norm": 16.333189353880638, - "learning_rate": 4.0114977871559377e-07, - "logits/chosen": 
-19.889019012451172, - "logits/rejected": -20.158649444580078, - "logps/chosen": -388.4324645996094, - "logps/rejected": -496.0431823730469, - "loss": 0.502, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3997801542282104, - "rewards/margins": 0.8247696161270142, - "rewards/rejected": -2.2245497703552246, - "step": 183 - }, - { - "epoch": 0.36726546906187624, - "grad_norm": 12.698608735181692, - "learning_rate": 3.997558996288964e-07, - "logits/chosen": -19.303394317626953, - "logits/rejected": -18.998262405395508, - "logps/chosen": -573.5017700195312, - "logps/rejected": -647.8325805664062, - "loss": 0.4686, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7628663778305054, - "rewards/margins": 0.7538273334503174, - "rewards/rejected": -2.5166938304901123, - "step": 184 - }, - { - "epoch": 0.36926147704590817, - "grad_norm": 14.891574400285474, - "learning_rate": 3.983547216509254e-07, - "logits/chosen": -19.312734603881836, - "logits/rejected": -19.04522132873535, - "logps/chosen": -529.5467529296875, - "logps/rejected": -636.4886474609375, - "loss": 0.4513, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.68673574924469, - "rewards/margins": 0.7229608297348022, - "rewards/rejected": -2.409696578979492, - "step": 185 - }, - { - "epoch": 0.3712574850299401, - "grad_norm": 14.363890009536421, - "learning_rate": 3.9694631307311825e-07, - "logits/chosen": -20.22901153564453, - "logits/rejected": -19.24595832824707, - "logps/chosen": -568.1968994140625, - "logps/rejected": -576.79052734375, - "loss": 0.4763, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8447073698043823, - "rewards/margins": 0.5339704751968384, - "rewards/rejected": -2.3786778450012207, - "step": 186 - }, - { - "epoch": 0.37325349301397204, - "grad_norm": 26.393758306411566, - "learning_rate": 3.9553074253932233e-07, - "logits/chosen": -19.26047134399414, - "logits/rejected": -19.362327575683594, - "logps/chosen": -527.1236572265625, - "logps/rejected": -585.4468994140625, - "loss": 0.5022, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4415079355239868, - "rewards/margins": 0.6697221994400024, - "rewards/rejected": -2.1112301349639893, - "step": 187 - }, - { - "epoch": 0.37524950099800397, - "grad_norm": 13.852299099165053, - "learning_rate": 3.941080790424483e-07, - "logits/chosen": -19.18405532836914, - "logits/rejected": -19.502866744995117, - "logps/chosen": -496.4898376464844, - "logps/rejected": -598.0606079101562, - "loss": 0.5085, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5746562480926514, - "rewards/margins": 0.7243920564651489, - "rewards/rejected": -2.2990481853485107, - "step": 188 - }, - { - "epoch": 0.3772455089820359, - "grad_norm": 12.900703996378226, - "learning_rate": 3.9267839192110797e-07, - "logits/chosen": -18.6906795501709, - "logits/rejected": -19.030370712280273, - "logps/chosen": -507.127197265625, - "logps/rejected": -523.0045776367188, - "loss": 0.4568, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4412367343902588, - "rewards/margins": 0.3045232892036438, - "rewards/rejected": -1.7457599639892578, - "step": 189 - }, - { - "epoch": 0.37924151696606784, - "grad_norm": 14.68383696725854, - "learning_rate": 3.912417508562345e-07, - "logits/chosen": -19.418498992919922, - "logits/rejected": -19.330608367919922, - "logps/chosen": -421.99456787109375, - "logps/rejected": -503.4772033691406, - "loss": 0.4865, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.183841347694397, - "rewards/margins": 0.7099670171737671, - "rewards/rejected": 
-1.8938082456588745, - "step": 190 - }, - { - "epoch": 0.3812375249500998, - "grad_norm": 11.75575189656368, - "learning_rate": 3.8979822586768666e-07, - "logits/chosen": -21.16440200805664, - "logits/rejected": -20.485124588012695, - "logps/chosen": -514.5684204101562, - "logps/rejected": -571.970703125, - "loss": 0.4889, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4731186628341675, - "rewards/margins": 0.6870366930961609, - "rewards/rejected": -2.1601555347442627, - "step": 191 - }, - { - "epoch": 0.38323353293413176, - "grad_norm": 14.448756790216004, - "learning_rate": 3.88347887310836e-07, - "logits/chosen": -20.147769927978516, - "logits/rejected": -19.916521072387695, - "logps/chosen": -447.382080078125, - "logps/rejected": -597.453125, - "loss": 0.457, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4994301795959473, - "rewards/margins": 1.272138237953186, - "rewards/rejected": -2.771568536758423, - "step": 192 - }, - { - "epoch": 0.3852295409181637, - "grad_norm": 12.23374123752803, - "learning_rate": 3.8689080587313755e-07, - "logits/chosen": -19.599998474121094, - "logits/rejected": -19.84237289428711, - "logps/chosen": -470.7962646484375, - "logps/rejected": -576.8146362304688, - "loss": 0.4514, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.35133695602417, - "rewards/margins": 0.9762927293777466, - "rewards/rejected": -2.327629804611206, - "step": 193 - }, - { - "epoch": 0.3872255489021956, - "grad_norm": 14.698946134402291, - "learning_rate": 3.85427052570685e-07, - "logits/chosen": -19.12531280517578, - "logits/rejected": -18.746444702148438, - "logps/chosen": -468.0195007324219, - "logps/rejected": -533.82177734375, - "loss": 0.483, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.414006233215332, - "rewards/margins": 0.6618885397911072, - "rewards/rejected": -2.075894832611084, - "step": 194 - }, - { - "epoch": 0.38922155688622756, - "grad_norm": 15.811852032735173, - "learning_rate": 3.839566987447491e-07, - "logits/chosen": -19.964893341064453, - "logits/rejected": -19.358760833740234, - "logps/chosen": -411.2624816894531, - "logps/rejected": -456.06390380859375, - "loss": 0.4831, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.0391451120376587, - "rewards/margins": 0.6880395412445068, - "rewards/rejected": -1.7271846532821655, - "step": 195 - }, - { - "epoch": 0.3912175648702595, - "grad_norm": 12.305213877004382, - "learning_rate": 3.824798160583012e-07, - "logits/chosen": -19.77585220336914, - "logits/rejected": -19.821577072143555, - "logps/chosen": -391.9801025390625, - "logps/rejected": -514.4705810546875, - "loss": 0.4478, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.8564898371696472, - "rewards/margins": 1.0837904214859009, - "rewards/rejected": -1.9402803182601929, - "step": 196 - }, - { - "epoch": 0.3932135728542914, - "grad_norm": 14.833698621408654, - "learning_rate": 3.809964764925198e-07, - "logits/chosen": -19.650461196899414, - "logits/rejected": -18.813337326049805, - "logps/chosen": -556.4791259765625, - "logps/rejected": -641.11083984375, - "loss": 0.4637, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5146292448043823, - "rewards/margins": 1.2214813232421875, - "rewards/rejected": -2.7361104488372803, - "step": 197 - }, - { - "epoch": 0.39520958083832336, - "grad_norm": 13.357898707821814, - "learning_rate": 3.7950675234328256e-07, - "logits/chosen": -20.979297637939453, - "logits/rejected": -20.838703155517578, - "logps/chosen": -502.97607421875, - "logps/rejected": -646.2918090820312, - "loss": 
0.4448, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.1799731254577637, - "rewards/margins": 1.185584545135498, - "rewards/rejected": -3.3655576705932617, - "step": 198 - }, - { - "epoch": 0.3972055888223553, - "grad_norm": 15.671442333902643, - "learning_rate": 3.780107162176429e-07, - "logits/chosen": -19.549230575561523, - "logits/rejected": -20.148639678955078, - "logps/chosen": -540.7632446289062, - "logps/rejected": -629.897705078125, - "loss": 0.4734, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.1680128574371338, - "rewards/margins": 0.9218576550483704, - "rewards/rejected": -2.0898704528808594, - "step": 199 - }, - { - "epoch": 0.3992015968063872, - "grad_norm": 14.718831099618205, - "learning_rate": 3.765084410302909e-07, - "logits/chosen": -21.243850708007812, - "logits/rejected": -21.027610778808594, - "logps/chosen": -570.447998046875, - "logps/rejected": -706.76220703125, - "loss": 0.4543, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8626515865325928, - "rewards/margins": 1.4691358804702759, - "rewards/rejected": -3.331787347793579, - "step": 200 - }, - { - "epoch": 0.40119760479041916, - "grad_norm": 17.61224276039994, - "learning_rate": 3.75e-07, - "logits/chosen": -19.843793869018555, - "logits/rejected": -19.291797637939453, - "logps/chosen": -513.7886962890625, - "logps/rejected": -493.4097900390625, - "loss": 0.4852, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8454079627990723, - "rewards/margins": 0.3778110444545746, - "rewards/rejected": -2.223219156265259, - "step": 201 - }, - { - "epoch": 0.4031936127744511, - "grad_norm": 14.06450720707383, - "learning_rate": 3.734854666460577e-07, - "logits/chosen": -19.530370712280273, - "logits/rejected": -19.210784912109375, - "logps/chosen": -490.2061767578125, - "logps/rejected": -568.5958862304688, - "loss": 0.4496, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6006358861923218, - "rewards/margins": 0.7596541047096252, - "rewards/rejected": -2.360290050506592, - "step": 202 - }, - { - "epoch": 0.405189620758483, - "grad_norm": 14.859823057968415, - "learning_rate": 3.7196491478468316e-07, - "logits/chosen": -18.494264602661133, - "logits/rejected": -18.339521408081055, - "logps/chosen": -529.4029541015625, - "logps/rejected": -648.5390625, - "loss": 0.4919, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5195198059082031, - "rewards/margins": 0.6871219873428345, - "rewards/rejected": -2.206641674041748, - "step": 203 - }, - { - "epoch": 0.40718562874251496, - "grad_norm": 13.639472563583217, - "learning_rate": 3.704384185254288e-07, - "logits/chosen": -19.484989166259766, - "logits/rejected": -19.255311965942383, - "logps/chosen": -525.3623046875, - "logps/rejected": -636.157958984375, - "loss": 0.4336, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5300272703170776, - "rewards/margins": 0.6417598724365234, - "rewards/rejected": -2.1717870235443115, - "step": 204 - }, - { - "epoch": 0.4091816367265469, - "grad_norm": 12.901826312150071, - "learning_rate": 3.689060522675688e-07, - "logits/chosen": -20.066303253173828, - "logits/rejected": -20.39885139465332, - "logps/chosen": -607.7792358398438, - "logps/rejected": -796.8943481445312, - "loss": 0.4076, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6671760082244873, - "rewards/margins": 1.1472505331039429, - "rewards/rejected": -2.8144266605377197, - "step": 205 - }, - { - "epoch": 0.4111776447105788, - "grad_norm": 16.04548062088645, - "learning_rate": 3.673678906964727e-07, - "logits/chosen": 
-20.172260284423828, - "logits/rejected": -19.925201416015625, - "logps/chosen": -427.82568359375, - "logps/rejected": -501.93438720703125, - "loss": 0.4985, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4109199047088623, - "rewards/margins": 0.7991017699241638, - "rewards/rejected": -2.210021734237671, - "step": 206 - }, - { - "epoch": 0.41317365269461076, - "grad_norm": 13.933597686095297, - "learning_rate": 3.658240087799654e-07, - "logits/chosen": -21.581363677978516, - "logits/rejected": -21.107837677001953, - "logps/chosen": -417.2480163574219, - "logps/rejected": -577.9209594726562, - "loss": 0.4376, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2277367115020752, - "rewards/margins": 1.1889203786849976, - "rewards/rejected": -2.4166574478149414, - "step": 207 - }, - { - "epoch": 0.4151696606786427, - "grad_norm": 17.56870232794954, - "learning_rate": 3.6427448176467357e-07, - "logits/chosen": -18.793655395507812, - "logits/rejected": -19.212615966796875, - "logps/chosen": -690.8632202148438, - "logps/rejected": -731.3242797851562, - "loss": 0.4648, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.026750087738037, - "rewards/margins": 0.9612019062042236, - "rewards/rejected": -2.9879517555236816, - "step": 208 - }, - { - "epoch": 0.4171656686626746, - "grad_norm": 12.43534426487344, - "learning_rate": 3.6271938517235765e-07, - "logits/chosen": -19.1177921295166, - "logits/rejected": -18.949975967407227, - "logps/chosen": -414.0655517578125, - "logps/rejected": -504.7939147949219, - "loss": 0.4392, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1112933158874512, - "rewards/margins": 0.9417324066162109, - "rewards/rejected": -2.053025484085083, - "step": 209 - }, - { - "epoch": 0.41916167664670656, - "grad_norm": 14.047345814245144, - "learning_rate": 3.6115879479623183e-07, - "logits/chosen": -20.57330322265625, - "logits/rejected": -20.117856979370117, - "logps/chosen": -516.2587890625, - "logps/rejected": -601.1445922851562, - "loss": 0.446, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6481542587280273, - "rewards/margins": 1.0199745893478394, - "rewards/rejected": -2.6681289672851562, - "step": 210 - }, - { - "epoch": 0.42115768463073855, - "grad_norm": 12.856885130231753, - "learning_rate": 3.595927866972693e-07, - "logits/chosen": -20.086902618408203, - "logits/rejected": -20.204526901245117, - "logps/chosen": -525.0537109375, - "logps/rejected": -619.6348266601562, - "loss": 0.4773, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.8990871906280518, - "rewards/margins": 0.745017409324646, - "rewards/rejected": -2.644104480743408, - "step": 211 - }, - { - "epoch": 0.4231536926147705, - "grad_norm": 12.98210694751887, - "learning_rate": 3.580214372004956e-07, - "logits/chosen": -19.950450897216797, - "logits/rejected": -20.33487319946289, - "logps/chosen": -394.83404541015625, - "logps/rejected": -497.79345703125, - "loss": 0.4549, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2214103937149048, - "rewards/margins": 0.9347801804542542, - "rewards/rejected": -2.156190872192383, - "step": 212 - }, - { - "epoch": 0.4251497005988024, - "grad_norm": 19.31248906417738, - "learning_rate": 3.5644482289126813e-07, - "logits/chosen": -18.889362335205078, - "logits/rejected": -19.420923233032227, - "logps/chosen": -548.869873046875, - "logps/rejected": -656.6920166015625, - "loss": 0.515, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7871437072753906, - "rewards/margins": 0.8735736608505249, - "rewards/rejected": -2.660717487335205, - 
"step": 213 - }, - { - "epoch": 0.42714570858283435, - "grad_norm": 15.762431059083589, - "learning_rate": 3.548630206115443e-07, - "logits/chosen": -20.15378189086914, - "logits/rejected": -20.292001724243164, - "logps/chosen": -404.6817626953125, - "logps/rejected": -477.21563720703125, - "loss": 0.4503, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3599151372909546, - "rewards/margins": 0.5870508551597595, - "rewards/rejected": -1.946966290473938, - "step": 214 - }, - { - "epoch": 0.4291417165668663, - "grad_norm": 11.197156920062987, - "learning_rate": 3.5327610745613546e-07, - "logits/chosen": -19.350055694580078, - "logits/rejected": -19.167869567871094, - "logps/chosen": -528.2733154296875, - "logps/rejected": -597.0432739257812, - "loss": 0.4185, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5914320945739746, - "rewards/margins": 0.8656637668609619, - "rewards/rejected": -2.4570956230163574, - "step": 215 - }, - { - "epoch": 0.4311377245508982, - "grad_norm": 15.40471464271596, - "learning_rate": 3.516841607689501e-07, - "logits/chosen": -19.665361404418945, - "logits/rejected": -19.29046058654785, - "logps/chosen": -510.60125732421875, - "logps/rejected": -581.7286376953125, - "loss": 0.4988, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5553991794586182, - "rewards/margins": 1.0072236061096191, - "rewards/rejected": -2.5626227855682373, - "step": 216 - }, - { - "epoch": 0.43313373253493015, - "grad_norm": 11.966288977941108, - "learning_rate": 3.500872581392238e-07, - "logits/chosen": -19.7708740234375, - "logits/rejected": -20.453022003173828, - "logps/chosen": -386.0968017578125, - "logps/rejected": -516.030029296875, - "loss": 0.4256, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.455343246459961, - "rewards/margins": 0.9787195920944214, - "rewards/rejected": -2.434062957763672, - "step": 217 - }, - { - "epoch": 0.4351297405189621, - "grad_norm": 12.989461741224307, - "learning_rate": 3.4848547739773774e-07, - "logits/chosen": -20.3544864654541, - "logits/rejected": -20.054367065429688, - "logps/chosen": -562.2991943359375, - "logps/rejected": -694.3490600585938, - "loss": 0.4304, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8581453561782837, - "rewards/margins": 1.252414345741272, - "rewards/rejected": -3.1105597019195557, - "step": 218 - }, - { - "epoch": 0.437125748502994, - "grad_norm": 13.490425487755859, - "learning_rate": 3.468788966130257e-07, - "logits/chosen": -19.16351318359375, - "logits/rejected": -18.119314193725586, - "logps/chosen": -550.3583984375, - "logps/rejected": -596.575927734375, - "loss": 0.4787, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.73578679561615, - "rewards/margins": 0.7520574927330017, - "rewards/rejected": -2.487844467163086, - "step": 219 - }, - { - "epoch": 0.43912175648702595, - "grad_norm": 19.70339532784469, - "learning_rate": 3.4526759408756857e-07, - "logits/chosen": -18.91830825805664, - "logits/rejected": -19.164518356323242, - "logps/chosen": -669.055419921875, - "logps/rejected": -708.6876220703125, - "loss": 0.469, - "rewards/accuracies": 0.5, - "rewards/chosen": -2.031832695007324, - "rewards/margins": 0.7691054940223694, - "rewards/rejected": -2.8009378910064697, - "step": 220 - }, - { - "epoch": 0.4411177644710579, - "grad_norm": 13.350681461714572, - "learning_rate": 3.43651648353978e-07, - "logits/chosen": -18.520729064941406, - "logits/rejected": -18.25364875793457, - "logps/chosen": -519.2975463867188, - "logps/rejected": -589.70263671875, - "loss": 0.4508, - 
"rewards/accuracies": 1.0, - "rewards/chosen": -1.2545933723449707, - "rewards/margins": 0.9282989501953125, - "rewards/rejected": -2.182892322540283, - "step": 221 - }, - { - "epoch": 0.4431137724550898, - "grad_norm": 12.985951445968679, - "learning_rate": 3.4203113817116953e-07, - "logits/chosen": -18.722192764282227, - "logits/rejected": -18.830791473388672, - "logps/chosen": -471.1470947265625, - "logps/rejected": -526.8262329101562, - "loss": 0.4195, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.1359221935272217, - "rewards/margins": 0.6822535991668701, - "rewards/rejected": -1.8181757926940918, - "step": 222 - }, - { - "epoch": 0.44510978043912175, - "grad_norm": 25.299702854421888, - "learning_rate": 3.40406142520523e-07, - "logits/chosen": -19.63787078857422, - "logits/rejected": -19.73871421813965, - "logps/chosen": -336.0172424316406, - "logps/rejected": -433.79217529296875, - "loss": 0.4428, - "rewards/accuracies": 0.9375, - "rewards/chosen": -0.9037049412727356, - "rewards/margins": 0.909833550453186, - "rewards/rejected": -1.8135385513305664, - "step": 223 - }, - { - "epoch": 0.4471057884231537, - "grad_norm": 13.981874950399485, - "learning_rate": 3.387767406020343e-07, - "logits/chosen": -20.34281349182129, - "logits/rejected": -20.043916702270508, - "logps/chosen": -561.2789916992188, - "logps/rejected": -721.3209838867188, - "loss": 0.4499, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.8252278566360474, - "rewards/margins": 1.3606287240982056, - "rewards/rejected": -3.185856819152832, - "step": 224 - }, - { - "epoch": 0.4491017964071856, - "grad_norm": 13.226395290628279, - "learning_rate": 3.371430118304538e-07, - "logits/chosen": -19.773094177246094, - "logits/rejected": -19.672040939331055, - "logps/chosen": -587.82177734375, - "logps/rejected": -659.1246337890625, - "loss": 0.4739, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.080528736114502, - "rewards/margins": 0.47968238592147827, - "rewards/rejected": -2.560211181640625, - "step": 225 - }, - { - "epoch": 0.45109780439121755, - "grad_norm": 21.18297840943059, - "learning_rate": 3.355050358314172e-07, - "logits/chosen": -18.7939510345459, - "logits/rejected": -19.188337326049805, - "logps/chosen": -548.7808837890625, - "logps/rejected": -600.9039916992188, - "loss": 0.4465, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9692116975784302, - "rewards/margins": 0.954479992389679, - "rewards/rejected": -2.923691749572754, - "step": 226 - }, - { - "epoch": 0.4530938123752495, - "grad_norm": 16.166215514551812, - "learning_rate": 3.338628924375638e-07, - "logits/chosen": -20.488510131835938, - "logits/rejected": -20.845476150512695, - "logps/chosen": -427.76348876953125, - "logps/rejected": -595.1253051757812, - "loss": 0.4742, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2822277545928955, - "rewards/margins": 1.3352444171905518, - "rewards/rejected": -2.6174721717834473, - "step": 227 - }, - { - "epoch": 0.4550898203592814, - "grad_norm": 14.590880103789674, - "learning_rate": 3.322166616846458e-07, - "logits/chosen": -20.881481170654297, - "logits/rejected": -20.86432647705078, - "logps/chosen": -444.5021057128906, - "logps/rejected": -491.39251708984375, - "loss": 0.4437, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.5504581928253174, - "rewards/margins": 0.4571852684020996, - "rewards/rejected": -2.007643699645996, - "step": 228 - }, - { - "epoch": 0.45708582834331335, - "grad_norm": 12.92186538925852, - "learning_rate": 3.305664238076278e-07, - "logits/chosen": 
-20.525470733642578, - "logits/rejected": -19.75753402709961, - "logps/chosen": -395.13714599609375, - "logps/rejected": -545.7861938476562, - "loss": 0.4846, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.4842098951339722, - "rewards/margins": 1.059401512145996, - "rewards/rejected": -2.543611526489258, - "step": 229 - }, - { - "epoch": 0.4590818363273453, - "grad_norm": 14.842723434798721, - "learning_rate": 3.289122592367756e-07, - "logits/chosen": -19.929006576538086, - "logits/rejected": -19.978878021240234, - "logps/chosen": -541.9116821289062, - "logps/rejected": -634.8336181640625, - "loss": 0.4491, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7504769563674927, - "rewards/margins": 1.1156678199768066, - "rewards/rejected": -2.866144895553589, - "step": 230 - }, - { - "epoch": 0.46107784431137727, - "grad_norm": 14.4942978574615, - "learning_rate": 3.272542485937368e-07, - "logits/chosen": -19.461790084838867, - "logits/rejected": -19.085315704345703, - "logps/chosen": -499.6000061035156, - "logps/rejected": -576.95849609375, - "loss": 0.4646, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5696638822555542, - "rewards/margins": 0.9026892185211182, - "rewards/rejected": -2.472352981567383, - "step": 231 - }, - { - "epoch": 0.4630738522954092, - "grad_norm": 15.223995717750787, - "learning_rate": 3.2559247268761114e-07, - "logits/chosen": -18.554140090942383, - "logits/rejected": -18.188430786132812, - "logps/chosen": -400.5260009765625, - "logps/rejected": -457.10906982421875, - "loss": 0.4356, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.025871992111206, - "rewards/margins": 0.5790168642997742, - "rewards/rejected": -1.604888916015625, - "step": 232 - }, - { - "epoch": 0.46506986027944114, - "grad_norm": 13.813707701215902, - "learning_rate": 3.2392701251101167e-07, - "logits/chosen": -18.753156661987305, - "logits/rejected": -19.0374755859375, - "logps/chosen": -587.1029663085938, - "logps/rejected": -672.4597778320312, - "loss": 0.466, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9925709962844849, - "rewards/margins": 0.7200486660003662, - "rewards/rejected": -2.7126200199127197, - "step": 233 - }, - { - "epoch": 0.46706586826347307, - "grad_norm": 16.032336387205383, - "learning_rate": 3.222579492361179e-07, - "logits/chosen": -18.763916015625, - "logits/rejected": -19.015911102294922, - "logps/chosen": -566.2586059570312, - "logps/rejected": -674.83642578125, - "loss": 0.4563, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4943814277648926, - "rewards/margins": 1.242883563041687, - "rewards/rejected": -2.73726487159729, - "step": 234 - }, - { - "epoch": 0.469061876247505, - "grad_norm": 18.953609466395125, - "learning_rate": 3.2058536421071914e-07, - "logits/chosen": -18.818294525146484, - "logits/rejected": -18.358341217041016, - "logps/chosen": -554.9633178710938, - "logps/rejected": -607.111328125, - "loss": 0.5233, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9789049625396729, - "rewards/margins": 0.6192256212234497, - "rewards/rejected": -2.598130464553833, - "step": 235 - }, - { - "epoch": 0.47105788423153694, - "grad_norm": 16.02285470658808, - "learning_rate": 3.1890933895424976e-07, - "logits/chosen": -18.656795501708984, - "logits/rejected": -18.9710636138916, - "logps/chosen": -535.3847045898438, - "logps/rejected": -593.01708984375, - "loss": 0.5067, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8721699714660645, - "rewards/margins": 0.48781105875968933, - "rewards/rejected": -2.359980821609497, - 
"step": 236 - }, - { - "epoch": 0.47305389221556887, - "grad_norm": 14.053925538557477, - "learning_rate": 3.172299551538164e-07, - "logits/chosen": -18.687612533569336, - "logits/rejected": -18.866008758544922, - "logps/chosen": -703.1234130859375, - "logps/rejected": -821.7389526367188, - "loss": 0.4206, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.976850986480713, - "rewards/margins": 0.9709327220916748, - "rewards/rejected": -2.9477834701538086, - "step": 237 - }, - { - "epoch": 0.4750499001996008, - "grad_norm": 13.888076003211115, - "learning_rate": 3.155472946602162e-07, - "logits/chosen": -20.23816680908203, - "logits/rejected": -19.793060302734375, - "logps/chosen": -548.01025390625, - "logps/rejected": -643.1630249023438, - "loss": 0.4279, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6879640817642212, - "rewards/margins": 0.6913233399391174, - "rewards/rejected": -2.3792872428894043, - "step": 238 - }, - { - "epoch": 0.47704590818363274, - "grad_norm": 13.837763164300675, - "learning_rate": 3.1386143948394763e-07, - "logits/chosen": -20.161375045776367, - "logits/rejected": -20.032562255859375, - "logps/chosen": -454.22332763671875, - "logps/rejected": -519.0992431640625, - "loss": 0.4894, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2950642108917236, - "rewards/margins": 0.6701027750968933, - "rewards/rejected": -1.9651669263839722, - "step": 239 - }, - { - "epoch": 0.47904191616766467, - "grad_norm": 17.528357068059776, - "learning_rate": 3.121724717912138e-07, - "logits/chosen": -20.390954971313477, - "logits/rejected": -19.70508575439453, - "logps/chosen": -586.51708984375, - "logps/rejected": -618.0750732421875, - "loss": 0.4483, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.9998843669891357, - "rewards/margins": 0.6734704375267029, - "rewards/rejected": -2.6733546257019043, - "step": 240 - }, - { - "epoch": 0.4810379241516966, - "grad_norm": 14.196704035292845, - "learning_rate": 3.104804738999169e-07, - "logits/chosen": -18.2707576751709, - "logits/rejected": -18.1250057220459, - "logps/chosen": -601.682861328125, - "logps/rejected": -642.5757446289062, - "loss": 0.4509, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.0958642959594727, - "rewards/margins": 0.6518892049789429, - "rewards/rejected": -2.747753381729126, - "step": 241 - }, - { - "epoch": 0.48303393213572854, - "grad_norm": 16.89149711735207, - "learning_rate": 3.087855282756475e-07, - "logits/chosen": -19.373920440673828, - "logits/rejected": -19.25455093383789, - "logps/chosen": -558.441650390625, - "logps/rejected": -685.5426635742188, - "loss": 0.4812, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7334048748016357, - "rewards/margins": 1.1145367622375488, - "rewards/rejected": -2.8479413986206055, - "step": 242 - }, - { - "epoch": 0.48502994011976047, - "grad_norm": 17.843513648960016, - "learning_rate": 3.0708771752766395e-07, - "logits/chosen": -20.650711059570312, - "logits/rejected": -19.873567581176758, - "logps/chosen": -551.3946533203125, - "logps/rejected": -695.5105590820312, - "loss": 0.4222, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.690913200378418, - "rewards/margins": 1.3804028034210205, - "rewards/rejected": -3.0713162422180176, - "step": 243 - }, - { - "epoch": 0.4870259481037924, - "grad_norm": 14.705398106965339, - "learning_rate": 3.053871244048669e-07, - "logits/chosen": -19.685924530029297, - "logits/rejected": -20.568681716918945, - "logps/chosen": -460.550537109375, - "logps/rejected": -524.4147338867188, - "loss": 0.4803, 
- "rewards/accuracies": 0.5625, - "rewards/chosen": -1.4355939626693726, - "rewards/margins": 0.5603222846984863, - "rewards/rejected": -1.9959162473678589, - "step": 244 - }, - { - "epoch": 0.48902195608782434, - "grad_norm": 15.46176286439623, - "learning_rate": 3.036838317917658e-07, - "logits/chosen": -18.501953125, - "logits/rejected": -19.013107299804688, - "logps/chosen": -574.2833251953125, - "logps/rejected": -710.7667236328125, - "loss": 0.4301, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5793968439102173, - "rewards/margins": 1.549849033355713, - "rewards/rejected": -3.1292459964752197, - "step": 245 - }, - { - "epoch": 0.49101796407185627, - "grad_norm": 15.399821902790524, - "learning_rate": 3.0197792270443976e-07, - "logits/chosen": -20.143672943115234, - "logits/rejected": -19.312414169311523, - "logps/chosen": -465.90557861328125, - "logps/rejected": -530.7208251953125, - "loss": 0.4609, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.567827820777893, - "rewards/margins": 0.7017680406570435, - "rewards/rejected": -2.2695958614349365, - "step": 246 - }, - { - "epoch": 0.4930139720558882, - "grad_norm": 13.365354324692614, - "learning_rate": 3.002694802864912e-07, - "logits/chosen": -19.2493896484375, - "logits/rejected": -19.1527099609375, - "logps/chosen": -371.802490234375, - "logps/rejected": -477.4065246582031, - "loss": 0.414, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.1133661270141602, - "rewards/margins": 0.8846907019615173, - "rewards/rejected": -1.9980566501617432, - "step": 247 - }, - { - "epoch": 0.49500998003992014, - "grad_norm": 13.171743969163947, - "learning_rate": 2.98558587804993e-07, - "logits/chosen": -20.076560974121094, - "logits/rejected": -20.700763702392578, - "logps/chosen": -525.328125, - "logps/rejected": -625.5068359375, - "loss": 0.3772, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.616844654083252, - "rewards/margins": 1.0423572063446045, - "rewards/rejected": -2.6592018604278564, - "step": 248 - }, - { - "epoch": 0.49700598802395207, - "grad_norm": 13.46509740936646, - "learning_rate": 2.968453286464312e-07, - "logits/chosen": -20.32817840576172, - "logits/rejected": -19.739133834838867, - "logps/chosen": -538.3781127929688, - "logps/rejected": -731.992431640625, - "loss": 0.3985, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7721741199493408, - "rewards/margins": 1.9263598918914795, - "rewards/rejected": -3.6985340118408203, - "step": 249 - }, - { - "epoch": 0.499001996007984, - "grad_norm": 13.647026896565563, - "learning_rate": 2.9512978631264e-07, - "logits/chosen": -19.286752700805664, - "logits/rejected": -19.01239013671875, - "logps/chosen": -533.054931640625, - "logps/rejected": -629.1688232421875, - "loss": 0.4277, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.2967782020568848, - "rewards/margins": 1.074138879776001, - "rewards/rejected": -2.3709170818328857, - "step": 250 - }, - { - "epoch": 0.500998003992016, - "grad_norm": 12.433462674366762, - "learning_rate": 2.934120444167326e-07, - "logits/chosen": -19.06900978088379, - "logits/rejected": -19.819610595703125, - "logps/chosen": -459.6295166015625, - "logps/rejected": -622.0855102539062, - "loss": 0.4119, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.2915412187576294, - "rewards/margins": 1.6004011631011963, - "rewards/rejected": -2.8919425010681152, - "step": 251 - }, - { - "epoch": 0.5029940119760479, - "grad_norm": 13.330997971956275, - "learning_rate": 2.916921866790256e-07, - "logits/chosen": -19.483095169067383, - 
"logits/rejected": -19.165523529052734, - "logps/chosen": -636.4345703125, - "logps/rejected": -667.2681884765625, - "loss": 0.4478, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.135575294494629, - "rewards/margins": 0.516217827796936, - "rewards/rejected": -2.6517930030822754, - "step": 252 - }, - { - "epoch": 0.5049900199600799, - "grad_norm": 12.284544480667769, - "learning_rate": 2.899702969229587e-07, - "logits/chosen": -19.874605178833008, - "logits/rejected": -20.1215763092041, - "logps/chosen": -472.7792663574219, - "logps/rejected": -599.262939453125, - "loss": 0.3955, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4192883968353271, - "rewards/margins": 1.275083065032959, - "rewards/rejected": -2.694371461868286, - "step": 253 - }, - { - "epoch": 0.5069860279441117, - "grad_norm": 15.336351403927143, - "learning_rate": 2.8824645907100955e-07, - "logits/chosen": -20.512561798095703, - "logits/rejected": -19.61855697631836, - "logps/chosen": -571.308349609375, - "logps/rejected": -582.1721801757812, - "loss": 0.4667, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.6224905252456665, - "rewards/margins": 0.5911039710044861, - "rewards/rejected": -2.213594436645508, - "step": 254 - }, - { - "epoch": 0.5089820359281437, - "grad_norm": 14.878520589776237, - "learning_rate": 2.865207571406029e-07, - "logits/chosen": -19.37316131591797, - "logits/rejected": -19.89968490600586, - "logps/chosen": -524.258056640625, - "logps/rejected": -798.5974731445312, - "loss": 0.4355, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7529133558273315, - "rewards/margins": 2.135753870010376, - "rewards/rejected": -3.888667345046997, - "step": 255 - }, - { - "epoch": 0.5109780439121756, - "grad_norm": 16.354875819925365, - "learning_rate": 2.8479327524001633e-07, - "logits/chosen": -20.016483306884766, - "logits/rejected": -20.36672019958496, - "logps/chosen": -499.89056396484375, - "logps/rejected": -636.91943359375, - "loss": 0.4246, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5554924011230469, - "rewards/margins": 1.422688364982605, - "rewards/rejected": -2.9781811237335205, - "step": 256 - }, - { - "epoch": 0.5129740518962076, - "grad_norm": 20.065788599542866, - "learning_rate": 2.830640975642806e-07, - "logits/chosen": -21.746936798095703, - "logits/rejected": -20.40896987915039, - "logps/chosen": -654.2992553710938, - "logps/rejected": -704.3209838867188, - "loss": 0.4569, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.4051055908203125, - "rewards/margins": 0.7097451090812683, - "rewards/rejected": -3.1148507595062256, - "step": 257 - }, - { - "epoch": 0.5149700598802395, - "grad_norm": 15.50568263353288, - "learning_rate": 2.8133330839107604e-07, - "logits/chosen": -19.940874099731445, - "logits/rejected": -19.60299301147461, - "logps/chosen": -545.5746459960938, - "logps/rejected": -667.3504638671875, - "loss": 0.4183, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9626977443695068, - "rewards/margins": 1.1021788120269775, - "rewards/rejected": -3.0648765563964844, - "step": 258 - }, - { - "epoch": 0.5169660678642715, - "grad_norm": 12.835594737631492, - "learning_rate": 2.796009920766253e-07, - "logits/chosen": -19.640548706054688, - "logits/rejected": -19.253395080566406, - "logps/chosen": -555.4740600585938, - "logps/rejected": -731.1700439453125, - "loss": 0.4308, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6881296634674072, - "rewards/margins": 1.8761582374572754, - "rewards/rejected": -3.5642879009246826, - "step": 259 - }, - { - 
"epoch": 0.5189620758483033, - "grad_norm": 13.23490536958255, - "learning_rate": 2.7786723305158135e-07, - "logits/chosen": -19.513334274291992, - "logits/rejected": -19.781173706054688, - "logps/chosen": -438.3570556640625, - "logps/rejected": -544.1898193359375, - "loss": 0.3835, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.510940432548523, - "rewards/margins": 0.972527027130127, - "rewards/rejected": -2.4834673404693604, - "step": 260 - }, - { - "epoch": 0.5209580838323353, - "grad_norm": 14.163744381342129, - "learning_rate": 2.761321158169134e-07, - "logits/chosen": -19.43235206604004, - "logits/rejected": -19.761728286743164, - "logps/chosen": -659.931396484375, - "logps/rejected": -784.9754028320312, - "loss": 0.4556, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.1872830390930176, - "rewards/margins": 1.1087684631347656, - "rewards/rejected": -3.296051502227783, - "step": 261 - }, - { - "epoch": 0.5229540918163673, - "grad_norm": 15.369097445080758, - "learning_rate": 2.7439572493978737e-07, - "logits/chosen": -20.19454002380371, - "logits/rejected": -20.246570587158203, - "logps/chosen": -498.49896240234375, - "logps/rejected": -600.5203247070312, - "loss": 0.4218, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4001764059066772, - "rewards/margins": 1.2271099090576172, - "rewards/rejected": -2.627286434173584, - "step": 262 - }, - { - "epoch": 0.5249500998003992, - "grad_norm": 13.540074391202866, - "learning_rate": 2.726581450494451e-07, - "logits/chosen": -19.287321090698242, - "logits/rejected": -19.06431770324707, - "logps/chosen": -585.2114868164062, - "logps/rejected": -710.250732421875, - "loss": 0.4398, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.719393014907837, - "rewards/margins": 1.5113741159439087, - "rewards/rejected": -3.230767250061035, - "step": 263 - }, - { - "epoch": 0.5269461077844312, - "grad_norm": 16.14836459040014, - "learning_rate": 2.709194608330789e-07, - "logits/chosen": -19.105375289916992, - "logits/rejected": -19.293088912963867, - "logps/chosen": -679.0133056640625, - "logps/rejected": -880.2947998046875, - "loss": 0.4793, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.0702013969421387, - "rewards/margins": 1.9545456171035767, - "rewards/rejected": -4.024746894836426, - "step": 264 - }, - { - "epoch": 0.5289421157684631, - "grad_norm": 13.93014612652964, - "learning_rate": 2.6917975703170465e-07, - "logits/chosen": -20.631940841674805, - "logits/rejected": -20.239978790283203, - "logps/chosen": -551.091064453125, - "logps/rejected": -643.3578491210938, - "loss": 0.4321, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.8941808938980103, - "rewards/margins": 1.0745964050292969, - "rewards/rejected": -2.9687774181365967, - "step": 265 - }, - { - "epoch": 0.530938123752495, - "grad_norm": 13.940773373889629, - "learning_rate": 2.674391184360313e-07, - "logits/chosen": -20.835819244384766, - "logits/rejected": -20.55775260925293, - "logps/chosen": -579.5352783203125, - "logps/rejected": -668.495361328125, - "loss": 0.4576, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.4945411682128906, - "rewards/margins": 0.9422330856323242, - "rewards/rejected": -3.436774730682373, - "step": 266 - }, - { - "epoch": 0.5329341317365269, - "grad_norm": 15.347566297152046, - "learning_rate": 2.6569762988232837e-07, - "logits/chosen": -20.884387969970703, - "logits/rejected": -20.510915756225586, - "logps/chosen": -554.3668823242188, - "logps/rejected": -760.4378051757812, - "loss": 0.4118, - "rewards/accuracies": 
0.8125, - "rewards/chosen": -1.9943656921386719, - "rewards/margins": 1.6108964681625366, - "rewards/rejected": -3.605262041091919, - "step": 267 - }, - { - "epoch": 0.5349301397205589, - "grad_norm": 15.083964912398525, - "learning_rate": 2.63955376248291e-07, - "logits/chosen": -20.903093338012695, - "logits/rejected": -20.814924240112305, - "logps/chosen": -468.0397644042969, - "logps/rejected": -557.53173828125, - "loss": 0.4452, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.621910572052002, - "rewards/margins": 0.8261189460754395, - "rewards/rejected": -2.4480295181274414, - "step": 268 - }, - { - "epoch": 0.5369261477045908, - "grad_norm": 21.070254941679767, - "learning_rate": 2.6221244244890336e-07, - "logits/chosen": -19.996227264404297, - "logits/rejected": -19.785118103027344, - "logps/chosen": -508.9876708984375, - "logps/rejected": -666.5833740234375, - "loss": 0.5038, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7276027202606201, - "rewards/margins": 1.1843074560165405, - "rewards/rejected": -2.91191029548645, - "step": 269 - }, - { - "epoch": 0.5389221556886228, - "grad_norm": 13.987780368181504, - "learning_rate": 2.6046891343229986e-07, - "logits/chosen": -19.805574417114258, - "logits/rejected": -20.00739288330078, - "logps/chosen": -516.5226440429688, - "logps/rejected": -636.6571044921875, - "loss": 0.4289, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.874422311782837, - "rewards/margins": 1.1742898225784302, - "rewards/rejected": -3.0487117767333984, - "step": 270 - }, - { - "epoch": 0.5409181636726547, - "grad_norm": 16.160524902073316, - "learning_rate": 2.5872487417562527e-07, - "logits/chosen": -18.61543083190918, - "logits/rejected": -19.55666732788086, - "logps/chosen": -656.6258544921875, - "logps/rejected": -729.110595703125, - "loss": 0.4508, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.253310203552246, - "rewards/margins": 1.0833779573440552, - "rewards/rejected": -3.33668851852417, - "step": 271 - }, - { - "epoch": 0.5429141716566867, - "grad_norm": 16.13794066326181, - "learning_rate": 2.569804096808922e-07, - "logits/chosen": -19.252918243408203, - "logits/rejected": -19.608856201171875, - "logps/chosen": -544.688232421875, - "logps/rejected": -677.6744384765625, - "loss": 0.4402, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7599573135375977, - "rewards/margins": 1.116705060005188, - "rewards/rejected": -2.876662492752075, - "step": 272 - }, - { - "epoch": 0.5449101796407185, - "grad_norm": 15.251714021976639, - "learning_rate": 2.5523560497083924e-07, - "logits/chosen": -19.35736083984375, - "logits/rejected": -19.229040145874023, - "logps/chosen": -520.7670288085938, - "logps/rejected": -589.5751953125, - "loss": 0.4691, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5088708400726318, - "rewards/margins": 0.7264382839202881, - "rewards/rejected": -2.23530912399292, - "step": 273 - }, - { - "epoch": 0.5469061876247505, - "grad_norm": 14.735102764186708, - "learning_rate": 2.5349054508478635e-07, - "logits/chosen": -20.558042526245117, - "logits/rejected": -20.376361846923828, - "logps/chosen": -581.2306518554688, - "logps/rejected": -683.8033447265625, - "loss": 0.4404, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7012970447540283, - "rewards/margins": 1.082260012626648, - "rewards/rejected": -2.783557176589966, - "step": 274 - }, - { - "epoch": 0.5489021956087824, - "grad_norm": 15.97902029283524, - "learning_rate": 2.5174531507449037e-07, - "logits/chosen": -19.68293571472168, - 
"logits/rejected": -19.427963256835938, - "logps/chosen": -575.7290649414062, - "logps/rejected": -604.75634765625, - "loss": 0.4362, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.912692666053772, - "rewards/margins": 0.48665523529052734, - "rewards/rejected": -2.3993477821350098, - "step": 275 - }, - { - "epoch": 0.5508982035928144, - "grad_norm": 13.384588758075404, - "learning_rate": 2.5e-07, - "logits/chosen": -19.830730438232422, - "logits/rejected": -20.07389259338379, - "logps/chosen": -524.1917114257812, - "logps/rejected": -626.604736328125, - "loss": 0.4024, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6113872528076172, - "rewards/margins": 1.069969654083252, - "rewards/rejected": -2.681356906890869, - "step": 276 - }, - { - "epoch": 0.5528942115768463, - "grad_norm": 14.356201234303862, - "learning_rate": 2.482546849255096e-07, - "logits/chosen": -20.20763397216797, - "logits/rejected": -19.939579010009766, - "logps/chosen": -454.4245300292969, - "logps/rejected": -588.796630859375, - "loss": 0.4057, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4646457433700562, - "rewards/margins": 1.3196232318878174, - "rewards/rejected": -2.784268617630005, - "step": 277 - }, - { - "epoch": 0.5548902195608783, - "grad_norm": 14.969828094442494, - "learning_rate": 2.465094549152137e-07, - "logits/chosen": -19.216341018676758, - "logits/rejected": -19.786149978637695, - "logps/chosen": -622.8614501953125, - "logps/rejected": -837.9845581054688, - "loss": 0.3914, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.7940454483032227, - "rewards/margins": 1.845615267753601, - "rewards/rejected": -3.6396608352661133, - "step": 278 - }, - { - "epoch": 0.5568862275449101, - "grad_norm": 14.687976893363183, - "learning_rate": 2.447643950291608e-07, - "logits/chosen": -18.467819213867188, - "logits/rejected": -18.3898983001709, - "logps/chosen": -511.738037109375, - "logps/rejected": -543.1464233398438, - "loss": 0.4176, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5452011823654175, - "rewards/margins": 0.6293449997901917, - "rewards/rejected": -2.174546241760254, - "step": 279 - }, - { - "epoch": 0.5588822355289421, - "grad_norm": 13.84652065752045, - "learning_rate": 2.430195903191078e-07, - "logits/chosen": -19.583114624023438, - "logits/rejected": -19.34761619567871, - "logps/chosen": -517.0689697265625, - "logps/rejected": -610.4008178710938, - "loss": 0.4747, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4040230512619019, - "rewards/margins": 1.110020399093628, - "rewards/rejected": -2.5140433311462402, - "step": 280 - }, - { - "epoch": 0.5608782435129741, - "grad_norm": 16.303319683710235, - "learning_rate": 2.412751258243748e-07, - "logits/chosen": -19.688095092773438, - "logits/rejected": -19.690338134765625, - "logps/chosen": -671.603271484375, - "logps/rejected": -790.10205078125, - "loss": 0.4407, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.3799023628234863, - "rewards/margins": 0.8681031465530396, - "rewards/rejected": -3.2480056285858154, - "step": 281 - }, - { - "epoch": 0.562874251497006, - "grad_norm": 14.15798233649853, - "learning_rate": 2.395310865677001e-07, - "logits/chosen": -19.338281631469727, - "logits/rejected": -18.604310989379883, - "logps/chosen": -572.2766723632812, - "logps/rejected": -635.2128295898438, - "loss": 0.4257, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.1379201412200928, - "rewards/margins": 0.7467140555381775, - "rewards/rejected": -2.884634017944336, - "step": 282 - }, - { - "epoch": 
0.564870259481038, - "grad_norm": 40.962955478565384, - "learning_rate": 2.3778755755109667e-07, - "logits/chosen": -19.42969512939453, - "logits/rejected": -19.07590675354004, - "logps/chosen": -734.955810546875, - "logps/rejected": -692.2105102539062, - "loss": 0.5513, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.987337350845337, - "rewards/margins": -0.13384895026683807, - "rewards/rejected": -2.8534886837005615, - "step": 283 - }, - { - "epoch": 0.5668662674650699, - "grad_norm": 13.413480487115413, - "learning_rate": 2.3604462375170903e-07, - "logits/chosen": -20.070852279663086, - "logits/rejected": -20.444026947021484, - "logps/chosen": -625.17236328125, - "logps/rejected": -736.5753173828125, - "loss": 0.3945, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.200939655303955, - "rewards/margins": 1.2015022039413452, - "rewards/rejected": -3.4024417400360107, - "step": 284 - }, - { - "epoch": 0.5688622754491018, - "grad_norm": 15.606238042142097, - "learning_rate": 2.3430237011767164e-07, - "logits/chosen": -20.74646759033203, - "logits/rejected": -20.663923263549805, - "logps/chosen": -483.0484313964844, - "logps/rejected": -695.172119140625, - "loss": 0.4272, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.707973599433899, - "rewards/margins": 1.6547538042068481, - "rewards/rejected": -3.362727165222168, - "step": 285 - }, - { - "epoch": 0.5708582834331337, - "grad_norm": 14.723616978428565, - "learning_rate": 2.3256088156396868e-07, - "logits/chosen": -20.511024475097656, - "logits/rejected": -20.550968170166016, - "logps/chosen": -386.5881042480469, - "logps/rejected": -672.30224609375, - "loss": 0.4103, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3118516206741333, - "rewards/margins": 2.176816940307617, - "rewards/rejected": -3.488668918609619, - "step": 286 - }, - { - "epoch": 0.5728542914171657, - "grad_norm": 17.104594605209122, - "learning_rate": 2.3082024296829532e-07, - "logits/chosen": -20.359363555908203, - "logits/rejected": -19.974714279174805, - "logps/chosen": -381.4059143066406, - "logps/rejected": -490.9525146484375, - "loss": 0.4721, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4457169771194458, - "rewards/margins": 0.9019349813461304, - "rewards/rejected": -2.347651958465576, - "step": 287 - }, - { - "epoch": 0.5748502994011976, - "grad_norm": 12.348244088976356, - "learning_rate": 2.2908053916692116e-07, - "logits/chosen": -19.375926971435547, - "logits/rejected": -19.011024475097656, - "logps/chosen": -460.38519287109375, - "logps/rejected": -664.692626953125, - "loss": 0.3529, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.5625884532928467, - "rewards/margins": 1.8966760635375977, - "rewards/rejected": -3.4592647552490234, - "step": 288 - }, - { - "epoch": 0.5768463073852296, - "grad_norm": 18.43492666732753, - "learning_rate": 2.2734185495055498e-07, - "logits/chosen": -19.75726318359375, - "logits/rejected": -20.084196090698242, - "logps/chosen": -486.993408203125, - "logps/rejected": -599.3756103515625, - "loss": 0.4596, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9165252447128296, - "rewards/margins": 0.9002697467803955, - "rewards/rejected": -2.8167953491210938, - "step": 289 - }, - { - "epoch": 0.5788423153692615, - "grad_norm": 21.054407486526834, - "learning_rate": 2.2560427506021264e-07, - "logits/chosen": -21.07491111755371, - "logits/rejected": -20.739715576171875, - "logps/chosen": -507.4473571777344, - "logps/rejected": -605.4733276367188, - "loss": 0.4921, - "rewards/accuracies": 0.875, - 
"rewards/chosen": -1.7558261156082153, - "rewards/margins": 0.8664282560348511, - "rewards/rejected": -2.6222543716430664, - "step": 290 - }, - { - "epoch": 0.5808383233532934, - "grad_norm": 14.716476253084066, - "learning_rate": 2.2386788418308665e-07, - "logits/chosen": -19.136695861816406, - "logits/rejected": -19.36062240600586, - "logps/chosen": -466.1701354980469, - "logps/rejected": -597.2568359375, - "loss": 0.4432, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9274959564208984, - "rewards/margins": 0.9277093410491943, - "rewards/rejected": -2.8552052974700928, - "step": 291 - }, - { - "epoch": 0.5828343313373253, - "grad_norm": 14.446885493365698, - "learning_rate": 2.2213276694841865e-07, - "logits/chosen": -19.835556030273438, - "logits/rejected": -19.720577239990234, - "logps/chosen": -451.08441162109375, - "logps/rejected": -530.003662109375, - "loss": 0.4187, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.7320913076400757, - "rewards/margins": 0.7514263987541199, - "rewards/rejected": -2.483517646789551, - "step": 292 - }, - { - "epoch": 0.5848303393213573, - "grad_norm": 17.330269076521912, - "learning_rate": 2.2039900792337474e-07, - "logits/chosen": -21.39358901977539, - "logits/rejected": -21.413705825805664, - "logps/chosen": -463.1463623046875, - "logps/rejected": -583.657470703125, - "loss": 0.4415, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7131471633911133, - "rewards/margins": 1.0683386325836182, - "rewards/rejected": -2.7814857959747314, - "step": 293 - }, - { - "epoch": 0.5868263473053892, - "grad_norm": 14.007129468785923, - "learning_rate": 2.1866669160892389e-07, - "logits/chosen": -18.738121032714844, - "logits/rejected": -19.75026512145996, - "logps/chosen": -575.805419921875, - "logps/rejected": -824.0968017578125, - "loss": 0.4254, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.4536679983139038, - "rewards/margins": 2.2131245136260986, - "rewards/rejected": -3.666792392730713, - "step": 294 - }, - { - "epoch": 0.5888223552894212, - "grad_norm": 16.60108694572921, - "learning_rate": 2.1693590243571935e-07, - "logits/chosen": -19.9157657623291, - "logits/rejected": -20.169784545898438, - "logps/chosen": -573.23046875, - "logps/rejected": -665.9788818359375, - "loss": 0.3684, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9368046522140503, - "rewards/margins": 0.7889972925186157, - "rewards/rejected": -2.725801706314087, - "step": 295 - }, - { - "epoch": 0.590818363273453, - "grad_norm": 14.64139947604633, - "learning_rate": 2.152067247599837e-07, - "logits/chosen": -20.16104507446289, - "logits/rejected": -20.16781997680664, - "logps/chosen": -652.9393920898438, - "logps/rejected": -674.0302734375, - "loss": 0.4555, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5257301330566406, - "rewards/margins": 0.5331373810768127, - "rewards/rejected": -3.0588674545288086, - "step": 296 - }, - { - "epoch": 0.592814371257485, - "grad_norm": 15.25905759535438, - "learning_rate": 2.1347924285939712e-07, - "logits/chosen": -20.704172134399414, - "logits/rejected": -19.96450424194336, - "logps/chosen": -507.278564453125, - "logps/rejected": -505.9412841796875, - "loss": 0.4724, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6988550424575806, - "rewards/margins": 0.4168252944946289, - "rewards/rejected": -2.115680456161499, - "step": 297 - }, - { - "epoch": 0.5948103792415169, - "grad_norm": 19.931215357752848, - "learning_rate": 2.117535409289905e-07, - "logits/chosen": -21.177928924560547, - "logits/rejected": 
-21.209348678588867, - "logps/chosen": -413.62567138671875, - "logps/rejected": -724.2362060546875, - "loss": 0.4562, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4525749683380127, - "rewards/margins": 2.526947259902954, - "rewards/rejected": -3.979522228240967, - "step": 298 - }, - { - "epoch": 0.5968063872255489, - "grad_norm": 15.466070557361297, - "learning_rate": 2.100297030770413e-07, - "logits/chosen": -20.977407455444336, - "logits/rejected": -20.545534133911133, - "logps/chosen": -453.90838623046875, - "logps/rejected": -543.0181884765625, - "loss": 0.4821, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7299668788909912, - "rewards/margins": 0.792263388633728, - "rewards/rejected": -2.5222301483154297, - "step": 299 - }, - { - "epoch": 0.5988023952095808, - "grad_norm": 15.538644854464293, - "learning_rate": 2.0830781332097445e-07, - "logits/chosen": -20.28766441345215, - "logits/rejected": -19.859888076782227, - "logps/chosen": -495.1536560058594, - "logps/rejected": -636.5488891601562, - "loss": 0.4248, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4328924417495728, - "rewards/margins": 1.6545376777648926, - "rewards/rejected": -3.087430000305176, - "step": 300 - }, - { - "epoch": 0.6007984031936128, - "grad_norm": 15.125688011588869, - "learning_rate": 2.065879555832674e-07, - "logits/chosen": -19.8198299407959, - "logits/rejected": -19.781383514404297, - "logps/chosen": -614.9022216796875, - "logps/rejected": -763.259033203125, - "loss": 0.3997, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.6993727684020996, - "rewards/margins": 1.1797199249267578, - "rewards/rejected": -3.8790926933288574, - "step": 301 - }, - { - "epoch": 0.6027944111776448, - "grad_norm": 16.044735764949653, - "learning_rate": 2.0487021368736002e-07, - "logits/chosen": -19.610458374023438, - "logits/rejected": -19.683618545532227, - "logps/chosen": -607.406005859375, - "logps/rejected": -767.1768798828125, - "loss": 0.421, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.3819403648376465, - "rewards/margins": 1.3107446432113647, - "rewards/rejected": -3.692685127258301, - "step": 302 - }, - { - "epoch": 0.6047904191616766, - "grad_norm": 23.962926461003274, - "learning_rate": 2.0315467135356878e-07, - "logits/chosen": -20.334712982177734, - "logits/rejected": -20.212602615356445, - "logps/chosen": -501.9886779785156, - "logps/rejected": -551.6432495117188, - "loss": 0.4439, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7299950122833252, - "rewards/margins": 0.6993128061294556, - "rewards/rejected": -2.4293079376220703, - "step": 303 - }, - { - "epoch": 0.6067864271457086, - "grad_norm": 18.10881682443083, - "learning_rate": 2.0144141219500704e-07, - "logits/chosen": -18.936782836914062, - "logits/rejected": -19.05160140991211, - "logps/chosen": -524.8377075195312, - "logps/rejected": -604.0562133789062, - "loss": 0.4201, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5715314149856567, - "rewards/margins": 0.9257473945617676, - "rewards/rejected": -2.4972786903381348, - "step": 304 - }, - { - "epoch": 0.6087824351297405, - "grad_norm": 17.494266000385768, - "learning_rate": 1.9973051971350888e-07, - "logits/chosen": -19.74258804321289, - "logits/rejected": -19.830686569213867, - "logps/chosen": -574.1022338867188, - "logps/rejected": -671.13427734375, - "loss": 0.4209, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.01725435256958, - "rewards/margins": 0.9205825328826904, - "rewards/rejected": -2.9378366470336914, - "step": 305 - }, - { - "epoch": 
0.6107784431137725, - "grad_norm": 26.104189081412425, - "learning_rate": 1.980220772955602e-07, - "logits/chosen": -19.05501937866211, - "logits/rejected": -19.7154598236084, - "logps/chosen": -624.7476196289062, - "logps/rejected": -970.2384033203125, - "loss": 0.3823, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.797724962234497, - "rewards/margins": 2.543869972229004, - "rewards/rejected": -4.34159517288208, - "step": 306 - }, - { - "epoch": 0.6127744510978044, - "grad_norm": 16.989107574051708, - "learning_rate": 1.9631616820823418e-07, - "logits/chosen": -20.383739471435547, - "logits/rejected": -19.768543243408203, - "logps/chosen": -396.6142578125, - "logps/rejected": -431.7706298828125, - "loss": 0.5028, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.1728373765945435, - "rewards/margins": 0.7236466407775879, - "rewards/rejected": -1.8964838981628418, - "step": 307 - }, - { - "epoch": 0.6147704590818364, - "grad_norm": 12.968487062626416, - "learning_rate": 1.9461287559513318e-07, - "logits/chosen": -20.2695369720459, - "logits/rejected": -20.062299728393555, - "logps/chosen": -569.882568359375, - "logps/rejected": -713.83984375, - "loss": 0.3679, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.0745344161987305, - "rewards/margins": 1.5346819162368774, - "rewards/rejected": -3.6092166900634766, - "step": 308 - }, - { - "epoch": 0.6167664670658682, - "grad_norm": 16.448425075026044, - "learning_rate": 1.9291228247233603e-07, - "logits/chosen": -20.340492248535156, - "logits/rejected": -20.029369354248047, - "logps/chosen": -536.5447387695312, - "logps/rejected": -629.1495361328125, - "loss": 0.4095, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.9390649795532227, - "rewards/margins": 0.7921390533447266, - "rewards/rejected": -2.73120379447937, - "step": 309 - }, - { - "epoch": 0.6187624750499002, - "grad_norm": 14.227557105375345, - "learning_rate": 1.9121447172435248e-07, - "logits/chosen": -19.732128143310547, - "logits/rejected": -18.906469345092773, - "logps/chosen": -546.9534301757812, - "logps/rejected": -616.367431640625, - "loss": 0.3997, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.1328556537628174, - "rewards/margins": 0.8451984524726868, - "rewards/rejected": -2.9780540466308594, - "step": 310 - }, - { - "epoch": 0.6207584830339321, - "grad_norm": 18.990805165227112, - "learning_rate": 1.895195261000831e-07, - "logits/chosen": -19.93746566772461, - "logits/rejected": -18.98281478881836, - "logps/chosen": -620.0504150390625, - "logps/rejected": -729.6024169921875, - "loss": 0.4899, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.4116480350494385, - "rewards/margins": 1.04122793674469, - "rewards/rejected": -3.452876091003418, - "step": 311 - }, - { - "epoch": 0.6227544910179641, - "grad_norm": 16.417731370277682, - "learning_rate": 1.8782752820878633e-07, - "logits/chosen": -20.762357711791992, - "logits/rejected": -20.430248260498047, - "logps/chosen": -538.3031616210938, - "logps/rejected": -689.8028564453125, - "loss": 0.3585, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6533321142196655, - "rewards/margins": 1.5152018070220947, - "rewards/rejected": -3.16853404045105, - "step": 312 - }, - { - "epoch": 0.624750499001996, - "grad_norm": 16.28789801821355, - "learning_rate": 1.861385605160524e-07, - "logits/chosen": -20.3260440826416, - "logits/rejected": -20.39736557006836, - "logps/chosen": -523.057861328125, - "logps/rejected": -609.2344360351562, - "loss": 0.4389, - "rewards/accuracies": 0.8125, - 
"rewards/chosen": -1.9850291013717651, - "rewards/margins": 0.9323675036430359, - "rewards/rejected": -2.9173967838287354, - "step": 313 - }, - { - "epoch": 0.626746506986028, - "grad_norm": 15.824452080667786, - "learning_rate": 1.8445270533978386e-07, - "logits/chosen": -19.66412925720215, - "logits/rejected": -20.326953887939453, - "logps/chosen": -641.4557495117188, - "logps/rejected": -730.4495849609375, - "loss": 0.4671, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.851925849914551, - "rewards/margins": 0.7540739178657532, - "rewards/rejected": -3.6059999465942383, - "step": 314 - }, - { - "epoch": 0.6287425149700598, - "grad_norm": 13.834895066668928, - "learning_rate": 1.8277004484618357e-07, - "logits/chosen": -19.389610290527344, - "logits/rejected": -18.954692840576172, - "logps/chosen": -457.67523193359375, - "logps/rejected": -551.67822265625, - "loss": 0.4086, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5931004285812378, - "rewards/margins": 0.8220267295837402, - "rewards/rejected": -2.4151270389556885, - "step": 315 - }, - { - "epoch": 0.6307385229540918, - "grad_norm": 16.231623830784724, - "learning_rate": 1.810906610457502e-07, - "logits/chosen": -19.13648796081543, - "logits/rejected": -19.79732322692871, - "logps/chosen": -523.3819580078125, - "logps/rejected": -658.5681762695312, - "loss": 0.4065, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.1683220863342285, - "rewards/margins": 1.2013542652130127, - "rewards/rejected": -3.369676351547241, - "step": 316 - }, - { - "epoch": 0.6327345309381237, - "grad_norm": 16.28726347208119, - "learning_rate": 1.7941463578928083e-07, - "logits/chosen": -21.37050437927246, - "logits/rejected": -20.991573333740234, - "logps/chosen": -530.572021484375, - "logps/rejected": -699.7788696289062, - "loss": 0.5084, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0319900512695312, - "rewards/margins": 1.6102505922317505, - "rewards/rejected": -3.6422407627105713, - "step": 317 - }, - { - "epoch": 0.6347305389221557, - "grad_norm": 14.532870476828629, - "learning_rate": 1.7774205076388205e-07, - "logits/chosen": -20.120237350463867, - "logits/rejected": -20.24286651611328, - "logps/chosen": -584.264404296875, - "logps/rejected": -733.0006103515625, - "loss": 0.411, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.146928548812866, - "rewards/margins": 1.5503309965133667, - "rewards/rejected": -3.6972596645355225, - "step": 318 - }, - { - "epoch": 0.6367265469061876, - "grad_norm": 13.681025865009596, - "learning_rate": 1.760729874889884e-07, - "logits/chosen": -18.877016067504883, - "logits/rejected": -19.01573944091797, - "logps/chosen": -565.078125, - "logps/rejected": -804.6910400390625, - "loss": 0.398, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.9724178314208984, - "rewards/margins": 1.8352855443954468, - "rewards/rejected": -3.8077030181884766, - "step": 319 - }, - { - "epoch": 0.6387225548902196, - "grad_norm": 15.7704191898093, - "learning_rate": 1.744075273123889e-07, - "logits/chosen": -19.746397018432617, - "logits/rejected": -19.476728439331055, - "logps/chosen": -529.2221069335938, - "logps/rejected": -647.2295532226562, - "loss": 0.4649, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6059311628341675, - "rewards/margins": 1.3847919702529907, - "rewards/rejected": -2.990723133087158, - "step": 320 - }, - { - "epoch": 0.6407185628742516, - "grad_norm": 14.126841947457866, - "learning_rate": 1.7274575140626315e-07, - "logits/chosen": -20.280803680419922, - "logits/rejected": 
-20.373706817626953, - "logps/chosen": -603.7615356445312, - "logps/rejected": -756.317626953125, - "loss": 0.3544, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.087045431137085, - "rewards/margins": 1.3426419496536255, - "rewards/rejected": -3.429687261581421, - "step": 321 - }, - { - "epoch": 0.6427145708582834, - "grad_norm": 15.438444258816508, - "learning_rate": 1.710877407632244e-07, - "logits/chosen": -19.929384231567383, - "logits/rejected": -19.751436233520508, - "logps/chosen": -588.960693359375, - "logps/rejected": -618.856689453125, - "loss": 0.426, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.159414291381836, - "rewards/margins": 0.37070193886756897, - "rewards/rejected": -2.530116558074951, - "step": 322 - }, - { - "epoch": 0.6447105788423154, - "grad_norm": 14.55784121973574, - "learning_rate": 1.6943357619237225e-07, - "logits/chosen": -19.727588653564453, - "logits/rejected": -19.87441062927246, - "logps/chosen": -596.117919921875, - "logps/rejected": -776.1685180664062, - "loss": 0.3911, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8160898685455322, - "rewards/margins": 1.6515451669692993, - "rewards/rejected": -3.467634916305542, - "step": 323 - }, - { - "epoch": 0.6467065868263473, - "grad_norm": 13.660945117659038, - "learning_rate": 1.6778333831535417e-07, - "logits/chosen": -18.712209701538086, - "logits/rejected": -18.91518783569336, - "logps/chosen": -471.828369140625, - "logps/rejected": -498.8101501464844, - "loss": 0.4356, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.51183021068573, - "rewards/margins": 0.41257262229919434, - "rewards/rejected": -1.9244028329849243, - "step": 324 - }, - { - "epoch": 0.6487025948103793, - "grad_norm": 14.268490761675793, - "learning_rate": 1.6613710756243627e-07, - "logits/chosen": -20.252544403076172, - "logits/rejected": -19.898006439208984, - "logps/chosen": -504.5931396484375, - "logps/rejected": -638.2498168945312, - "loss": 0.3753, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.921488881111145, - "rewards/margins": 1.3233587741851807, - "rewards/rejected": -3.2448477745056152, - "step": 325 - }, - { - "epoch": 0.6506986027944112, - "grad_norm": 15.278109072504288, - "learning_rate": 1.6449496416858282e-07, - "logits/chosen": -20.0165958404541, - "logits/rejected": -19.995262145996094, - "logps/chosen": -691.3213500976562, - "logps/rejected": -679.55859375, - "loss": 0.4287, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.590402603149414, - "rewards/margins": 0.43861329555511475, - "rewards/rejected": -3.0290160179138184, - "step": 326 - }, - { - "epoch": 0.6526946107784432, - "grad_norm": 15.482749752170122, - "learning_rate": 1.6285698816954624e-07, - "logits/chosen": -18.340465545654297, - "logits/rejected": -18.536087036132812, - "logps/chosen": -747.0191650390625, - "logps/rejected": -799.79931640625, - "loss": 0.4452, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6365926265716553, - "rewards/margins": 0.8290849924087524, - "rewards/rejected": -3.465677261352539, - "step": 327 - }, - { - "epoch": 0.654690618762475, - "grad_norm": 39.62551655522353, - "learning_rate": 1.6122325939796578e-07, - "logits/chosen": -20.012954711914062, - "logits/rejected": -19.96734046936035, - "logps/chosen": -476.8362731933594, - "logps/rejected": -573.0825805664062, - "loss": 0.4182, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.8147090673446655, - "rewards/margins": 1.109825849533081, - "rewards/rejected": -2.924534797668457, - "step": 328 - }, - { - "epoch": 0.656686626746507, - 
"grad_norm": 13.896657489041019, - "learning_rate": 1.5959385747947695e-07, - "logits/chosen": -20.275821685791016, - "logits/rejected": -20.038957595825195, - "logps/chosen": -496.6339416503906, - "logps/rejected": -587.96630859375, - "loss": 0.4041, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7912815809249878, - "rewards/margins": 0.8887139558792114, - "rewards/rejected": -2.679995536804199, - "step": 329 - }, - { - "epoch": 0.6586826347305389, - "grad_norm": 14.121557644039507, - "learning_rate": 1.579688618288305e-07, - "logits/chosen": -19.625347137451172, - "logits/rejected": -19.073387145996094, - "logps/chosen": -505.35906982421875, - "logps/rejected": -594.4898681640625, - "loss": 0.4132, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7679235935211182, - "rewards/margins": 1.1094515323638916, - "rewards/rejected": -2.8773748874664307, - "step": 330 - }, - { - "epoch": 0.6606786427145709, - "grad_norm": 15.286311030110486, - "learning_rate": 1.5634835164602196e-07, - "logits/chosen": -20.13866424560547, - "logits/rejected": -20.248882293701172, - "logps/chosen": -440.9212646484375, - "logps/rejected": -538.68212890625, - "loss": 0.4262, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2733371257781982, - "rewards/margins": 1.1516056060791016, - "rewards/rejected": -2.4249427318573, - "step": 331 - }, - { - "epoch": 0.6626746506986028, - "grad_norm": 15.924081453100836, - "learning_rate": 1.5473240591243149e-07, - "logits/chosen": -20.336015701293945, - "logits/rejected": -19.631010055541992, - "logps/chosen": -538.7817993164062, - "logps/rejected": -691.6004028320312, - "loss": 0.4536, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.037027597427368, - "rewards/margins": 1.2660547494888306, - "rewards/rejected": -3.3030824661254883, - "step": 332 - }, - { - "epoch": 0.6646706586826348, - "grad_norm": 15.763551518922286, - "learning_rate": 1.5312110338697427e-07, - "logits/chosen": -20.395259857177734, - "logits/rejected": -20.275157928466797, - "logps/chosen": -472.280517578125, - "logps/rejected": -671.739501953125, - "loss": 0.4072, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.6096742153167725, - "rewards/margins": 1.9192960262298584, - "rewards/rejected": -3.528970241546631, - "step": 333 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 17.626302766622175, - "learning_rate": 1.5151452260226221e-07, - "logits/chosen": -20.2435302734375, - "logits/rejected": -20.33843994140625, - "logps/chosen": -616.4385375976562, - "logps/rejected": -653.4850463867188, - "loss": 0.429, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.501798629760742, - "rewards/margins": 0.9154990315437317, - "rewards/rejected": -3.417297840118408, - "step": 334 - }, - { - "epoch": 0.6686626746506986, - "grad_norm": 63.27015094274589, - "learning_rate": 1.4991274186077628e-07, - "logits/chosen": -19.318078994750977, - "logits/rejected": -18.871360778808594, - "logps/chosen": -683.593505859375, - "logps/rejected": -818.3577880859375, - "loss": 0.4233, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.2589824199676514, - "rewards/margins": 1.4071542024612427, - "rewards/rejected": -3.6661362648010254, - "step": 335 - }, - { - "epoch": 0.6706586826347305, - "grad_norm": 15.79033358862706, - "learning_rate": 1.4831583923104998e-07, - "logits/chosen": -19.07425308227539, - "logits/rejected": -19.29555320739746, - "logps/chosen": -735.476318359375, - "logps/rejected": -827.9088134765625, - "loss": 0.4393, - "rewards/accuracies": 0.8125, - "rewards/chosen": 
-2.4406747817993164, - "rewards/margins": 1.2893245220184326, - "rewards/rejected": -3.729999303817749, - "step": 336 - }, - { - "epoch": 0.6726546906187625, - "grad_norm": 17.61745253414518, - "learning_rate": 1.4672389254386457e-07, - "logits/chosen": -20.20433235168457, - "logits/rejected": -20.500490188598633, - "logps/chosen": -464.4842834472656, - "logps/rejected": -629.8096923828125, - "loss": 0.4282, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.407528042793274, - "rewards/margins": 1.6521190404891968, - "rewards/rejected": -3.059647560119629, - "step": 337 - }, - { - "epoch": 0.6746506986027944, - "grad_norm": 13.399050900409183, - "learning_rate": 1.451369793884557e-07, - "logits/chosen": -20.96167755126953, - "logits/rejected": -20.80148696899414, - "logps/chosen": -532.9072265625, - "logps/rejected": -853.7723388671875, - "loss": 0.4119, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.063987970352173, - "rewards/margins": 2.5306684970855713, - "rewards/rejected": -4.594656467437744, - "step": 338 - }, - { - "epoch": 0.6766467065868264, - "grad_norm": 14.221672502387845, - "learning_rate": 1.4355517710873182e-07, - "logits/chosen": -19.60066795349121, - "logits/rejected": -19.468841552734375, - "logps/chosen": -558.4423217773438, - "logps/rejected": -655.749267578125, - "loss": 0.3793, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9699652194976807, - "rewards/margins": 1.0505293607711792, - "rewards/rejected": -3.020494222640991, - "step": 339 - }, - { - "epoch": 0.6786427145708582, - "grad_norm": 14.369407606284954, - "learning_rate": 1.4197856279950437e-07, - "logits/chosen": -19.646114349365234, - "logits/rejected": -19.738296508789062, - "logps/chosen": -569.77392578125, - "logps/rejected": -649.2036743164062, - "loss": 0.4077, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.038227081298828, - "rewards/margins": 0.7805340886116028, - "rewards/rejected": -2.818761110305786, - "step": 340 - }, - { - "epoch": 0.6806387225548902, - "grad_norm": 14.850499236465488, - "learning_rate": 1.404072133027306e-07, - "logits/chosen": -21.245651245117188, - "logits/rejected": -20.8592529296875, - "logps/chosen": -528.7147216796875, - "logps/rejected": -652.779541015625, - "loss": 0.4052, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.072338342666626, - "rewards/margins": 1.2713418006896973, - "rewards/rejected": -3.343679904937744, - "step": 341 - }, - { - "epoch": 0.6826347305389222, - "grad_norm": 14.30640696385691, - "learning_rate": 1.388412052037682e-07, - "logits/chosen": -19.93825340270996, - "logits/rejected": -19.845121383666992, - "logps/chosen": -601.0985717773438, - "logps/rejected": -764.6405029296875, - "loss": 0.4366, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2762889862060547, - "rewards/margins": 1.2482759952545166, - "rewards/rejected": -3.524564743041992, - "step": 342 - }, - { - "epoch": 0.6846307385229541, - "grad_norm": 14.757223882174646, - "learning_rate": 1.3728061482764235e-07, - "logits/chosen": -20.238370895385742, - "logits/rejected": -19.90258026123047, - "logps/chosen": -501.98553466796875, - "logps/rejected": -651.3265380859375, - "loss": 0.4348, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6277523040771484, - "rewards/margins": 1.769914150238037, - "rewards/rejected": -3.3976664543151855, - "step": 343 - }, - { - "epoch": 0.6866267465069861, - "grad_norm": 13.524905585391163, - "learning_rate": 1.357255182353265e-07, - "logits/chosen": -19.395383834838867, - "logits/rejected": -19.172569274902344, - 
"logps/chosen": -535.5953369140625, - "logps/rejected": -589.7423706054688, - "loss": 0.3883, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7902706861495972, - "rewards/margins": 0.8703600764274597, - "rewards/rejected": -2.660630702972412, - "step": 344 - }, - { - "epoch": 0.688622754491018, - "grad_norm": 17.781027201144273, - "learning_rate": 1.341759912200346e-07, - "logits/chosen": -21.359405517578125, - "logits/rejected": -20.672643661499023, - "logps/chosen": -608.5810546875, - "logps/rejected": -678.4393310546875, - "loss": 0.495, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.7323977947235107, - "rewards/margins": 0.7647709846496582, - "rewards/rejected": -3.497169017791748, - "step": 345 - }, - { - "epoch": 0.6906187624750499, - "grad_norm": 15.715650307532133, - "learning_rate": 1.3263210930352737e-07, - "logits/chosen": -19.987049102783203, - "logits/rejected": -19.922182083129883, - "logps/chosen": -554.219482421875, - "logps/rejected": -680.3238525390625, - "loss": 0.4275, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.0270988941192627, - "rewards/margins": 1.0889631509780884, - "rewards/rejected": -3.1160621643066406, - "step": 346 - }, - { - "epoch": 0.6926147704590818, - "grad_norm": 15.43434324340175, - "learning_rate": 1.3109394773243115e-07, - "logits/chosen": -20.757238388061523, - "logits/rejected": -20.43233871459961, - "logps/chosen": -776.577880859375, - "logps/rejected": -881.4364624023438, - "loss": 0.4177, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.7323343753814697, - "rewards/margins": 1.1349782943725586, - "rewards/rejected": -4.867312431335449, - "step": 347 - }, - { - "epoch": 0.6946107784431138, - "grad_norm": 15.464400218085176, - "learning_rate": 1.2956158147457114e-07, - "logits/chosen": -19.5211124420166, - "logits/rejected": -19.82351303100586, - "logps/chosen": -670.1201782226562, - "logps/rejected": -733.384765625, - "loss": 0.4515, - "rewards/accuracies": 0.5625, - "rewards/chosen": -2.685201644897461, - "rewards/margins": 0.9089731574058533, - "rewards/rejected": -3.594175100326538, - "step": 348 - }, - { - "epoch": 0.6966067864271457, - "grad_norm": 13.245301947491186, - "learning_rate": 1.2803508521531677e-07, - "logits/chosen": -20.308961868286133, - "logits/rejected": -20.309410095214844, - "logps/chosen": -480.03692626953125, - "logps/rejected": -580.734375, - "loss": 0.3761, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.045827865600586, - "rewards/margins": 1.016439437866211, - "rewards/rejected": -3.062267303466797, - "step": 349 - }, - { - "epoch": 0.6986027944111777, - "grad_norm": 15.01774170684481, - "learning_rate": 1.265145333539423e-07, - "logits/chosen": -19.670490264892578, - "logits/rejected": -20.109798431396484, - "logps/chosen": -478.83978271484375, - "logps/rejected": -550.7167358398438, - "loss": 0.4211, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8340487480163574, - "rewards/margins": 0.8229854106903076, - "rewards/rejected": -2.657033920288086, - "step": 350 - }, - { - "epoch": 0.7005988023952096, - "grad_norm": 15.019733347079256, - "learning_rate": 1.2500000000000005e-07, - "logits/chosen": -20.130739212036133, - "logits/rejected": -20.091732025146484, - "logps/chosen": -595.3517456054688, - "logps/rejected": -649.5227661132812, - "loss": 0.4201, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.877355933189392, - "rewards/margins": 0.6590969562530518, - "rewards/rejected": -2.5364530086517334, - "step": 351 - }, - { - "epoch": 0.7025948103792415, - "grad_norm": 
15.114776277100695, - "learning_rate": 1.234915589697091e-07, - "logits/chosen": -20.528398513793945, - "logits/rejected": -20.764583587646484, - "logps/chosen": -487.8934326171875, - "logps/rejected": -634.1742553710938, - "loss": 0.4114, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5121595859527588, - "rewards/margins": 1.2231650352478027, - "rewards/rejected": -2.7353246212005615, - "step": 352 - }, - { - "epoch": 0.7045908183632734, - "grad_norm": 13.6256765847296, - "learning_rate": 1.2198928378235715e-07, - "logits/chosen": -19.799575805664062, - "logits/rejected": -19.218843460083008, - "logps/chosen": -591.076904296875, - "logps/rejected": -690.492919921875, - "loss": 0.4153, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.067112684249878, - "rewards/margins": 0.8536612391471863, - "rewards/rejected": -2.920773983001709, - "step": 353 - }, - { - "epoch": 0.7065868263473054, - "grad_norm": 15.080531725450582, - "learning_rate": 1.2049324765671747e-07, - "logits/chosen": -19.41089630126953, - "logits/rejected": -19.16267967224121, - "logps/chosen": -424.4087829589844, - "logps/rejected": -470.61358642578125, - "loss": 0.4255, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4005568027496338, - "rewards/margins": 0.6558549404144287, - "rewards/rejected": -2.0564117431640625, - "step": 354 - }, - { - "epoch": 0.7085828343313373, - "grad_norm": 15.234160764908268, - "learning_rate": 1.1900352350748024e-07, - "logits/chosen": -20.29700469970703, - "logits/rejected": -19.255632400512695, - "logps/chosen": -559.0546264648438, - "logps/rejected": -733.3956298828125, - "loss": 0.4042, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.0236408710479736, - "rewards/margins": 1.7938932180404663, - "rewards/rejected": -3.8175342082977295, - "step": 355 - }, - { - "epoch": 0.7105788423153693, - "grad_norm": 13.932645173640877, - "learning_rate": 1.175201839416988e-07, - "logits/chosen": -19.759885787963867, - "logits/rejected": -19.602773666381836, - "logps/chosen": -538.2057495117188, - "logps/rejected": -627.2590942382812, - "loss": 0.4119, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7387853860855103, - "rewards/margins": 1.22702956199646, - "rewards/rejected": -2.9658150672912598, - "step": 356 - }, - { - "epoch": 0.7125748502994012, - "grad_norm": 13.442801821898552, - "learning_rate": 1.1604330125525078e-07, - "logits/chosen": -21.212749481201172, - "logits/rejected": -20.86857795715332, - "logps/chosen": -528.8177490234375, - "logps/rejected": -667.633056640625, - "loss": 0.3941, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.0190160274505615, - "rewards/margins": 1.2670211791992188, - "rewards/rejected": -3.286036968231201, - "step": 357 - }, - { - "epoch": 0.7145708582834331, - "grad_norm": 14.006526788598732, - "learning_rate": 1.1457294742931506e-07, - "logits/chosen": -18.911666870117188, - "logits/rejected": -18.453922271728516, - "logps/chosen": -758.29443359375, - "logps/rejected": -801.0388793945312, - "loss": 0.428, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5985617637634277, - "rewards/margins": 0.4230879545211792, - "rewards/rejected": -3.0216493606567383, - "step": 358 - }, - { - "epoch": 0.716566866267465, - "grad_norm": 12.614257733242685, - "learning_rate": 1.1310919412686245e-07, - "logits/chosen": -20.433425903320312, - "logits/rejected": -20.836620330810547, - "logps/chosen": -476.3095397949219, - "logps/rejected": -570.728515625, - "loss": 0.4237, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7495883703231812, - 
"rewards/margins": 1.0333952903747559, - "rewards/rejected": -2.7829835414886475, - "step": 359 - }, - { - "epoch": 0.718562874251497, - "grad_norm": 12.909204559523792, - "learning_rate": 1.11652112689164e-07, - "logits/chosen": -20.962121963500977, - "logits/rejected": -20.562122344970703, - "logps/chosen": -440.939453125, - "logps/rejected": -604.5936889648438, - "loss": 0.4066, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.9059672355651855, - "rewards/margins": 1.3158305883407593, - "rewards/rejected": -3.2217981815338135, - "step": 360 - }, - { - "epoch": 0.720558882235529, - "grad_norm": 15.08942520449824, - "learning_rate": 1.1020177413231332e-07, - "logits/chosen": -18.539854049682617, - "logits/rejected": -18.9150447845459, - "logps/chosen": -570.9627685546875, - "logps/rejected": -645.0427856445312, - "loss": 0.4283, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.8263628482818604, - "rewards/margins": 0.8770366907119751, - "rewards/rejected": -2.703399658203125, - "step": 361 - }, - { - "epoch": 0.7225548902195609, - "grad_norm": 14.430827963578226, - "learning_rate": 1.0875824914376553e-07, - "logits/chosen": -19.89458465576172, - "logits/rejected": -19.718782424926758, - "logps/chosen": -523.622802734375, - "logps/rejected": -695.9458618164062, - "loss": 0.3941, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9959505796432495, - "rewards/margins": 1.525893211364746, - "rewards/rejected": -3.521843910217285, - "step": 362 - }, - { - "epoch": 0.7245508982035929, - "grad_norm": 16.868494488083627, - "learning_rate": 1.073216080788921e-07, - "logits/chosen": -19.86290168762207, - "logits/rejected": -20.225648880004883, - "logps/chosen": -545.2635498046875, - "logps/rejected": -563.8896484375, - "loss": 0.4354, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.3746070861816406, - "rewards/margins": 0.27832096815109253, - "rewards/rejected": -2.652928352355957, - "step": 363 - }, - { - "epoch": 0.7265469061876247, - "grad_norm": 13.034702828139887, - "learning_rate": 1.058919209575517e-07, - "logits/chosen": -19.361488342285156, - "logits/rejected": -19.3696346282959, - "logps/chosen": -430.3180847167969, - "logps/rejected": -602.2634887695312, - "loss": 0.3693, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5194342136383057, - "rewards/margins": 1.5140842199325562, - "rewards/rejected": -3.0335183143615723, - "step": 364 - }, - { - "epoch": 0.7285429141716567, - "grad_norm": 15.45308792548943, - "learning_rate": 1.0446925746067766e-07, - "logits/chosen": -19.043651580810547, - "logits/rejected": -17.862253189086914, - "logps/chosen": -552.1558227539062, - "logps/rejected": -660.024658203125, - "loss": 0.448, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.276698112487793, - "rewards/margins": 0.8926166892051697, - "rewards/rejected": -3.1693146228790283, - "step": 365 - }, - { - "epoch": 0.7305389221556886, - "grad_norm": 13.439945637326407, - "learning_rate": 1.0305368692688174e-07, - "logits/chosen": -19.70649528503418, - "logits/rejected": -20.203184127807617, - "logps/chosen": -549.0379638671875, - "logps/rejected": -643.2531127929688, - "loss": 0.4017, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.016526699066162, - "rewards/margins": 0.9828741550445557, - "rewards/rejected": -2.999401092529297, - "step": 366 - }, - { - "epoch": 0.7325349301397206, - "grad_norm": 16.270374860886132, - "learning_rate": 1.0164527834907466e-07, - "logits/chosen": -20.50018310546875, - "logits/rejected": -20.18191146850586, - "logps/chosen": 
-502.46173095703125, - "logps/rejected": -628.9412841796875, - "loss": 0.4483, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9385945796966553, - "rewards/margins": 1.0971810817718506, - "rewards/rejected": -3.0357754230499268, - "step": 367 - }, - { - "epoch": 0.7345309381237525, - "grad_norm": 17.691856671277147, - "learning_rate": 1.0024410037110356e-07, - "logits/chosen": -21.22451400756836, - "logits/rejected": -21.156221389770508, - "logps/chosen": -495.2363586425781, - "logps/rejected": -613.3316650390625, - "loss": 0.3957, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.790381669998169, - "rewards/margins": 1.1306113004684448, - "rewards/rejected": -2.920992851257324, - "step": 368 - }, - { - "epoch": 0.7365269461077845, - "grad_norm": 17.71870669147819, - "learning_rate": 9.885022128440629e-08, - "logits/chosen": -20.220726013183594, - "logits/rejected": -20.3509578704834, - "logps/chosen": -580.0787353515625, - "logps/rejected": -699.8051147460938, - "loss": 0.3823, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.1798484325408936, - "rewards/margins": 1.233259677886963, - "rewards/rejected": -3.4131078720092773, - "step": 369 - }, - { - "epoch": 0.7385229540918163, - "grad_norm": 13.967093829846899, - "learning_rate": 9.746370902468309e-08, - "logits/chosen": -19.74071502685547, - "logits/rejected": -19.97284507751465, - "logps/chosen": -559.4572143554688, - "logps/rejected": -725.63037109375, - "loss": 0.3773, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.8109045028686523, - "rewards/margins": 1.5691444873809814, - "rewards/rejected": -3.380049228668213, - "step": 370 - }, - { - "epoch": 0.7405189620758483, - "grad_norm": 15.427869059063354, - "learning_rate": 9.608463116858542e-08, - "logits/chosen": -20.12067413330078, - "logits/rejected": -20.155805587768555, - "logps/chosen": -467.1358642578125, - "logps/rejected": -623.1273193359375, - "loss": 0.4187, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.784905195236206, - "rewards/margins": 1.2622867822647095, - "rewards/rejected": -3.047191858291626, - "step": 371 - }, - { - "epoch": 0.7425149700598802, - "grad_norm": 14.788752344708447, - "learning_rate": 9.471305493042242e-08, - "logits/chosen": -19.2255859375, - "logits/rejected": -20.1333065032959, - "logps/chosen": -635.54736328125, - "logps/rejected": -853.2119140625, - "loss": 0.4114, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.3935046195983887, - "rewards/margins": 1.8072738647460938, - "rewards/rejected": -4.200778961181641, - "step": 372 - }, - { - "epoch": 0.7445109780439122, - "grad_norm": 17.495642850120454, - "learning_rate": 9.334904715888494e-08, - "logits/chosen": -20.933143615722656, - "logits/rejected": -20.832096099853516, - "logps/chosen": -411.86676025390625, - "logps/rejected": -503.8995666503906, - "loss": 0.4113, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.6132818460464478, - "rewards/margins": 1.026564121246338, - "rewards/rejected": -2.639845848083496, - "step": 373 - }, - { - "epoch": 0.7465069860279441, - "grad_norm": 16.02805046832174, - "learning_rate": 9.199267433378727e-08, - "logits/chosen": -20.639259338378906, - "logits/rejected": -20.546987533569336, - "logps/chosen": -523.702880859375, - "logps/rejected": -672.5869750976562, - "loss": 0.4, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8315907716751099, - "rewards/margins": 1.46940016746521, - "rewards/rejected": -3.3009910583496094, - "step": 374 - }, - { - "epoch": 0.7485029940119761, - "grad_norm": 15.457442027742408, - 
"learning_rate": 9.064400256282755e-08, - "logits/chosen": -20.409318923950195, - "logits/rejected": -19.4872989654541, - "logps/chosen": -578.9743041992188, - "logps/rejected": -649.5741577148438, - "loss": 0.389, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9740164279937744, - "rewards/margins": 0.8397660255432129, - "rewards/rejected": -2.8137826919555664, - "step": 375 - }, - { - "epoch": 0.7504990019960079, - "grad_norm": 13.600344719262973, - "learning_rate": 8.930309757836516e-08, - "logits/chosen": -21.005847930908203, - "logits/rejected": -20.700740814208984, - "logps/chosen": -519.6953735351562, - "logps/rejected": -564.7655639648438, - "loss": 0.4088, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.914379358291626, - "rewards/margins": 0.7501688599586487, - "rewards/rejected": -2.66454815864563, - "step": 376 - }, - { - "epoch": 0.7524950099800399, - "grad_norm": 16.875851886052402, - "learning_rate": 8.797002473421727e-08, - "logits/chosen": -20.600692749023438, - "logits/rejected": -20.884218215942383, - "logps/chosen": -666.4806518554688, - "logps/rejected": -765.6790771484375, - "loss": 0.4457, - "rewards/accuracies": 0.875, - "rewards/chosen": -3.1328279972076416, - "rewards/margins": 0.959656834602356, - "rewards/rejected": -4.092484951019287, - "step": 377 - }, - { - "epoch": 0.7544910179640718, - "grad_norm": 17.47565380119221, - "learning_rate": 8.664484900247363e-08, - "logits/chosen": -21.40182876586914, - "logits/rejected": -21.741256713867188, - "logps/chosen": -405.0975341796875, - "logps/rejected": -557.7005615234375, - "loss": 0.5039, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.402137041091919, - "rewards/margins": 1.5041407346725464, - "rewards/rejected": -2.906277894973755, - "step": 378 - }, - { - "epoch": 0.7564870259481038, - "grad_norm": 16.636183378171886, - "learning_rate": 8.532763497032986e-08, - "logits/chosen": -22.009544372558594, - "logits/rejected": -22.248573303222656, - "logps/chosen": -501.2933654785156, - "logps/rejected": -655.1139526367188, - "loss": 0.474, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6780036687850952, - "rewards/margins": 1.678252100944519, - "rewards/rejected": -3.3562557697296143, - "step": 379 - }, - { - "epoch": 0.7584830339321357, - "grad_norm": 14.487354680287742, - "learning_rate": 8.401844683693959e-08, - "logits/chosen": -20.488567352294922, - "logits/rejected": -20.180971145629883, - "logps/chosen": -434.21392822265625, - "logps/rejected": -620.0761108398438, - "loss": 0.4048, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.5215814113616943, - "rewards/margins": 1.4483155012130737, - "rewards/rejected": -2.9698970317840576, - "step": 380 - }, - { - "epoch": 0.7604790419161677, - "grad_norm": 16.309393285411627, - "learning_rate": 8.271734841028552e-08, - "logits/chosen": -21.052242279052734, - "logits/rejected": -20.410202026367188, - "logps/chosen": -530.8050537109375, - "logps/rejected": -799.7926635742188, - "loss": 0.4111, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.0542612075805664, - "rewards/margins": 2.227598190307617, - "rewards/rejected": -4.281859874725342, - "step": 381 - }, - { - "epoch": 0.7624750499001997, - "grad_norm": 15.18164269630782, - "learning_rate": 8.142440310406923e-08, - "logits/chosen": -20.607084274291992, - "logits/rejected": -20.46668815612793, - "logps/chosen": -568.8605346679688, - "logps/rejected": -732.0130615234375, - "loss": 0.3876, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2562711238861084, - "rewards/margins": 
1.3708157539367676, - "rewards/rejected": -3.627086877822876, - "step": 382 - }, - { - "epoch": 0.7644710578842315, - "grad_norm": 15.843870118454515, - "learning_rate": 8.013967393462093e-08, - "logits/chosen": -19.776172637939453, - "logits/rejected": -20.000160217285156, - "logps/chosen": -602.1343383789062, - "logps/rejected": -681.888671875, - "loss": 0.4666, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8674306869506836, - "rewards/margins": 0.9732953906059265, - "rewards/rejected": -2.840725898742676, - "step": 383 - }, - { - "epoch": 0.7664670658682635, - "grad_norm": 16.409007623679, - "learning_rate": 7.886322351782782e-08, - "logits/chosen": -19.91303825378418, - "logits/rejected": -19.690052032470703, - "logps/chosen": -669.9967041015625, - "logps/rejected": -714.0025634765625, - "loss": 0.3973, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.5477182865142822, - "rewards/margins": 0.8782477378845215, - "rewards/rejected": -3.4259660243988037, - "step": 384 - }, - { - "epoch": 0.7684630738522954, - "grad_norm": 16.487955137219046, - "learning_rate": 7.759511406608255e-08, - "logits/chosen": -20.416107177734375, - "logits/rejected": -20.148622512817383, - "logps/chosen": -500.3988952636719, - "logps/rejected": -588.0291748046875, - "loss": 0.4116, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.080512762069702, - "rewards/margins": 0.6933166980743408, - "rewards/rejected": -2.773829460144043, - "step": 385 - }, - { - "epoch": 0.7704590818363274, - "grad_norm": 12.101869383533744, - "learning_rate": 7.633540738525066e-08, - "logits/chosen": -20.018726348876953, - "logits/rejected": -20.265872955322266, - "logps/chosen": -630.1484375, - "logps/rejected": -860.3380126953125, - "loss": 0.3322, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.424983501434326, - "rewards/margins": 1.9725841283798218, - "rewards/rejected": -4.3975677490234375, - "step": 386 - }, - { - "epoch": 0.7724550898203593, - "grad_norm": 15.343874076897551, - "learning_rate": 7.508416487165862e-08, - "logits/chosen": -19.619956970214844, - "logits/rejected": -19.398181915283203, - "logps/chosen": -536.6625366210938, - "logps/rejected": -645.3226318359375, - "loss": 0.4061, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8896986246109009, - "rewards/margins": 1.0346204042434692, - "rewards/rejected": -2.92431902885437, - "step": 387 - }, - { - "epoch": 0.7744510978043913, - "grad_norm": 18.888211698764017, - "learning_rate": 7.384144750910132e-08, - "logits/chosen": -20.33465003967285, - "logits/rejected": -20.288190841674805, - "logps/chosen": -664.9688720703125, - "logps/rejected": -774.9567260742188, - "loss": 0.4651, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.5558035373687744, - "rewards/margins": 1.0801535844802856, - "rewards/rejected": -3.6359572410583496, - "step": 388 - }, - { - "epoch": 0.7764471057884231, - "grad_norm": 15.472053370205279, - "learning_rate": 7.260731586586982e-08, - "logits/chosen": -21.066429138183594, - "logits/rejected": -21.032623291015625, - "logps/chosen": -608.3118286132812, - "logps/rejected": -721.432861328125, - "loss": 0.4025, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.139827251434326, - "rewards/margins": 1.1138910055160522, - "rewards/rejected": -3.2537178993225098, - "step": 389 - }, - { - "epoch": 0.7784431137724551, - "grad_norm": 16.192414128371517, - "learning_rate": 7.138183009179921e-08, - "logits/chosen": -19.891376495361328, - "logits/rejected": -19.18803596496582, - "logps/chosen": -497.37030029296875, - 
"logps/rejected": -598.454345703125, - "loss": 0.4694, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.445165753364563, - "rewards/margins": 1.31147038936615, - "rewards/rejected": -2.756636142730713, - "step": 390 - }, - { - "epoch": 0.780439121756487, - "grad_norm": 12.312288434176681, - "learning_rate": 7.016504991533726e-08, - "logits/chosen": -20.122936248779297, - "logits/rejected": -19.829761505126953, - "logps/chosen": -536.530517578125, - "logps/rejected": -579.5504760742188, - "loss": 0.417, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.375357151031494, - "rewards/margins": 0.6135118007659912, - "rewards/rejected": -2.9888687133789062, - "step": 391 - }, - { - "epoch": 0.782435129740519, - "grad_norm": 14.868394634270913, - "learning_rate": 6.895703464063318e-08, - "logits/chosen": -20.536144256591797, - "logits/rejected": -20.883195877075195, - "logps/chosen": -415.7984619140625, - "logps/rejected": -509.4052734375, - "loss": 0.4212, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.4413790702819824, - "rewards/margins": 0.9446898102760315, - "rewards/rejected": -2.386068820953369, - "step": 392 - }, - { - "epoch": 0.7844311377245509, - "grad_norm": 12.507727828796943, - "learning_rate": 6.775784314464716e-08, - "logits/chosen": -19.6154727935791, - "logits/rejected": -20.296825408935547, - "logps/chosen": -599.3359375, - "logps/rejected": -793.7149658203125, - "loss": 0.3785, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.3708200454711914, - "rewards/margins": 1.5517452955245972, - "rewards/rejected": -3.922565460205078, - "step": 393 - }, - { - "epoch": 0.7864271457085829, - "grad_norm": 16.10552971520174, - "learning_rate": 6.656753387428088e-08, - "logits/chosen": -20.372236251831055, - "logits/rejected": -20.05748748779297, - "logps/chosen": -546.3995361328125, - "logps/rejected": -587.6973266601562, - "loss": 0.4116, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.1204771995544434, - "rewards/margins": 0.6429656744003296, - "rewards/rejected": -2.7634429931640625, - "step": 394 - }, - { - "epoch": 0.7884231536926147, - "grad_norm": 13.213009124377368, - "learning_rate": 6.538616484352902e-08, - "logits/chosen": -20.40473747253418, - "logits/rejected": -20.52154541015625, - "logps/chosen": -517.912841796875, - "logps/rejected": -728.2640991210938, - "loss": 0.3337, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.709905743598938, - "rewards/margins": 1.5683302879333496, - "rewards/rejected": -3.278235912322998, - "step": 395 - }, - { - "epoch": 0.7904191616766467, - "grad_norm": 15.982324852281675, - "learning_rate": 6.42137936306514e-08, - "logits/chosen": -20.089094161987305, - "logits/rejected": -19.609643936157227, - "logps/chosen": -516.9925537109375, - "logps/rejected": -605.076171875, - "loss": 0.3902, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7272955179214478, - "rewards/margins": 0.9996792078018188, - "rewards/rejected": -2.7269749641418457, - "step": 396 - }, - { - "epoch": 0.7924151696606786, - "grad_norm": 14.080014810725027, - "learning_rate": 6.305047737536707e-08, - "logits/chosen": -19.353548049926758, - "logits/rejected": -19.089752197265625, - "logps/chosen": -640.895751953125, - "logps/rejected": -749.78466796875, - "loss": 0.3926, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.3223495483398438, - "rewards/margins": 1.1758687496185303, - "rewards/rejected": -3.498218536376953, - "step": 397 - }, - { - "epoch": 0.7944111776447106, - "grad_norm": 15.67403057877649, - "learning_rate": 
6.189627277606893e-08, - "logits/chosen": -19.294418334960938, - "logits/rejected": -20.157922744750977, - "logps/chosen": -514.27490234375, - "logps/rejected": -710.0952758789062, - "loss": 0.4427, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.871453046798706, - "rewards/margins": 1.5966830253601074, - "rewards/rejected": -3.4681363105773926, - "step": 398 - }, - { - "epoch": 0.7964071856287425, - "grad_norm": 17.52367685410827, - "learning_rate": 6.075123608706093e-08, - "logits/chosen": -20.73387908935547, - "logits/rejected": -20.402624130249023, - "logps/chosen": -612.9823608398438, - "logps/rejected": -689.6102294921875, - "loss": 0.4862, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.1650824546813965, - "rewards/margins": 0.8385013937950134, - "rewards/rejected": -3.0035839080810547, - "step": 399 - }, - { - "epoch": 0.7984031936127745, - "grad_norm": 14.898718189895368, - "learning_rate": 5.961542311581585e-08, - "logits/chosen": -20.519458770751953, - "logits/rejected": -20.2457332611084, - "logps/chosen": -501.14654541015625, - "logps/rejected": -638.9808959960938, - "loss": 0.3868, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.750710129737854, - "rewards/margins": 1.4607923030853271, - "rewards/rejected": -3.2115025520324707, - "step": 400 - }, - { - "epoch": 0.8003992015968064, - "grad_norm": 14.068518132884332, - "learning_rate": 5.848888922025552e-08, - "logits/chosen": -19.87328338623047, - "logits/rejected": -19.895517349243164, - "logps/chosen": -479.4627685546875, - "logps/rejected": -598.6085815429688, - "loss": 0.3979, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.564753770828247, - "rewards/margins": 1.1341055631637573, - "rewards/rejected": -2.698859453201294, - "step": 401 - }, - { - "epoch": 0.8023952095808383, - "grad_norm": 15.767966201578563, - "learning_rate": 5.737168930605271e-08, - "logits/chosen": -21.595972061157227, - "logits/rejected": -21.527589797973633, - "logps/chosen": -449.24468994140625, - "logps/rejected": -512.97998046875, - "loss": 0.452, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.6318031549453735, - "rewards/margins": 0.8296475410461426, - "rewards/rejected": -2.4614505767822266, - "step": 402 - }, - { - "epoch": 0.8043912175648703, - "grad_norm": 30.339527885915878, - "learning_rate": 5.6263877823955115e-08, - "logits/chosen": -21.442642211914062, - "logits/rejected": -20.76935386657715, - "logps/chosen": -395.5703125, - "logps/rejected": -497.1997375488281, - "loss": 0.4302, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4801020622253418, - "rewards/margins": 0.9958056211471558, - "rewards/rejected": -2.475907802581787, - "step": 403 - }, - { - "epoch": 0.8063872255489022, - "grad_norm": 16.001479137202306, - "learning_rate": 5.516550876713141e-08, - "logits/chosen": -19.882049560546875, - "logits/rejected": -19.06197738647461, - "logps/chosen": -391.6086120605469, - "logps/rejected": -492.8154296875, - "loss": 0.4042, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.2724910974502563, - "rewards/margins": 1.1560478210449219, - "rewards/rejected": -2.4285390377044678, - "step": 404 - }, - { - "epoch": 0.8083832335329342, - "grad_norm": 14.24142266471028, - "learning_rate": 5.4076635668540065e-08, - "logits/chosen": -20.164100646972656, - "logits/rejected": -20.2769775390625, - "logps/chosen": -508.83251953125, - "logps/rejected": -776.5447998046875, - "loss": 0.3886, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.000663995742798, - "rewards/margins": 1.9279603958129883, - 
"rewards/rejected": -3.928624153137207, - "step": 405 - }, - { - "epoch": 0.810379241516966, - "grad_norm": 14.546560136389694, - "learning_rate": 5.299731159831952e-08, - "logits/chosen": -20.356386184692383, - "logits/rejected": -20.53049087524414, - "logps/chosen": -424.28167724609375, - "logps/rejected": -521.5020751953125, - "loss": 0.397, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.3355718851089478, - "rewards/margins": 0.9573720097541809, - "rewards/rejected": -2.2929439544677734, - "step": 406 - }, - { - "epoch": 0.812375249500998, - "grad_norm": 13.675060344355824, - "learning_rate": 5.192758916120235e-08, - "logits/chosen": -20.50723648071289, - "logits/rejected": -20.877363204956055, - "logps/chosen": -513.4188842773438, - "logps/rejected": -668.2745971679688, - "loss": 0.4191, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.7191814184188843, - "rewards/margins": 1.512561559677124, - "rewards/rejected": -3.2317428588867188, - "step": 407 - }, - { - "epoch": 0.8143712574850299, - "grad_norm": 15.788785978229196, - "learning_rate": 5.086752049395093e-08, - "logits/chosen": -20.162742614746094, - "logits/rejected": -19.610361099243164, - "logps/chosen": -496.881103515625, - "logps/rejected": -705.387939453125, - "loss": 0.3645, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.958635926246643, - "rewards/margins": 1.9726451635360718, - "rewards/rejected": -3.931281089782715, - "step": 408 - }, - { - "epoch": 0.8163672654690619, - "grad_norm": 15.5751557671111, - "learning_rate": 4.981715726281666e-08, - "logits/chosen": -20.163679122924805, - "logits/rejected": -20.057456970214844, - "logps/chosen": -631.61767578125, - "logps/rejected": -792.810302734375, - "loss": 0.4698, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.7149715423583984, - "rewards/margins": 1.2157440185546875, - "rewards/rejected": -3.930715560913086, - "step": 409 - }, - { - "epoch": 0.8183632734530938, - "grad_norm": 13.408157070752505, - "learning_rate": 4.8776550661021484e-08, - "logits/chosen": -20.862926483154297, - "logits/rejected": -20.99250030517578, - "logps/chosen": -529.020263671875, - "logps/rejected": -702.3614501953125, - "loss": 0.3629, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.0231356620788574, - "rewards/margins": 1.8261414766311646, - "rewards/rejected": -3.8492770195007324, - "step": 410 - }, - { - "epoch": 0.8203592814371258, - "grad_norm": 18.264216697145486, - "learning_rate": 4.774575140626316e-08, - "logits/chosen": -20.707500457763672, - "logits/rejected": -19.66729736328125, - "logps/chosen": -489.5726623535156, - "logps/rejected": -574.4754028320312, - "loss": 0.4102, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.770430564880371, - "rewards/margins": 0.9761725664138794, - "rewards/rejected": -2.746603488922119, - "step": 411 - }, - { - "epoch": 0.8223552894211577, - "grad_norm": 17.876984045214385, - "learning_rate": 4.672480973824311e-08, - "logits/chosen": -19.217308044433594, - "logits/rejected": -20.45569610595703, - "logps/chosen": -514.0562744140625, - "logps/rejected": -655.2940673828125, - "loss": 0.4146, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.8182786703109741, - "rewards/margins": 1.2063030004501343, - "rewards/rejected": -3.0245816707611084, - "step": 412 - }, - { - "epoch": 0.8243512974051896, - "grad_norm": 15.630413650700229, - "learning_rate": 4.5713775416217875e-08, - "logits/chosen": -22.35773468017578, - "logits/rejected": -21.519878387451172, - "logps/chosen": -537.8572387695312, - "logps/rejected": 
-665.5428466796875, - "loss": 0.4058, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.3368654251098633, - "rewards/margins": 1.2681982517242432, - "rewards/rejected": -3.6050634384155273, - "step": 413 - }, - { - "epoch": 0.8263473053892215, - "grad_norm": 13.210770445042035, - "learning_rate": 4.471269771657399e-08, - "logits/chosen": -20.452251434326172, - "logits/rejected": -20.73741912841797, - "logps/chosen": -586.2193603515625, - "logps/rejected": -735.9384155273438, - "loss": 0.3872, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.931977391242981, - "rewards/margins": 1.634932041168213, - "rewards/rejected": -3.5669093132019043, - "step": 414 - }, - { - "epoch": 0.8283433133732535, - "grad_norm": 16.73411677249594, - "learning_rate": 4.372162543042623e-08, - "logits/chosen": -19.061689376831055, - "logits/rejected": -19.270170211791992, - "logps/chosen": -577.607666015625, - "logps/rejected": -750.1259155273438, - "loss": 0.4336, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3514492511749268, - "rewards/margins": 1.1840051412582397, - "rewards/rejected": -3.535454511642456, - "step": 415 - }, - { - "epoch": 0.8303393213572854, - "grad_norm": 17.36364004961067, - "learning_rate": 4.2740606861239594e-08, - "logits/chosen": -20.45831298828125, - "logits/rejected": -20.373794555664062, - "logps/chosen": -552.2564697265625, - "logps/rejected": -767.511474609375, - "loss": 0.41, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.2376856803894043, - "rewards/margins": 1.532149314880371, - "rewards/rejected": -3.7698352336883545, - "step": 416 - }, - { - "epoch": 0.8323353293413174, - "grad_norm": 15.880407235617247, - "learning_rate": 4.176968982247514e-08, - "logits/chosen": -20.106103897094727, - "logits/rejected": -19.89539337158203, - "logps/chosen": -449.50616455078125, - "logps/rejected": -572.6887817382812, - "loss": 0.4384, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4362719058990479, - "rewards/margins": 1.1138153076171875, - "rewards/rejected": -2.5500874519348145, - "step": 417 - }, - { - "epoch": 0.8343313373253493, - "grad_norm": 14.27517499963331, - "learning_rate": 4.080892163525959e-08, - "logits/chosen": -19.676637649536133, - "logits/rejected": -19.48198127746582, - "logps/chosen": -779.2783203125, - "logps/rejected": -857.2685546875, - "loss": 0.3944, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.660714626312256, - "rewards/margins": 0.9152790904045105, - "rewards/rejected": -3.575993776321411, - "step": 418 - }, - { - "epoch": 0.8363273453093812, - "grad_norm": 14.114737949930701, - "learning_rate": 3.9858349126078936e-08, - "logits/chosen": -20.25637435913086, - "logits/rejected": -19.62478256225586, - "logps/chosen": -432.681640625, - "logps/rejected": -700.2333374023438, - "loss": 0.3826, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7364020347595215, - "rewards/margins": 2.0794200897216797, - "rewards/rejected": -3.815821886062622, - "step": 419 - }, - { - "epoch": 0.8383233532934131, - "grad_norm": 12.721830425267019, - "learning_rate": 3.8918018624496286e-08, - "logits/chosen": -20.793569564819336, - "logits/rejected": -19.76634979248047, - "logps/chosen": -648.701904296875, - "logps/rejected": -802.005615234375, - "loss": 0.3583, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.319666624069214, - "rewards/margins": 1.9068641662597656, - "rewards/rejected": -4.226531028747559, - "step": 420 - }, - { - "epoch": 0.8403193612774451, - "grad_norm": 15.112054065991872, - "learning_rate": 3.798797596089351e-08, - 
"logits/chosen": -21.272022247314453, - "logits/rejected": -20.976158142089844, - "logps/chosen": -696.375244140625, - "logps/rejected": -798.5341796875, - "loss": 0.3917, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.075582981109619, - "rewards/margins": 0.944496750831604, - "rewards/rejected": -4.020079612731934, - "step": 421 - }, - { - "epoch": 0.8423153692614771, - "grad_norm": 12.687066965454209, - "learning_rate": 3.706826646423808e-08, - "logits/chosen": -20.454601287841797, - "logits/rejected": -20.02032470703125, - "logps/chosen": -635.1240234375, - "logps/rejected": -789.3438110351562, - "loss": 0.3582, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.158295154571533, - "rewards/margins": 1.379619836807251, - "rewards/rejected": -3.5379152297973633, - "step": 422 - }, - { - "epoch": 0.844311377245509, - "grad_norm": 16.432472773274338, - "learning_rate": 3.615893495987335e-08, - "logits/chosen": -19.735931396484375, - "logits/rejected": -19.114166259765625, - "logps/chosen": -492.92498779296875, - "logps/rejected": -583.1378784179688, - "loss": 0.4298, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.1515309810638428, - "rewards/margins": 0.9132018685340881, - "rewards/rejected": -3.064732789993286, - "step": 423 - }, - { - "epoch": 0.846307385229541, - "grad_norm": 16.84315735095117, - "learning_rate": 3.526002576733389e-08, - "logits/chosen": -17.177181243896484, - "logits/rejected": -17.286401748657227, - "logps/chosen": -703.1956787109375, - "logps/rejected": -817.9109497070312, - "loss": 0.4633, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.135676145553589, - "rewards/margins": 1.490731954574585, - "rewards/rejected": -3.626408100128174, - "step": 424 - }, - { - "epoch": 0.8483033932135728, - "grad_norm": 15.418927002727168, - "learning_rate": 3.437158269818563e-08, - "logits/chosen": -20.871904373168945, - "logits/rejected": -20.74217987060547, - "logps/chosen": -640.6961669921875, - "logps/rejected": -753.02734375, - "loss": 0.4238, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.382235050201416, - "rewards/margins": 1.1841036081314087, - "rewards/rejected": -3.5663387775421143, - "step": 425 - }, - { - "epoch": 0.8502994011976048, - "grad_norm": 16.8898208426858, - "learning_rate": 3.349364905389032e-08, - "logits/chosen": -20.90660858154297, - "logits/rejected": -20.36172866821289, - "logps/chosen": -616.0280151367188, - "logps/rejected": -636.6360473632812, - "loss": 0.4623, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.402761936187744, - "rewards/margins": 0.360880970954895, - "rewards/rejected": -2.7636427879333496, - "step": 426 - }, - { - "epoch": 0.8522954091816367, - "grad_norm": 16.38668038544721, - "learning_rate": 3.262626762369525e-08, - "logits/chosen": -19.73676109313965, - "logits/rejected": -20.069122314453125, - "logps/chosen": -571.5435180664062, - "logps/rejected": -691.9410400390625, - "loss": 0.3796, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.097475528717041, - "rewards/margins": 1.2289046049118042, - "rewards/rejected": -3.3263802528381348, - "step": 427 - }, - { - "epoch": 0.8542914171656687, - "grad_norm": 15.599949297352975, - "learning_rate": 3.176948068254762e-08, - "logits/chosen": -20.783435821533203, - "logits/rejected": -19.67549705505371, - "logps/chosen": -560.0304565429688, - "logps/rejected": -661.19384765625, - "loss": 0.4255, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.0766711235046387, - "rewards/margins": 0.9344357252120972, - "rewards/rejected": -3.0111067295074463, - 
"step": 428 - }, - { - "epoch": 0.8562874251497006, - "grad_norm": 13.882191831008232, - "learning_rate": 3.092332998903416e-08, - "logits/chosen": -19.56271743774414, - "logits/rejected": -20.0837345123291, - "logps/chosen": -551.2194213867188, - "logps/rejected": -703.2940673828125, - "loss": 0.3956, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9828118085861206, - "rewards/margins": 1.4499356746673584, - "rewards/rejected": -3.4327471256256104, - "step": 429 - }, - { - "epoch": 0.8582834331337326, - "grad_norm": 25.98507197118227, - "learning_rate": 3.008785678334591e-08, - "logits/chosen": -20.70747947692871, - "logits/rejected": -20.464248657226562, - "logps/chosen": -567.925048828125, - "logps/rejected": -619.5768432617188, - "loss": 0.5152, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.5486812591552734, - "rewards/margins": 0.4738078713417053, - "rewards/rejected": -3.022488832473755, - "step": 430 - }, - { - "epoch": 0.8602794411177644, - "grad_norm": 15.82014776030212, - "learning_rate": 2.9263101785268252e-08, - "logits/chosen": -20.147571563720703, - "logits/rejected": -20.304765701293945, - "logps/chosen": -585.3394165039062, - "logps/rejected": -850.2744140625, - "loss": 0.4136, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.9204425811767578, - "rewards/margins": 1.9781417846679688, - "rewards/rejected": -3.8985838890075684, - "step": 431 - }, - { - "epoch": 0.8622754491017964, - "grad_norm": 14.556599943488065, - "learning_rate": 2.8449105192196315e-08, - "logits/chosen": -19.850107192993164, - "logits/rejected": -19.554677963256836, - "logps/chosen": -483.4244384765625, - "logps/rejected": -699.2830810546875, - "loss": 0.3998, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.5225160121917725, - "rewards/margins": 2.198775291442871, - "rewards/rejected": -3.7212913036346436, - "step": 432 - }, - { - "epoch": 0.8642714570858283, - "grad_norm": 17.92172824988793, - "learning_rate": 2.764590667717562e-08, - "logits/chosen": -20.4770565032959, - "logits/rejected": -20.09276580810547, - "logps/chosen": -717.6397094726562, - "logps/rejected": -787.7511596679688, - "loss": 0.4979, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.572091579437256, - "rewards/margins": 0.8177656531333923, - "rewards/rejected": -3.389857530593872, - "step": 433 - }, - { - "epoch": 0.8662674650698603, - "grad_norm": 17.749347080565766, - "learning_rate": 2.68535453869686e-08, - "logits/chosen": -20.485157012939453, - "logits/rejected": -20.50052261352539, - "logps/chosen": -534.2379150390625, - "logps/rejected": -607.3992919921875, - "loss": 0.4416, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.946094036102295, - "rewards/margins": 0.8893178105354309, - "rewards/rejected": -2.835411787033081, - "step": 434 - }, - { - "epoch": 0.8682634730538922, - "grad_norm": 15.402832369660697, - "learning_rate": 2.6072059940146772e-08, - "logits/chosen": -21.266874313354492, - "logits/rejected": -21.090681076049805, - "logps/chosen": -527.8031005859375, - "logps/rejected": -675.1356201171875, - "loss": 0.4451, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8210252523422241, - "rewards/margins": 1.4943686723709106, - "rewards/rejected": -3.3153936862945557, - "step": 435 - }, - { - "epoch": 0.8702594810379242, - "grad_norm": 14.619711241287206, - "learning_rate": 2.5301488425208296e-08, - "logits/chosen": -19.8941707611084, - "logits/rejected": -20.014163970947266, - "logps/chosen": -527.5570678710938, - "logps/rejected": -705.1380004882812, - "loss": 0.4377, - 
"rewards/accuracies": 0.875, - "rewards/chosen": -1.890594482421875, - "rewards/margins": 1.558457851409912, - "rewards/rejected": -3.449052333831787, - "step": 436 - }, - { - "epoch": 0.872255489021956, - "grad_norm": 17.364445387102048, - "learning_rate": 2.4541868398721576e-08, - "logits/chosen": -19.487449645996094, - "logits/rejected": -19.69962501525879, - "logps/chosen": -566.1126098632812, - "logps/rejected": -720.592041015625, - "loss": 0.4279, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.260136842727661, - "rewards/margins": 1.0848456621170044, - "rewards/rejected": -3.344982862472534, - "step": 437 - }, - { - "epoch": 0.874251497005988, - "grad_norm": 13.895536448492699, - "learning_rate": 2.379323688349516e-08, - "logits/chosen": -21.22002410888672, - "logits/rejected": -20.869359970092773, - "logps/chosen": -695.900634765625, - "logps/rejected": -889.2630615234375, - "loss": 0.3745, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.888676404953003, - "rewards/margins": 2.137964963912964, - "rewards/rejected": -5.026641845703125, - "step": 438 - }, - { - "epoch": 0.8762475049900199, - "grad_norm": 15.62553041470151, - "learning_rate": 2.3055630366772856e-08, - "logits/chosen": -20.140289306640625, - "logits/rejected": -20.757360458374023, - "logps/chosen": -782.4682006835938, - "logps/rejected": -996.408203125, - "loss": 0.4253, - "rewards/accuracies": 0.9375, - "rewards/chosen": -3.031949758529663, - "rewards/margins": 1.7247015237808228, - "rewards/rejected": -4.756651401519775, - "step": 439 - }, - { - "epoch": 0.8782435129740519, - "grad_norm": 17.56982859158523, - "learning_rate": 2.2329084798455745e-08, - "logits/chosen": -20.021549224853516, - "logits/rejected": -19.943166732788086, - "logps/chosen": -569.3162231445312, - "logps/rejected": -720.0825805664062, - "loss": 0.4424, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7821518182754517, - "rewards/margins": 1.2261866331100464, - "rewards/rejected": -3.008338451385498, - "step": 440 - }, - { - "epoch": 0.8802395209580839, - "grad_norm": 14.376197989084849, - "learning_rate": 2.1613635589349756e-08, - "logits/chosen": -20.313098907470703, - "logits/rejected": -20.171749114990234, - "logps/chosen": -463.3466796875, - "logps/rejected": -586.16455078125, - "loss": 0.4191, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8441109657287598, - "rewards/margins": 0.97612464427948, - "rewards/rejected": -2.8202357292175293, - "step": 441 - }, - { - "epoch": 0.8822355289421158, - "grad_norm": 19.86496417620754, - "learning_rate": 2.090931760944009e-08, - "logits/chosen": -19.7161865234375, - "logits/rejected": -19.880416870117188, - "logps/chosen": -543.35693359375, - "logps/rejected": -787.716064453125, - "loss": 0.445, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6974327564239502, - "rewards/margins": 2.214301824569702, - "rewards/rejected": -3.9117345809936523, - "step": 442 - }, - { - "epoch": 0.8842315369261478, - "grad_norm": 16.399191622441986, - "learning_rate": 2.0216165186191404e-08, - "logits/chosen": -20.813325881958008, - "logits/rejected": -21.052209854125977, - "logps/chosen": -457.1396484375, - "logps/rejected": -594.8156127929688, - "loss": 0.4033, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5977610349655151, - "rewards/margins": 1.061667561531067, - "rewards/rejected": -2.659428834915161, - "step": 443 - }, - { - "epoch": 0.8862275449101796, - "grad_norm": 21.256078717009572, - "learning_rate": 1.9534212102874897e-08, - "logits/chosen": -20.78481674194336, - 
"logits/rejected": -20.350690841674805, - "logps/chosen": -379.0267333984375, - "logps/rejected": -507.3790588378906, - "loss": 0.5203, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.342707872390747, - "rewards/margins": 1.0672755241394043, - "rewards/rejected": -2.4099833965301514, - "step": 444 - }, - { - "epoch": 0.8882235528942116, - "grad_norm": 15.981558889035352, - "learning_rate": 1.8863491596921743e-08, - "logits/chosen": -21.123565673828125, - "logits/rejected": -21.21666717529297, - "logps/chosen": -575.4490356445312, - "logps/rejected": -765.37451171875, - "loss": 0.4178, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.0760347843170166, - "rewards/margins": 1.7245965003967285, - "rewards/rejected": -3.800631523132324, - "step": 445 - }, - { - "epoch": 0.8902195608782435, - "grad_norm": 17.42782380961723, - "learning_rate": 1.8204036358303172e-08, - "logits/chosen": -20.571834564208984, - "logits/rejected": -19.874500274658203, - "logps/chosen": -624.26806640625, - "logps/rejected": -718.889404296875, - "loss": 0.4327, - "rewards/accuracies": 0.625, - "rewards/chosen": -2.3383870124816895, - "rewards/margins": 1.085201382637024, - "rewards/rejected": -3.423588275909424, - "step": 446 - }, - { - "epoch": 0.8922155688622755, - "grad_norm": 14.235093800229222, - "learning_rate": 1.7555878527937163e-08, - "logits/chosen": -19.834335327148438, - "logits/rejected": -19.786022186279297, - "logps/chosen": -473.0978698730469, - "logps/rejected": -566.6396484375, - "loss": 0.4257, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6218547821044922, - "rewards/margins": 0.8998000621795654, - "rewards/rejected": -2.5216548442840576, - "step": 447 - }, - { - "epoch": 0.8942115768463074, - "grad_norm": 19.346872737859343, - "learning_rate": 1.6919049696121957e-08, - "logits/chosen": -20.57476043701172, - "logits/rejected": -20.308488845825195, - "logps/chosen": -551.8422241210938, - "logps/rejected": -636.97216796875, - "loss": 0.3865, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.958795428276062, - "rewards/margins": 0.8836429715156555, - "rewards/rejected": -2.8424386978149414, - "step": 448 - }, - { - "epoch": 0.8962075848303394, - "grad_norm": 13.957674091312546, - "learning_rate": 1.629358090099639e-08, - "logits/chosen": -20.768329620361328, - "logits/rejected": -20.287025451660156, - "logps/chosen": -571.2577514648438, - "logps/rejected": -715.926513671875, - "loss": 0.3711, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2563109397888184, - "rewards/margins": 1.4301239252090454, - "rewards/rejected": -3.6864352226257324, - "step": 449 - }, - { - "epoch": 0.8982035928143712, - "grad_norm": 17.00793094346187, - "learning_rate": 1.5679502627027136e-08, - "logits/chosen": -19.75269889831543, - "logits/rejected": -19.269916534423828, - "logps/chosen": -667.9724731445312, - "logps/rejected": -755.5040283203125, - "loss": 0.4259, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5590667724609375, - "rewards/margins": 0.594080924987793, - "rewards/rejected": -3.1531479358673096, - "step": 450 - }, - { - "epoch": 0.9001996007984032, - "grad_norm": 14.53960956084184, - "learning_rate": 1.507684480352292e-08, - "logits/chosen": -20.826148986816406, - "logits/rejected": -20.845027923583984, - "logps/chosen": -609.6602783203125, - "logps/rejected": -766.3357543945312, - "loss": 0.4199, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.4122769832611084, - "rewards/margins": 1.4562079906463623, - "rewards/rejected": -3.8684849739074707, - "step": 451 - }, - { - 
"epoch": 0.9021956087824351, - "grad_norm": 15.526496498416185, - "learning_rate": 1.4485636803175827e-08, - "logits/chosen": -20.792835235595703, - "logits/rejected": -20.55864906311035, - "logps/chosen": -620.74609375, - "logps/rejected": -725.6182861328125, - "loss": 0.4232, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.548922061920166, - "rewards/margins": 1.2787765264511108, - "rewards/rejected": -3.8276987075805664, - "step": 452 - }, - { - "epoch": 0.9041916167664671, - "grad_norm": 16.17239094525683, - "learning_rate": 1.390590744062975e-08, - "logits/chosen": -19.851058959960938, - "logits/rejected": -19.955947875976562, - "logps/chosen": -686.9035034179688, - "logps/rejected": -862.1982421875, - "loss": 0.4447, - "rewards/accuracies": 0.75, - "rewards/chosen": -3.0674149990081787, - "rewards/margins": 1.2306870222091675, - "rewards/rejected": -4.298101902008057, - "step": 453 - }, - { - "epoch": 0.906187624750499, - "grad_norm": 16.504222943900153, - "learning_rate": 1.333768497107593e-08, - "logits/chosen": -18.378997802734375, - "logits/rejected": -18.724864959716797, - "logps/chosen": -704.4053955078125, - "logps/rejected": -782.1048583984375, - "loss": 0.4966, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.608297824859619, - "rewards/margins": 1.0223803520202637, - "rewards/rejected": -3.630678415298462, - "step": 454 - }, - { - "epoch": 0.908183632734531, - "grad_norm": 12.566626082686115, - "learning_rate": 1.2780997088875866e-08, - "logits/chosen": -20.15782928466797, - "logits/rejected": -20.431690216064453, - "logps/chosen": -459.6881103515625, - "logps/rejected": -601.560302734375, - "loss": 0.3509, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.5113787651062012, - "rewards/margins": 1.2105661630630493, - "rewards/rejected": -2.72194504737854, - "step": 455 - }, - { - "epoch": 0.9101796407185628, - "grad_norm": 12.551833386648413, - "learning_rate": 1.2235870926211616e-08, - "logits/chosen": -21.343883514404297, - "logits/rejected": -20.625051498413086, - "logps/chosen": -543.25634765625, - "logps/rejected": -719.2687377929688, - "loss": 0.3485, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.1575820446014404, - "rewards/margins": 1.5838088989257812, - "rewards/rejected": -3.741391181945801, - "step": 456 - }, - { - "epoch": 0.9121756487025948, - "grad_norm": 14.335085396159439, - "learning_rate": 1.1702333051763268e-08, - "logits/chosen": -21.20443344116211, - "logits/rejected": -21.087547302246094, - "logps/chosen": -489.47100830078125, - "logps/rejected": -594.5681762695312, - "loss": 0.3969, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.0749175548553467, - "rewards/margins": 1.0219776630401611, - "rewards/rejected": -3.096895456314087, - "step": 457 - }, - { - "epoch": 0.9141716566866267, - "grad_norm": 14.916416548266803, - "learning_rate": 1.1180409469414093e-08, - "logits/chosen": -19.998384475708008, - "logits/rejected": -19.118911743164062, - "logps/chosen": -574.1597900390625, - "logps/rejected": -651.4364624023438, - "loss": 0.3682, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.2586848735809326, - "rewards/margins": 1.0062986612319946, - "rewards/rejected": -3.2649834156036377, - "step": 458 - }, - { - "epoch": 0.9161676646706587, - "grad_norm": 15.304833090303761, - "learning_rate": 1.0670125616983189e-08, - "logits/chosen": -20.066612243652344, - "logits/rejected": -19.84128761291504, - "logps/chosen": -495.0837707519531, - "logps/rejected": -705.328369140625, - "loss": 0.4195, - "rewards/accuracies": 1.0, - 
"rewards/chosen": -1.4251233339309692, - "rewards/margins": 2.084500789642334, - "rewards/rejected": -3.5096240043640137, - "step": 459 - }, - { - "epoch": 0.9181636726546906, - "grad_norm": 15.100643017147776, - "learning_rate": 1.0171506364985622e-08, - "logits/chosen": -20.487680435180664, - "logits/rejected": -20.18891143798828, - "logps/chosen": -464.17669677734375, - "logps/rejected": -517.4482421875, - "loss": 0.4288, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.4962084293365479, - "rewards/margins": 0.9155182838439941, - "rewards/rejected": -2.411726474761963, - "step": 460 - }, - { - "epoch": 0.9201596806387226, - "grad_norm": 14.074235206185678, - "learning_rate": 9.684576015420275e-09, - "logits/chosen": -21.07552719116211, - "logits/rejected": -21.549148559570312, - "logps/chosen": -499.9134521484375, - "logps/rejected": -800.8197021484375, - "loss": 0.3271, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.7806061506271362, - "rewards/margins": 2.3184056282043457, - "rewards/rejected": -4.0990118980407715, - "step": 461 - }, - { - "epoch": 0.9221556886227545, - "grad_norm": 18.4061522713881, - "learning_rate": 9.209358300585473e-09, - "logits/chosen": -19.343280792236328, - "logits/rejected": -19.654582977294922, - "logps/chosen": -479.7654113769531, - "logps/rejected": -560.2235717773438, - "loss": 0.4658, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.009866237640381, - "rewards/margins": 0.5540248155593872, - "rewards/rejected": -2.5638911724090576, - "step": 462 - }, - { - "epoch": 0.9241516966067864, - "grad_norm": 26.610964345302957, - "learning_rate": 8.745876381922146e-09, - "logits/chosen": -20.410703659057617, - "logits/rejected": -20.179195404052734, - "logps/chosen": -547.2416381835938, - "logps/rejected": -712.9237060546875, - "loss": 0.4187, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8671215772628784, - "rewards/margins": 1.6667143106460571, - "rewards/rejected": -3.5338358879089355, - "step": 463 - }, - { - "epoch": 0.9261477045908184, - "grad_norm": 14.676685568488681, - "learning_rate": 8.294152848885155e-09, - "logits/chosen": -18.99413299560547, - "logits/rejected": -18.99550437927246, - "logps/chosen": -535.1954956054688, - "logps/rejected": -711.4642944335938, - "loss": 0.3592, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9930760860443115, - "rewards/margins": 1.5009886026382446, - "rewards/rejected": -3.4940645694732666, - "step": 464 - }, - { - "epoch": 0.9281437125748503, - "grad_norm": 15.512911588656682, - "learning_rate": 7.85420971784223e-09, - "logits/chosen": -20.32876205444336, - "logits/rejected": -20.24429702758789, - "logps/chosen": -628.5169067382812, - "logps/rejected": -750.5574340820312, - "loss": 0.3753, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.304827928543091, - "rewards/margins": 1.3203117847442627, - "rewards/rejected": -3.6251397132873535, - "step": 465 - }, - { - "epoch": 0.9301397205588823, - "grad_norm": 13.797948623517673, - "learning_rate": 7.4260684310008815e-09, - "logits/chosen": -21.487295150756836, - "logits/rejected": -21.362478256225586, - "logps/chosen": -560.105712890625, - "logps/rejected": -661.5278930664062, - "loss": 0.3915, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.2337305545806885, - "rewards/margins": 1.259868860244751, - "rewards/rejected": -3.4935996532440186, - "step": 466 - }, - { - "epoch": 0.9321357285429142, - "grad_norm": 11.943926186166795, - "learning_rate": 7.009749855363456e-09, - "logits/chosen": -19.233264923095703, - 
"logits/rejected": -19.451881408691406, - "logps/chosen": -733.60986328125, - "logps/rejected": -849.3505859375, - "loss": 0.3335, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.4640982151031494, - "rewards/margins": 1.114315152168274, - "rewards/rejected": -3.578413248062134, - "step": 467 - }, - { - "epoch": 0.9341317365269461, - "grad_norm": 14.503620615718729, - "learning_rate": 6.6052742817099274e-09, - "logits/chosen": -21.178070068359375, - "logits/rejected": -21.404176712036133, - "logps/chosen": -416.739013671875, - "logps/rejected": -557.4736328125, - "loss": 0.4248, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4158062934875488, - "rewards/margins": 1.3348989486694336, - "rewards/rejected": -2.7507052421569824, - "step": 468 - }, - { - "epoch": 0.936127744510978, - "grad_norm": 15.833870876370792, - "learning_rate": 6.2126614236091834e-09, - "logits/chosen": -19.73335838317871, - "logits/rejected": -19.94558334350586, - "logps/chosen": -500.3441162109375, - "logps/rejected": -622.7894897460938, - "loss": 0.3858, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.753172755241394, - "rewards/margins": 1.131605863571167, - "rewards/rejected": -2.8847784996032715, - "step": 469 - }, - { - "epoch": 0.93812375249501, - "grad_norm": 17.519277806262906, - "learning_rate": 5.83193041645802e-09, - "logits/chosen": -19.44824981689453, - "logits/rejected": -19.341487884521484, - "logps/chosen": -648.4024047851562, - "logps/rejected": -717.9725952148438, - "loss": 0.4417, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.534775495529175, - "rewards/margins": 0.7995470762252808, - "rewards/rejected": -3.334322452545166, - "step": 470 - }, - { - "epoch": 0.9401197604790419, - "grad_norm": 16.435405709302675, - "learning_rate": 5.463099816548577e-09, - "logits/chosen": -19.042818069458008, - "logits/rejected": -19.184558868408203, - "logps/chosen": -613.079833984375, - "logps/rejected": -668.34423828125, - "loss": 0.3898, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.6256651878356934, - "rewards/margins": 0.6088076829910278, - "rewards/rejected": -3.2344729900360107, - "step": 471 - }, - { - "epoch": 0.9421157684630739, - "grad_norm": 14.15586691409437, - "learning_rate": 5.106187600163986e-09, - "logits/chosen": -21.499380111694336, - "logits/rejected": -21.410926818847656, - "logps/chosen": -531.342041015625, - "logps/rejected": -651.987548828125, - "loss": 0.395, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.9257639646530151, - "rewards/margins": 1.1272857189178467, - "rewards/rejected": -3.0530495643615723, - "step": 472 - }, - { - "epoch": 0.9441117764471058, - "grad_norm": 15.403932926381009, - "learning_rate": 4.761211162702117e-09, - "logits/chosen": -20.26272964477539, - "logits/rejected": -20.957340240478516, - "logps/chosen": -498.70367431640625, - "logps/rejected": -617.1505126953125, - "loss": 0.4018, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.1088998317718506, - "rewards/margins": 1.0999847650527954, - "rewards/rejected": -3.2088844776153564, - "step": 473 - }, - { - "epoch": 0.9461077844311377, - "grad_norm": 17.491981052693117, - "learning_rate": 4.4281873178278475e-09, - "logits/chosen": -20.143478393554688, - "logits/rejected": -19.252201080322266, - "logps/chosen": -613.1995849609375, - "logps/rejected": -740.1754150390625, - "loss": 0.3663, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.0901284217834473, - "rewards/margins": 1.1570186614990234, - "rewards/rejected": -3.2471470832824707, - "step": 474 - }, - { - "epoch": 
0.9481037924151696, - "grad_norm": 15.837465653860951, - "learning_rate": 4.107132296653548e-09, - "logits/chosen": -20.780521392822266, - "logits/rejected": -20.315956115722656, - "logps/chosen": -497.16717529296875, - "logps/rejected": -613.8842163085938, - "loss": 0.4256, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.0837228298187256, - "rewards/margins": 1.1131471395492554, - "rewards/rejected": -3.1968698501586914, - "step": 475 - }, - { - "epoch": 0.9500998003992016, - "grad_norm": 15.96542728453676, - "learning_rate": 3.798061746947995e-09, - "logits/chosen": -20.668846130371094, - "logits/rejected": -21.105388641357422, - "logps/chosen": -566.572021484375, - "logps/rejected": -691.3670654296875, - "loss": 0.4217, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.2949371337890625, - "rewards/margins": 1.2112789154052734, - "rewards/rejected": -3.506216049194336, - "step": 476 - }, - { - "epoch": 0.9520958083832335, - "grad_norm": 12.86557957396558, - "learning_rate": 3.5009907323737818e-09, - "logits/chosen": -19.3477725982666, - "logits/rejected": -20.240049362182617, - "logps/chosen": -568.7761840820312, - "logps/rejected": -702.0484008789062, - "loss": 0.3686, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.1555397510528564, - "rewards/margins": 0.9599153399467468, - "rewards/rejected": -3.1154556274414062, - "step": 477 - }, - { - "epoch": 0.9540918163672655, - "grad_norm": 17.617649889989814, - "learning_rate": 3.215933731753023e-09, - "logits/chosen": -20.7457332611084, - "logits/rejected": -20.778764724731445, - "logps/chosen": -442.6141052246094, - "logps/rejected": -605.4117431640625, - "loss": 0.4298, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.6654844284057617, - "rewards/margins": 1.4296069145202637, - "rewards/rejected": -3.0950918197631836, - "step": 478 - }, - { - "epoch": 0.9560878243512974, - "grad_norm": 14.777134374792304, - "learning_rate": 2.9429046383618038e-09, - "logits/chosen": -19.94468116760254, - "logits/rejected": -19.983610153198242, - "logps/chosen": -733.73095703125, - "logps/rejected": -855.1060791015625, - "loss": 0.3981, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.79736065864563, - "rewards/margins": 1.11807119846344, - "rewards/rejected": -3.9154319763183594, - "step": 479 - }, - { - "epoch": 0.9580838323353293, - "grad_norm": 15.081662489261191, - "learning_rate": 2.6819167592529168e-09, - "logits/chosen": -18.71021270751953, - "logits/rejected": -18.878259658813477, - "logps/chosen": -541.9618530273438, - "logps/rejected": -703.8439331054688, - "loss": 0.4446, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.6335082054138184, - "rewards/margins": 1.6381094455718994, - "rewards/rejected": -3.2716176509857178, - "step": 480 - }, - { - "epoch": 0.9600798403193613, - "grad_norm": 17.24487290020538, - "learning_rate": 2.4329828146074096e-09, - "logits/chosen": -20.34225082397461, - "logits/rejected": -19.895858764648438, - "logps/chosen": -424.185791015625, - "logps/rejected": -556.8890380859375, - "loss": 0.3847, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.5855990648269653, - "rewards/margins": 1.075500249862671, - "rewards/rejected": -2.6610991954803467, - "step": 481 - }, - { - "epoch": 0.9620758483033932, - "grad_norm": 14.220314617872566, - "learning_rate": 2.1961149371145792e-09, - "logits/chosen": -20.574399948120117, - "logits/rejected": -20.078720092773438, - "logps/chosen": -489.7257385253906, - "logps/rejected": -521.2189331054688, - "loss": 0.4188, - "rewards/accuracies": 0.8125, - 
"rewards/chosen": -1.6003961563110352, - "rewards/margins": 0.48941802978515625, - "rewards/rejected": -2.0898141860961914, - "step": 482 - }, - { - "epoch": 0.9640718562874252, - "grad_norm": 15.808989994483928, - "learning_rate": 1.9713246713805587e-09, - "logits/chosen": -21.16196060180664, - "logits/rejected": -20.88861083984375, - "logps/chosen": -511.3900146484375, - "logps/rejected": -617.3922729492188, - "loss": 0.3763, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.8091882467269897, - "rewards/margins": 1.2133337259292603, - "rewards/rejected": -3.02252197265625, - "step": 483 - }, - { - "epoch": 0.9660678642714571, - "grad_norm": 16.52779079764264, - "learning_rate": 1.7586229733657643e-09, - "logits/chosen": -21.060434341430664, - "logits/rejected": -20.106260299682617, - "logps/chosen": -623.4896850585938, - "logps/rejected": -785.6814575195312, - "loss": 0.4059, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.2155380249023438, - "rewards/margins": 1.6182777881622314, - "rewards/rejected": -3.8338160514831543, - "step": 484 - }, - { - "epoch": 0.9680638722554891, - "grad_norm": 20.55748826302738, - "learning_rate": 1.5580202098509076e-09, - "logits/chosen": -19.086421966552734, - "logits/rejected": -19.13943099975586, - "logps/chosen": -653.7385864257812, - "logps/rejected": -788.5634155273438, - "loss": 0.5591, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.365367889404297, - "rewards/margins": 1.2615199089050293, - "rewards/rejected": -3.626887798309326, - "step": 485 - }, - { - "epoch": 0.9700598802395209, - "grad_norm": 16.155660822289832, - "learning_rate": 1.3695261579316775e-09, - "logits/chosen": -19.985042572021484, - "logits/rejected": -20.003768920898438, - "logps/chosen": -549.3853759765625, - "logps/rejected": -645.8590087890625, - "loss": 0.455, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.37552547454834, - "rewards/margins": 0.6537982225418091, - "rewards/rejected": -3.0293240547180176, - "step": 486 - }, - { - "epoch": 0.9720558882235529, - "grad_norm": 12.799564879786711, - "learning_rate": 1.1931500045422038e-09, - "logits/chosen": -19.6976375579834, - "logits/rejected": -19.338512420654297, - "logps/chosen": -609.3573608398438, - "logps/rejected": -729.5746459960938, - "loss": 0.324, - "rewards/accuracies": 0.875, - "rewards/chosen": -2.285135507583618, - "rewards/margins": 1.4025579690933228, - "rewards/rejected": -3.6876933574676514, - "step": 487 - }, - { - "epoch": 0.9740518962075848, - "grad_norm": 13.783924221281117, - "learning_rate": 1.0289003460074165e-09, - "logits/chosen": -21.08749008178711, - "logits/rejected": -20.79568862915039, - "logps/chosen": -537.5322875976562, - "logps/rejected": -654.921630859375, - "loss": 0.3562, - "rewards/accuracies": 1.0, - "rewards/chosen": -2.3624069690704346, - "rewards/margins": 1.3793262243270874, - "rewards/rejected": -3.7417330741882324, - "step": 488 - }, - { - "epoch": 0.9760479041916168, - "grad_norm": 15.309320742613716, - "learning_rate": 8.767851876239074e-10, - "logits/chosen": -20.728225708007812, - "logits/rejected": -20.385595321655273, - "logps/chosen": -513.3759155273438, - "logps/rejected": -596.8172607421875, - "loss": 0.4025, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.7735494375228882, - "rewards/margins": 0.9200107455253601, - "rewards/rejected": -2.6935603618621826, - "step": 489 - }, - { - "epoch": 0.9780439121756487, - "grad_norm": 15.568864682945687, - "learning_rate": 7.368119432699382e-10, - "logits/chosen": -19.903289794921875, - 
"logits/rejected": -20.307979583740234, - "logps/chosen": -630.302978515625, - "logps/rejected": -687.8621826171875, - "loss": 0.4896, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.7445573806762695, - "rewards/margins": 0.6833914518356323, - "rewards/rejected": -3.427948474884033, - "step": 490 - }, - { - "epoch": 0.9800399201596807, - "grad_norm": 15.48353189199115, - "learning_rate": 6.089874350439505e-10, - "logits/chosen": -19.985660552978516, - "logits/rejected": -19.806968688964844, - "logps/chosen": -546.8287353515625, - "logps/rejected": -818.223388671875, - "loss": 0.3878, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.4326379299163818, - "rewards/margins": 1.880547046661377, - "rewards/rejected": -3.3131847381591797, - "step": 491 - }, - { - "epoch": 0.9820359281437125, - "grad_norm": 15.777439479613324, - "learning_rate": 4.933178929321102e-10, - "logits/chosen": -20.85559844970703, - "logits/rejected": -20.4979248046875, - "logps/chosen": -476.0706787109375, - "logps/rejected": -618.8011474609375, - "loss": 0.3955, - "rewards/accuracies": 0.9375, - "rewards/chosen": -2.0278639793395996, - "rewards/margins": 1.1638784408569336, - "rewards/rejected": -3.191742420196533, - "step": 492 - }, - { - "epoch": 0.9840319361277445, - "grad_norm": 17.839219780429094, - "learning_rate": 3.898089545047445e-10, - "logits/chosen": -19.268781661987305, - "logits/rejected": -18.75286865234375, - "logps/chosen": -519.21728515625, - "logps/rejected": -564.7702026367188, - "loss": 0.427, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.034489154815674, - "rewards/margins": 0.6288854479789734, - "rewards/rejected": -2.663374900817871, - "step": 493 - }, - { - "epoch": 0.9860279441117764, - "grad_norm": 14.5684169318795, - "learning_rate": 2.9846566464150626e-10, - "logits/chosen": -19.779115676879883, - "logits/rejected": -19.5999698638916, - "logps/chosen": -583.4820556640625, - "logps/rejected": -669.7979125976562, - "loss": 0.4368, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5221195220947266, - "rewards/margins": 1.1899384260177612, - "rewards/rejected": -3.712057590484619, - "step": 494 - }, - { - "epoch": 0.9880239520958084, - "grad_norm": 17.580664206986, - "learning_rate": 2.1929247528540418e-10, - "logits/chosen": -19.53438949584961, - "logits/rejected": -19.96921157836914, - "logps/chosen": -570.0394897460938, - "logps/rejected": -735.1627197265625, - "loss": 0.4737, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8571494817733765, - "rewards/margins": 1.5648341178894043, - "rewards/rejected": -3.4219837188720703, - "step": 495 - }, - { - "epoch": 0.9900199600798403, - "grad_norm": 17.173836526546737, - "learning_rate": 1.5229324522605947e-10, - "logits/chosen": -20.75967788696289, - "logits/rejected": -20.18760871887207, - "logps/chosen": -475.73583984375, - "logps/rejected": -676.0049438476562, - "loss": 0.4687, - "rewards/accuracies": 1.0, - "rewards/chosen": -1.8086845874786377, - "rewards/margins": 1.8916891813278198, - "rewards/rejected": -3.700373411178589, - "step": 496 - }, - { - "epoch": 0.9920159680638723, - "grad_norm": 15.260844554178602, - "learning_rate": 9.747123991141193e-11, - "logits/chosen": -21.364585876464844, - "logits/rejected": -21.309356689453125, - "logps/chosen": -587.7421875, - "logps/rejected": -742.9869995117188, - "loss": 0.4397, - "rewards/accuracies": 0.875, - "rewards/chosen": -1.9390954971313477, - "rewards/margins": 1.6428402662277222, - "rewards/rejected": -3.5819358825683594, - "step": 497 - }, - { - "epoch": 
0.9940119760479041, - "grad_norm": 12.560876894689768, - "learning_rate": 5.4829131288625096e-11, - "logits/chosen": -20.501996994018555, - "logits/rejected": -20.2353572845459, - "logps/chosen": -491.1131591796875, - "logps/rejected": -573.8976440429688, - "loss": 0.4025, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.5727652311325073, - "rewards/margins": 1.0524019002914429, - "rewards/rejected": -2.62516713142395, - "step": 498 - }, - { - "epoch": 0.9960079840319361, - "grad_norm": 13.797868022456017, - "learning_rate": 2.4368997673940294e-11, - "logits/chosen": -20.825347900390625, - "logits/rejected": -19.912729263305664, - "logps/chosen": -534.9151611328125, - "logps/rejected": -644.46142578125, - "loss": 0.3842, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.111253261566162, - "rewards/margins": 0.9820639491081238, - "rewards/rejected": -3.0933175086975098, - "step": 499 - }, - { - "epoch": 0.998003992015968, - "grad_norm": 15.509605037385542, - "learning_rate": 6.092323651313291e-12, - "logits/chosen": -20.30014991760254, - "logits/rejected": -18.998384475708008, - "logps/chosen": -434.7257080078125, - "logps/rejected": -615.57861328125, - "loss": 0.3789, - "rewards/accuracies": 0.625, - "rewards/chosen": -1.846671223640442, - "rewards/margins": 1.5862188339233398, - "rewards/rejected": -3.432889938354492, - "step": 500 - }, - { - "epoch": 1.0, - "grad_norm": 16.224300295566376, - "learning_rate": 0.0, - "logits/chosen": -19.65430450439453, - "logits/rejected": -19.525461196899414, - "logps/chosen": -516.9768676757812, - "logps/rejected": -590.5892333984375, - "loss": 0.3751, - "rewards/accuracies": 0.9375, - "rewards/chosen": -1.8650116920471191, - "rewards/margins": 0.9615846872329712, - "rewards/rejected": -2.826596260070801, - "step": 501 - }, - { - "epoch": 1.0, - "step": 501, - "total_flos": 0.0, - "train_loss": 0.490256760589139, - "train_runtime": 40162.2344, - "train_samples_per_second": 1.597, - "train_steps_per_second": 0.012 - } - ], - "logging_steps": 1, - "max_steps": 501, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 101, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 0.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}