{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9983492901947838, "eval_steps": 1000, "global_step": 378, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002641135688345989, "grad_norm": 0.9479714304489147, "learning_rate": 1.3157894736842104e-08, "logits/chosen": -2.2716729640960693, "logits/rejected": -2.241565704345703, "logps/chosen": -156.80194091796875, "logps/rejected": -147.06320190429688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02641135688345989, "grad_norm": 0.9657211466886476, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -2.2696707248687744, "logits/rejected": -2.2592086791992188, "logps/chosen": -173.62896728515625, "logps/rejected": -168.29458618164062, "loss": 0.6931, "rewards/accuracies": 0.4470486044883728, "rewards/chosen": 0.00015925339539535344, "rewards/margins": 9.125900396611542e-05, "rewards/rejected": 6.799438415328041e-05, "step": 10 }, { "epoch": 0.05282271376691978, "grad_norm": 0.952148373000707, "learning_rate": 2.631578947368421e-07, "logits/chosen": -2.2733893394470215, "logits/rejected": -2.2671706676483154, "logps/chosen": -169.05018615722656, "logps/rejected": -169.22433471679688, "loss": 0.6931, "rewards/accuracies": 0.522656261920929, "rewards/chosen": -9.141029295278713e-05, "rewards/margins": 0.0001715569815132767, "rewards/rejected": -0.00026296728174202144, "step": 20 }, { "epoch": 0.07923407065037966, "grad_norm": 0.9659022104815798, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -2.2659945487976074, "logits/rejected": -2.2476842403411865, "logps/chosen": -178.9219512939453, "logps/rejected": -169.46163940429688, "loss": 0.693, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": -0.0015892453957349062, "rewards/margins": 0.0002000469685299322, "rewards/rejected": -0.0017892923206090927, "step": 30 }, { "epoch": 0.10564542753383956, "grad_norm": 0.9273301854247932, "learning_rate": 4.999573126145131e-07, "logits/chosen": -2.2755210399627686, "logits/rejected": -2.2551403045654297, "logps/chosen": -179.63047790527344, "logps/rejected": -163.6714630126953, "loss": 0.6926, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0064210728742182255, "rewards/margins": 0.0012297846842557192, "rewards/rejected": -0.007650857325643301, "step": 40 }, { "epoch": 0.13205678441729943, "grad_norm": 0.9799377076485188, "learning_rate": 4.984647842238184e-07, "logits/chosen": -2.28324294090271, "logits/rejected": -2.2799413204193115, "logps/chosen": -168.7937469482422, "logps/rejected": -170.1733856201172, "loss": 0.6917, "rewards/accuracies": 0.5640624761581421, "rewards/chosen": -0.0193162951618433, "rewards/margins": 0.0024972439277917147, "rewards/rejected": -0.021813539788126945, "step": 50 }, { "epoch": 0.1584681413007593, "grad_norm": 0.9599459222723423, "learning_rate": 4.948524419003415e-07, "logits/chosen": -2.2814371585845947, "logits/rejected": -2.273639440536499, "logps/chosen": -173.1953582763672, "logps/rejected": -171.6744384765625, "loss": 0.6911, "rewards/accuracies": 0.547656238079071, "rewards/chosen": -0.03955007344484329, "rewards/margins": 0.00362972030416131, "rewards/rejected": -0.043179791420698166, "step": 60 }, { "epoch": 0.18487949818421923, "grad_norm": 0.9816456439800273, "learning_rate": 4.891511048751102e-07, "logits/chosen": -2.3176677227020264, "logits/rejected": -2.3122127056121826, "logps/chosen": -177.39488220214844, "logps/rejected": -167.91244506835938, "loss": 0.6896, "rewards/accuracies": 0.5625, "rewards/chosen": -0.06663568317890167, "rewards/margins": 0.008323188871145248, "rewards/rejected": -0.07495887577533722, "step": 70 }, { "epoch": 0.2112908550676791, "grad_norm": 1.0368469143747399, "learning_rate": 4.81409414945389e-07, "logits/chosen": -2.342482328414917, "logits/rejected": -2.3203022480010986, "logps/chosen": -195.66636657714844, "logps/rejected": -187.387939453125, "loss": 0.6878, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08949045091867447, "rewards/margins": 0.01014741975814104, "rewards/rejected": -0.09963786602020264, "step": 80 }, { "epoch": 0.237702211951139, "grad_norm": 1.0442549418705969, "learning_rate": 4.7169342148001546e-07, "logits/chosen": -2.3382809162139893, "logits/rejected": -2.3266994953155518, "logps/chosen": -195.09518432617188, "logps/rejected": -182.37796020507812, "loss": 0.6874, "rewards/accuracies": 0.573437511920929, "rewards/chosen": -0.11916174739599228, "rewards/margins": 0.014569459483027458, "rewards/rejected": -0.13373121619224548, "step": 90 }, { "epoch": 0.26411356883459886, "grad_norm": 1.064035782351039, "learning_rate": 4.6008601790947314e-07, "logits/chosen": -2.321099042892456, "logits/rejected": -2.319929838180542, "logps/chosen": -187.78164672851562, "logps/rejected": -180.61904907226562, "loss": 0.6849, "rewards/accuracies": 0.5726562738418579, "rewards/chosen": -0.13006748259067535, "rewards/margins": 0.01908385381102562, "rewards/rejected": -0.14915132522583008, "step": 100 }, { "epoch": 0.29052492571805877, "grad_norm": 1.2007773205984222, "learning_rate": 4.466862345083708e-07, "logits/chosen": -2.321152925491333, "logits/rejected": -2.3174567222595215, "logps/chosen": -189.49288940429688, "logps/rejected": -185.0203094482422, "loss": 0.6838, "rewards/accuracies": 0.5640624761581421, "rewards/chosen": -0.15054509043693542, "rewards/margins": 0.022373218089342117, "rewards/rejected": -0.17291830480098724, "step": 110 }, { "epoch": 0.3169362826015186, "grad_norm": 1.4026602647859983, "learning_rate": 4.3160839350405605e-07, "logits/chosen": -2.310743808746338, "logits/rejected": -2.309847831726074, "logps/chosen": -188.81431579589844, "logps/rejected": -188.99505615234375, "loss": 0.6824, "rewards/accuracies": 0.578125, "rewards/chosen": -0.16854415833950043, "rewards/margins": 0.026360681280493736, "rewards/rejected": -0.19490481913089752, "step": 120 }, { "epoch": 0.34334763948497854, "grad_norm": 1.2567707162408688, "learning_rate": 4.149811337196807e-07, "logits/chosen": -2.3110547065734863, "logits/rejected": -2.302724599838257, "logps/chosen": -201.25418090820312, "logps/rejected": -193.19810485839844, "loss": 0.6801, "rewards/accuracies": 0.5703125, "rewards/chosen": -0.20925810933113098, "rewards/margins": 0.028272386640310287, "rewards/rejected": -0.23753049969673157, "step": 130 }, { "epoch": 0.36975899636843845, "grad_norm": 1.3171163187862418, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -2.310455560684204, "logits/rejected": -2.3021938800811768, "logps/chosen": -201.97586059570312, "logps/rejected": -196.26681518554688, "loss": 0.6814, "rewards/accuracies": 0.5679687261581421, "rewards/chosen": -0.267736554145813, "rewards/margins": 0.03569976985454559, "rewards/rejected": -0.3034363389015198, "step": 140 }, { "epoch": 0.3961703532518983, "grad_norm": 1.362073777172477, "learning_rate": 3.776577982952267e-07, "logits/chosen": -2.2907283306121826, "logits/rejected": -2.2906551361083984, "logps/chosen": -201.7469482421875, "logps/rejected": -202.30398559570312, "loss": 0.6805, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.3111319839954376, "rewards/margins": 0.03633497655391693, "rewards/rejected": -0.34746694564819336, "step": 150 }, { "epoch": 0.4225817101353582, "grad_norm": 2.093940167532845, "learning_rate": 3.572801521931522e-07, "logits/chosen": -2.309051752090454, "logits/rejected": -2.302381992340088, "logps/chosen": -203.4032745361328, "logps/rejected": -199.6255340576172, "loss": 0.6787, "rewards/accuracies": 0.577343761920929, "rewards/chosen": -0.34017449617385864, "rewards/margins": 0.04339217022061348, "rewards/rejected": -0.38356661796569824, "step": 160 }, { "epoch": 0.4489930670188181, "grad_norm": 1.9348127254360297, "learning_rate": 3.35987229658482e-07, "logits/chosen": -2.3065199851989746, "logits/rejected": -2.2967591285705566, "logps/chosen": -206.21694946289062, "logps/rejected": -201.9674835205078, "loss": 0.6763, "rewards/accuracies": 0.5726562738418579, "rewards/chosen": -0.3637041449546814, "rewards/margins": 0.04687776044011116, "rewards/rejected": -0.41058191657066345, "step": 170 }, { "epoch": 0.475404423902278, "grad_norm": 1.6138340474053996, "learning_rate": 3.139606943986089e-07, "logits/chosen": -2.295161247253418, "logits/rejected": -2.2992234230041504, "logps/chosen": -216.4457550048828, "logps/rejected": -214.0793914794922, "loss": 0.6756, "rewards/accuracies": 0.5601562261581421, "rewards/chosen": -0.4113912582397461, "rewards/margins": 0.0445592924952507, "rewards/rejected": -0.4559505581855774, "step": 180 }, { "epoch": 0.5018157807857379, "grad_norm": 1.8694312571116225, "learning_rate": 2.913884690460325e-07, "logits/chosen": -2.332610607147217, "logits/rejected": -2.3261446952819824, "logps/chosen": -223.5069122314453, "logps/rejected": -212.2234344482422, "loss": 0.6734, "rewards/accuracies": 0.5757812261581421, "rewards/chosen": -0.45355916023254395, "rewards/margins": 0.04530250281095505, "rewards/rejected": -0.4988616406917572, "step": 190 }, { "epoch": 0.5282271376691977, "grad_norm": 1.6606210676628292, "learning_rate": 2.684631318687185e-07, "logits/chosen": -2.3509981632232666, "logits/rejected": -2.3406052589416504, "logps/chosen": -226.03836059570312, "logps/rejected": -223.758544921875, "loss": 0.678, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -0.45515409111976624, "rewards/margins": 0.0432661809027195, "rewards/rejected": -0.4984202980995178, "step": 200 }, { "epoch": 0.5546384945526577, "grad_norm": 1.7021845659792814, "learning_rate": 2.4538027376021755e-07, "logits/chosen": -2.3477015495300293, "logits/rejected": -2.338550090789795, "logps/chosen": -220.02816772460938, "logps/rejected": -216.43405151367188, "loss": 0.6703, "rewards/accuracies": 0.5804687738418579, "rewards/chosen": -0.44584885239601135, "rewards/margins": 0.060957133769989014, "rewards/rejected": -0.5068060159683228, "step": 210 }, { "epoch": 0.5810498514361175, "grad_norm": 1.8757866922329256, "learning_rate": 2.2233682952712483e-07, "logits/chosen": -2.33669376373291, "logits/rejected": -2.332843780517578, "logps/chosen": -223.82211303710938, "logps/rejected": -217.61489868164062, "loss": 0.6712, "rewards/accuracies": 0.586718738079071, "rewards/chosen": -0.4329432547092438, "rewards/margins": 0.06651361286640167, "rewards/rejected": -0.49945688247680664, "step": 220 }, { "epoch": 0.6074612083195774, "grad_norm": 2.334294772609141, "learning_rate": 1.995293977107475e-07, "logits/chosen": -2.3312466144561768, "logits/rejected": -2.326164722442627, "logps/chosen": -219.48025512695312, "logps/rejected": -222.7107391357422, "loss": 0.671, "rewards/accuracies": 0.5757812261581421, "rewards/chosen": -0.4052560329437256, "rewards/margins": 0.058724187314510345, "rewards/rejected": -0.4639802575111389, "step": 230 }, { "epoch": 0.6338725652030373, "grad_norm": 1.8694677367469326, "learning_rate": 1.7715256327766884e-07, "logits/chosen": -2.3552398681640625, "logits/rejected": -2.340351104736328, "logps/chosen": -226.2518310546875, "logps/rejected": -220.1430206298828, "loss": 0.6717, "rewards/accuracies": 0.563281238079071, "rewards/chosen": -0.43269434571266174, "rewards/margins": 0.05727803707122803, "rewards/rejected": -0.48997241258621216, "step": 240 }, { "epoch": 0.6602839220864972, "grad_norm": 2.0002930172446627, "learning_rate": 1.5539723748942242e-07, "logits/chosen": -2.35202956199646, "logits/rejected": -2.3460869789123535, "logps/chosen": -219.1632537841797, "logps/rejected": -222.41403198242188, "loss": 0.6715, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.4845377504825592, "rewards/margins": 0.07130294293165207, "rewards/rejected": -0.5558406710624695, "step": 250 }, { "epoch": 0.6866952789699571, "grad_norm": 2.0423010510210973, "learning_rate": 1.3444902911492174e-07, "logits/chosen": -2.3457489013671875, "logits/rejected": -2.3403260707855225, "logps/chosen": -229.6208038330078, "logps/rejected": -227.60733032226562, "loss": 0.6732, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.5344967842102051, "rewards/margins": 0.058640915900468826, "rewards/rejected": -0.5931377410888672, "step": 260 }, { "epoch": 0.7131066358534169, "grad_norm": 1.810992406616959, "learning_rate": 1.1448666088188763e-07, "logits/chosen": -2.341614007949829, "logits/rejected": -2.337841510772705, "logps/chosen": -222.2272186279297, "logps/rejected": -223.3246307373047, "loss": 0.6716, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.5301269292831421, "rewards/margins": 0.058558739721775055, "rewards/rejected": -0.5886856317520142, "step": 270 }, { "epoch": 0.7395179927368769, "grad_norm": 2.254614754539, "learning_rate": 9.56804446775518e-08, "logits/chosen": -2.372023582458496, "logits/rejected": -2.3646726608276367, "logps/chosen": -224.5756378173828, "logps/rejected": -218.9865264892578, "loss": 0.6733, "rewards/accuracies": 0.567187488079071, "rewards/chosen": -0.5263770818710327, "rewards/margins": 0.061575133353471756, "rewards/rejected": -0.5879522562026978, "step": 280 }, { "epoch": 0.7659293496203368, "grad_norm": 2.086058658480454, "learning_rate": 7.819082850768432e-08, "logits/chosen": -2.3455469608306885, "logits/rejected": -2.3464303016662598, "logps/chosen": -224.189208984375, "logps/rejected": -223.93930053710938, "loss": 0.6693, "rewards/accuracies": 0.5796874761581421, "rewards/chosen": -0.499728262424469, "rewards/margins": 0.06420499831438065, "rewards/rejected": -0.563933253288269, "step": 290 }, { "epoch": 0.7923407065037966, "grad_norm": 2.4783165290141382, "learning_rate": 6.216702761078166e-08, "logits/chosen": -2.3515076637268066, "logits/rejected": -2.349012851715088, "logps/chosen": -224.17892456054688, "logps/rejected": -218.49209594726562, "loss": 0.6696, "rewards/accuracies": 0.578125, "rewards/chosen": -0.5156034827232361, "rewards/margins": 0.06623221933841705, "rewards/rejected": -0.5818357467651367, "step": 300 }, { "epoch": 0.8187520633872565, "grad_norm": 3.5523303199324854, "learning_rate": 4.774575140626316e-08, "logits/chosen": -2.342184543609619, "logits/rejected": -2.3424227237701416, "logps/chosen": -228.24533081054688, "logps/rejected": -227.98974609375, "loss": 0.6715, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -0.5175895094871521, "rewards/margins": 0.06086786836385727, "rewards/rejected": -0.5784574151039124, "step": 310 }, { "epoch": 0.8451634202707164, "grad_norm": 2.029707974108137, "learning_rate": 3.5050037137906885e-08, "logits/chosen": -2.3485968112945557, "logits/rejected": -2.348895788192749, "logps/chosen": -220.9480438232422, "logps/rejected": -220.4337615966797, "loss": 0.6734, "rewards/accuracies": 0.586718738079071, "rewards/chosen": -0.5326020121574402, "rewards/margins": 0.06538228690624237, "rewards/rejected": -0.5979843139648438, "step": 320 }, { "epoch": 0.8715747771541763, "grad_norm": 1.9445736642879892, "learning_rate": 2.4188200163467786e-08, "logits/chosen": -2.3561959266662598, "logits/rejected": -2.3535940647125244, "logps/chosen": -223.76058959960938, "logps/rejected": -222.8463592529297, "loss": 0.671, "rewards/accuracies": 0.5835937261581421, "rewards/chosen": -0.5285231471061707, "rewards/margins": 0.06788322329521179, "rewards/rejected": -0.5964063405990601, "step": 330 }, { "epoch": 0.8979861340376362, "grad_norm": 2.29874813836561, "learning_rate": 1.5252909846235894e-08, "logits/chosen": -2.358121395111084, "logits/rejected": -2.3539624214172363, "logps/chosen": -228.19039916992188, "logps/rejected": -224.6275634765625, "loss": 0.6684, "rewards/accuracies": 0.58984375, "rewards/chosen": -0.5251844525337219, "rewards/margins": 0.06964431703090668, "rewards/rejected": -0.5948287844657898, "step": 340 }, { "epoch": 0.924397490921096, "grad_norm": 1.8422698404022058, "learning_rate": 8.320398932703144e-09, "logits/chosen": -2.3642985820770264, "logits/rejected": -2.3563034534454346, "logps/chosen": -229.8364715576172, "logps/rejected": -225.88497924804688, "loss": 0.669, "rewards/accuracies": 0.577343761920929, "rewards/chosen": -0.5176088213920593, "rewards/margins": 0.06735256314277649, "rewards/rejected": -0.5849614143371582, "step": 350 }, { "epoch": 0.950808847804556, "grad_norm": 1.9065023664446652, "learning_rate": 3.4498131616493565e-09, "logits/chosen": -2.3470730781555176, "logits/rejected": -2.347114324569702, "logps/chosen": -225.67822265625, "logps/rejected": -217.3546600341797, "loss": 0.6727, "rewards/accuracies": 0.5859375, "rewards/chosen": -0.5201060771942139, "rewards/margins": 0.0647495836019516, "rewards/rejected": -0.5848556756973267, "step": 360 }, { "epoch": 0.9772202046880158, "grad_norm": 2.1089219617743082, "learning_rate": 6.827066535529947e-10, "logits/chosen": -2.3542428016662598, "logits/rejected": -2.351058006286621, "logps/chosen": -226.4674835205078, "logps/rejected": -225.4059295654297, "loss": 0.6697, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": -0.5389624834060669, "rewards/margins": 0.05195971205830574, "rewards/rejected": -0.5909221768379211, "step": 370 }, { "epoch": 0.9983492901947838, "step": 378, "total_flos": 0.0, "train_loss": 0.6782766637347993, "train_runtime": 9304.817, "train_samples_per_second": 20.828, "train_steps_per_second": 0.041 } ], "logging_steps": 10, "max_steps": 378, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }