{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968652037617555, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-08, "logits/chosen": -1.995415210723877, "logits/rejected": -2.0361223220825195, "logps/chosen": -475.42401123046875, "logps/pi_response": -236.53262329101562, "logps/ref_response": -236.53262329101562, "logps/rejected": -571.512451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.0094504356384277, "logits/rejected": -1.9493383169174194, "logps/chosen": -398.70062255859375, "logps/pi_response": -160.86041259765625, "logps/ref_response": -161.05593872070312, "logps/rejected": -436.8709411621094, "loss": 0.6906, "rewards/accuracies": 0.4513888955116272, "rewards/chosen": -0.010171439498662949, "rewards/margins": 0.0004877760075032711, "rewards/rejected": -0.010659217834472656, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": -1.9780817031860352, "logits/rejected": -1.9359722137451172, "logps/chosen": -396.9286193847656, "logps/pi_response": -165.99008178710938, "logps/ref_response": -170.16830444335938, "logps/rejected": -542.5230712890625, "loss": 0.6473, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.28707781434059143, "rewards/margins": 0.1905716061592102, "rewards/rejected": -0.4776495099067688, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": -1.929377794265747, "logits/rejected": -1.9014297723770142, "logps/chosen": -477.72247314453125, "logps/pi_response": -175.51304626464844, "logps/ref_response": -176.73623657226562, "logps/rejected": -558.6651000976562, "loss": 0.6794, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.7204787135124207, "rewards/margins": 0.3288179636001587, "rewards/rejected": -1.0492966175079346, "step": 30 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -1.9873077869415283, "logits/rejected": -1.9341981410980225, "logps/chosen": -473.9336853027344, "logps/pi_response": -187.4553680419922, "logps/ref_response": -175.1200408935547, "logps/rejected": -595.444091796875, "loss": 0.6401, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7007579803466797, "rewards/margins": 0.272124320268631, "rewards/rejected": -0.9728823900222778, "step": 40 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -1.8471710681915283, "logits/rejected": -1.641099214553833, "logps/chosen": -511.49871826171875, "logps/pi_response": -183.08493041992188, "logps/ref_response": -184.63198852539062, "logps/rejected": -632.9375, "loss": 0.6017, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8219456672668457, "rewards/margins": 0.4130307137966156, "rewards/rejected": -1.2349765300750732, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": -0.9633913040161133, "logits/rejected": -0.7572700381278992, "logps/chosen": -479.69775390625, "logps/pi_response": -182.07357788085938, "logps/ref_response": -175.98574829101562, "logps/rejected": -607.6907958984375, "loss": 0.5892, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6976214051246643, "rewards/margins": 0.5892900228500366, "rewards/rejected": -1.2869113683700562, "step": 60 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -0.3013351559638977, "logits/rejected": 0.12189887464046478, "logps/chosen": -462.14190673828125, "logps/pi_response": -178.45272827148438, "logps/ref_response": -161.18075561523438, "logps/rejected": -615.686279296875, "loss": 0.5997, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.698739767074585, "rewards/margins": 0.6993860006332397, "rewards/rejected": -1.3981258869171143, "step": 70 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": -0.2273164540529251, "logits/rejected": 0.14264746010303497, "logps/chosen": -505.594970703125, "logps/pi_response": -187.5181121826172, "logps/ref_response": -175.85252380371094, "logps/rejected": -639.3907470703125, "loss": 0.5449, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.9091317057609558, "rewards/margins": 0.6041504144668579, "rewards/rejected": -1.5132819414138794, "step": 80 }, { "epoch": 0.56, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 0.14409923553466797, "logits/rejected": 0.62093585729599, "logps/chosen": -543.806640625, "logps/pi_response": -203.8004608154297, "logps/ref_response": -184.4833526611328, "logps/rejected": -670.08984375, "loss": 0.5898, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1904714107513428, "rewards/margins": 0.5562640428543091, "rewards/rejected": -1.7467355728149414, "step": 90 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 0.1400291472673416, "logits/rejected": 0.6031023263931274, "logps/chosen": -544.8568115234375, "logps/pi_response": -222.86630249023438, "logps/ref_response": -191.95008850097656, "logps/rejected": -661.2197265625, "loss": 0.566, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1847233772277832, "rewards/margins": 0.5735085010528564, "rewards/rejected": -1.7582318782806396, "step": 100 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 0.13480663299560547, "logits/rejected": 0.6124667525291443, "logps/chosen": -551.712158203125, "logps/pi_response": -201.6997833251953, "logps/ref_response": -172.76661682128906, "logps/rejected": -627.7642211914062, "loss": 0.5799, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.042403221130371, "rewards/margins": 0.4938434660434723, "rewards/rejected": -1.536246657371521, "step": 110 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.29530060291290283, "logits/rejected": 0.561973512172699, "logps/chosen": -507.9276428222656, "logps/pi_response": -198.93344116210938, "logps/ref_response": -166.47335815429688, "logps/rejected": -658.5081787109375, "loss": 0.5442, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0466843843460083, "rewards/margins": 0.6271711587905884, "rewards/rejected": -1.6738557815551758, "step": 120 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": 0.4397956430912018, "logits/rejected": 0.7937263250350952, "logps/chosen": -523.13525390625, "logps/pi_response": -213.9718017578125, "logps/ref_response": -178.47569274902344, "logps/rejected": -653.7128295898438, "loss": 0.5582, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0310872793197632, "rewards/margins": 0.6268772482872009, "rewards/rejected": -1.6579644680023193, "step": 130 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 0.6194955706596375, "logits/rejected": 1.0747044086456299, "logps/chosen": -514.37744140625, "logps/pi_response": -208.5031280517578, "logps/ref_response": -176.0931396484375, "logps/rejected": -671.9991455078125, "loss": 0.5577, "rewards/accuracies": 0.6875, "rewards/chosen": -1.111130952835083, "rewards/margins": 0.6132029294967651, "rewards/rejected": -1.7243340015411377, "step": 140 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 0.08109824359416962, "logits/rejected": 0.7494246959686279, "logps/chosen": -524.765625, "logps/pi_response": -212.7974090576172, "logps/ref_response": -180.5318145751953, "logps/rejected": -664.6768798828125, "loss": 0.5556, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9752191305160522, "rewards/margins": 0.7606547474861145, "rewards/rejected": -1.735873818397522, "step": 150 }, { "epoch": 1.0, "step": 159, "total_flos": 0.0, "train_loss": 0.5931253643155848, "train_runtime": 4224.9045, "train_samples_per_second": 4.823, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }