{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 142, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.818629616705012, "learning_rate": 3.3333333333333334e-08, "logits/chosen": 0.34349873661994934, "logits/rejected": -0.45936429500579834, "logps/chosen": -645.8182373046875, "logps/rejected": -960.7478637695312, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07, "grad_norm": 3.7015477071804805, "learning_rate": 3.333333333333333e-07, "logits/chosen": 0.2855316400527954, "logits/rejected": -0.5074439644813538, "logps/chosen": -515.9026489257812, "logps/rejected": -943.4110107421875, "loss": 0.6931, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": 9.945142664946616e-05, "rewards/margins": -0.0002705494989641011, "rewards/rejected": 0.00037000092561356723, "step": 10 }, { "epoch": 0.14, "grad_norm": 3.8654438865386216, "learning_rate": 4.980901968537757e-07, "logits/chosen": 0.22748669981956482, "logits/rejected": -0.6011324524879456, "logps/chosen": -554.2417602539062, "logps/rejected": -920.9329833984375, "loss": 0.6914, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0033154767006635666, "rewards/margins": 0.003788186237215996, "rewards/rejected": -0.00047270936192944646, "step": 20 }, { "epoch": 0.21, "grad_norm": 4.007801368289745, "learning_rate": 4.829863985848586e-07, "logits/chosen": 0.36973315477371216, "logits/rejected": -0.5559585094451904, "logps/chosen": -484.40032958984375, "logps/rejected": -927.1837158203125, "loss": 0.6845, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.012849527411162853, "rewards/margins": 0.01983761414885521, "rewards/rejected": -0.006988088134676218, "step": 30 }, { "epoch": 0.28, "grad_norm": 3.8106369880633078, "learning_rate": 4.5369832536975604e-07, "logits/chosen": 0.1922769993543625, "logits/rejected": -0.5385528802871704, "logps/chosen": -541.2227783203125, "logps/rejected": -886.4552001953125, "loss": 0.6726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03025916777551174, "rewards/margins": 0.04895365983247757, "rewards/rejected": -0.018694492056965828, "step": 40 }, { "epoch": 0.35, "grad_norm": 3.566281633809558, "learning_rate": 4.120090430254013e-07, "logits/chosen": 0.16315576434135437, "logits/rejected": -0.6133657097816467, "logps/chosen": -582.3228149414062, "logps/rejected": -962.0523681640625, "loss": 0.6498, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.038968753069639206, "rewards/margins": 0.06734822690486908, "rewards/rejected": -0.02837948501110077, "step": 50 }, { "epoch": 0.42, "grad_norm": 3.941251849542362, "learning_rate": 3.6045660633384666e-07, "logits/chosen": 0.25063854455947876, "logits/rejected": -0.7097708582878113, "logps/chosen": -553.4964599609375, "logps/rejected": -958.4275512695312, "loss": 0.6382, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.054004330188035965, "rewards/margins": 0.16838756203651428, "rewards/rejected": -0.1143832579255104, "step": 60 }, { "epoch": 0.49, "grad_norm": 3.6354849237348916, "learning_rate": 3.021795415808338e-07, "logits/chosen": 0.2733720541000366, "logits/rejected": -0.47119006514549255, "logps/chosen": -546.3372192382812, "logps/rejected": -853.4588012695312, "loss": 0.6301, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.05035785585641861, "rewards/margins": 0.14240628480911255, "rewards/rejected": -0.09204842895269394, "step": 70 }, { "epoch": 0.56, "grad_norm": 4.178631940798131, "learning_rate": 2.407257722268487e-07, "logits/chosen": 0.17784562706947327, "logits/rejected": -0.545692503452301, "logps/chosen": -513.2195434570312, "logps/rejected": -911.9866943359375, "loss": 0.6075, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.056872475892305374, "rewards/margins": 0.31286248564720154, "rewards/rejected": -0.25598999857902527, "step": 80 }, { "epoch": 0.63, "grad_norm": 3.917354076328557, "learning_rate": 1.798366203674768e-07, "logits/chosen": 0.3149748742580414, "logits/rejected": -0.6348077654838562, "logps/chosen": -531.9932861328125, "logps/rejected": -937.4884643554688, "loss": 0.6101, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.052259791642427444, "rewards/margins": 0.3956056237220764, "rewards/rejected": -0.3433458209037781, "step": 90 }, { "epoch": 0.7, "grad_norm": 4.051764538146448, "learning_rate": 1.232190340325567e-07, "logits/chosen": 0.29592758417129517, "logits/rejected": -0.5284489989280701, "logps/chosen": -476.4107360839844, "logps/rejected": -907.6671142578125, "loss": 0.5991, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.07296852767467499, "rewards/margins": 0.3048384487628937, "rewards/rejected": -0.2318699061870575, "step": 100 }, { "epoch": 0.7, "eval_logits/chosen": -0.001659675850532949, "eval_logits/rejected": -1.119870662689209, "eval_logps/chosen": -485.54742431640625, "eval_logps/rejected": -1095.5645751953125, "eval_loss": 0.6539692282676697, "eval_rewards/accuracies": 0.7777777910232544, "eval_rewards/chosen": 0.0512981191277504, "eval_rewards/margins": 0.10516992956399918, "eval_rewards/rejected": -0.05387180671095848, "eval_runtime": 191.4237, "eval_samples_per_second": 9.027, "eval_steps_per_second": 0.282, "step": 100 }, { "epoch": 0.77, "grad_norm": 4.049764432150976, "learning_rate": 7.431990718775249e-08, "logits/chosen": 0.20402272045612335, "logits/rejected": -0.6543243527412415, "logps/chosen": -562.2398681640625, "logps/rejected": -969.9613037109375, "loss": 0.5843, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 0.03413739800453186, "rewards/margins": 0.4432193338871002, "rewards/rejected": -0.40908199548721313, "step": 110 }, { "epoch": 0.85, "grad_norm": 4.618779944055989, "learning_rate": 3.6116231897871026e-08, "logits/chosen": 0.23138892650604248, "logits/rejected": -0.6778791546821594, "logps/chosen": -575.7042236328125, "logps/rejected": -981.9583129882812, "loss": 0.5762, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.045423753559589386, "rewards/margins": 0.4969947934150696, "rewards/rejected": -0.451570987701416, "step": 120 }, { "epoch": 0.92, "grad_norm": 4.683249145311086, "learning_rate": 1.0933858244968147e-08, "logits/chosen": 0.1574718952178955, "logits/rejected": -0.5027952194213867, "logps/chosen": -536.2589111328125, "logps/rejected": -921.5875854492188, "loss": 0.5781, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.050459813326597214, "rewards/margins": 0.41949304938316345, "rewards/rejected": -0.3690332770347595, "step": 130 }, { "epoch": 0.99, "grad_norm": 4.206754290452048, "learning_rate": 3.058959464346811e-10, "logits/chosen": 0.16254039108753204, "logits/rejected": -0.5900125503540039, "logps/chosen": -552.6573486328125, "logps/rejected": -974.9669799804688, "loss": 0.5848, "rewards/accuracies": 0.90625, "rewards/chosen": 0.05284979194402695, "rewards/margins": 0.2659812569618225, "rewards/rejected": -0.21313147246837616, "step": 140 }, { "epoch": 1.0, "step": 142, "total_flos": 0.0, "train_loss": 0.6277184326883772, "train_runtime": 2301.3874, "train_samples_per_second": 3.942, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 142, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }