{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987152034261242, "eval_steps": 50, "global_step": 465, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "chosen_logps": -95.33267211914062, "chosen_rewards": 0.0, "epoch": 0.006423982869379015, "grad_norm": 25.29540828053309, "learning_rate": 2.127659574468085e-08, "log_diff_policy": 3.4655230045318604, "logits": -1.0830460786819458, "logp_accuracy": 0.5416666865348816, "loss": 0.4586, "objective": 0.4850379526615143, "ranking_simple": 0.5416666865348816, "rejected_logps": -98.7981948852539, "rejected_rewards": 0.0, "reward_accuracy": 0.0, "step": 1 }, { "chosen_logps": -93.5468521118164, "chosen_rewards": 0.006756411399692297, "epoch": 0.032119914346895075, "grad_norm": 26.804077499485743, "learning_rate": 1.0638297872340425e-07, "log_diff_policy": -2.5355889797210693, "logits": -1.2330905199050903, "logp_accuracy": 0.46875, "loss": 0.4677, "objective": 0.4401715099811554, "ranking_simple": 0.46875, "rejected_logps": -91.01126861572266, "rejected_rewards": 0.004852155689150095, "reward_accuracy": 0.4166666567325592, "step": 5 }, { "chosen_logps": -92.04216766357422, "chosen_rewards": 0.017275752499699593, "epoch": 0.06423982869379015, "grad_norm": 25.126369649829037, "learning_rate": 2.127659574468085e-07, "log_diff_policy": 2.8229520320892334, "logits": -1.2231515645980835, "logp_accuracy": 0.5375000238418579, "loss": 0.4232, "objective": 0.4300234913825989, "ranking_simple": 0.5375000238418579, "rejected_logps": -94.86511993408203, "rejected_rewards": 0.011349495500326157, "reward_accuracy": 0.5625, "step": 10 }, { "chosen_logps": -92.67517852783203, "chosen_rewards": 0.024549953639507294, "epoch": 0.09635974304068523, "grad_norm": 27.00937611071894, "learning_rate": 3.1914893617021275e-07, "log_diff_policy": 2.5176284313201904, "logits": -1.2052661180496216, "logp_accuracy": 0.5458333492279053, "loss": 0.437, "objective": 0.40562108159065247, "ranking_simple": 0.5458333492279053, "rejected_logps": -95.19280242919922, "rejected_rewards": -0.002655384596437216, "reward_accuracy": 0.625, "step": 15 }, { "chosen_logps": -94.5601806640625, "chosen_rewards": 0.017942415550351143, "epoch": 0.1284796573875803, "grad_norm": 26.11627134386328, "learning_rate": 4.25531914893617e-07, "log_diff_policy": 1.416998267173767, "logits": -1.1798421144485474, "logp_accuracy": 0.5666666626930237, "loss": 0.4446, "objective": 0.4640241861343384, "ranking_simple": 0.5666666626930237, "rejected_logps": -95.97718048095703, "rejected_rewards": -0.004993783310055733, "reward_accuracy": 0.5249999761581421, "step": 20 }, { "chosen_logps": -92.53931427001953, "chosen_rewards": 0.029711071401834488, "epoch": 0.16059957173447537, "grad_norm": 25.27000879467044, "learning_rate": 5.319148936170212e-07, "log_diff_policy": 1.688908338546753, "logits": -1.2128067016601562, "logp_accuracy": 0.5541666746139526, "loss": 0.4246, "objective": 0.46693068742752075, "ranking_simple": 0.5541666746139526, "rejected_logps": -94.22822570800781, "rejected_rewards": -0.005338592454791069, "reward_accuracy": 0.6291666626930237, "step": 25 }, { "chosen_logps": -91.68006134033203, "chosen_rewards": 0.04121263697743416, "epoch": 0.19271948608137046, "grad_norm": 30.029105912075256, "learning_rate": 6.382978723404255e-07, "log_diff_policy": 3.065168857574463, "logits": -1.1860508918762207, "logp_accuracy": 0.5458333492279053, "loss": 0.4419, "objective": 0.45536676049232483, "ranking_simple": 0.5458333492279053, "rejected_logps": -94.74523162841797, "rejected_rewards": -0.0008615786791779101, "reward_accuracy": 0.6000000238418579, "step": 30 }, { "chosen_logps": -92.44457244873047, "chosen_rewards": 0.12479421496391296, "epoch": 0.22483940042826553, "grad_norm": 26.07370315896774, "learning_rate": 7.446808510638297e-07, "log_diff_policy": 1.213740587234497, "logits": -1.175734281539917, "logp_accuracy": 0.5375000238418579, "loss": 0.4299, "objective": 0.43924617767333984, "ranking_simple": 0.5375000238418579, "rejected_logps": -93.65831756591797, "rejected_rewards": 0.051300570368766785, "reward_accuracy": 0.7083333134651184, "step": 35 }, { "chosen_logps": -92.34425354003906, "chosen_rewards": 0.07783517986536026, "epoch": 0.2569593147751606, "grad_norm": 25.50361133750606, "learning_rate": 8.51063829787234e-07, "log_diff_policy": 3.3206112384796143, "logits": -1.296377420425415, "logp_accuracy": 0.5416666865348816, "loss": 0.4073, "objective": 0.389123797416687, "ranking_simple": 0.5416666865348816, "rejected_logps": -95.66484832763672, "rejected_rewards": 0.020596763119101524, "reward_accuracy": 0.612500011920929, "step": 40 }, { "chosen_logps": -93.70889282226562, "chosen_rewards": 0.043319471180438995, "epoch": 0.2890792291220557, "grad_norm": 25.913590629075212, "learning_rate": 9.574468085106384e-07, "log_diff_policy": 1.6663633584976196, "logits": -1.3147988319396973, "logp_accuracy": 0.5083333253860474, "loss": 0.4122, "objective": 0.4088054895401001, "ranking_simple": 0.5083333253860474, "rejected_logps": -95.37525177001953, "rejected_rewards": -0.028023438528180122, "reward_accuracy": 0.6625000238418579, "step": 45 }, { "chosen_logps": -93.3112564086914, "chosen_rewards": -0.01281350664794445, "epoch": 0.32119914346895073, "grad_norm": 28.249603087696812, "learning_rate": 9.998729100230496e-07, "log_diff_policy": 2.8222155570983887, "logits": -1.2909948825836182, "logp_accuracy": 0.5458333492279053, "loss": 0.3806, "objective": 0.3943421542644501, "ranking_simple": 0.5458333492279053, "rejected_logps": -96.13346099853516, "rejected_rewards": -0.11900155991315842, "reward_accuracy": 0.6958333253860474, "step": 50 }, { "epoch": 0.32119914346895073, "eval_chosen_logps": -96.1307601928711, "eval_chosen_rewards": -0.07925291359424591, "eval_log_diff_policy": 0.3852413296699524, "eval_logits": -1.2623121738433838, "eval_logp_accuracy": 0.48641303181648254, "eval_loss": 0.441945880651474, "eval_objective": 0.45166319608688354, "eval_ranking_simple": 0.48641303181648254, "eval_rejected_logps": -96.5160140991211, "eval_rejected_rewards": -0.12365706264972687, "eval_reward_accuracy": 0.626358687877655, "eval_runtime": 259.9006, "eval_samples_per_second": 16.956, "eval_steps_per_second": 0.708, "step": 50 }, { "chosen_logps": -93.82705688476562, "chosen_rewards": 0.08971155434846878, "epoch": 0.3533190578158458, "grad_norm": 30.58144755601296, "learning_rate": 9.990964830098245e-07, "log_diff_policy": -1.3893320560455322, "logits": -1.3590095043182373, "logp_accuracy": 0.47083333134651184, "loss": 0.3883, "objective": 0.3681974709033966, "ranking_simple": 0.47083333134651184, "rejected_logps": -92.43772888183594, "rejected_rewards": 0.0006031145458109677, "reward_accuracy": 0.6541666388511658, "step": 55 }, { "chosen_logps": -92.90776062011719, "chosen_rewards": 0.23018625378608704, "epoch": 0.3854389721627409, "grad_norm": 23.844856478231595, "learning_rate": 9.97615329515112e-07, "log_diff_policy": 1.9414401054382324, "logits": -1.3098902702331543, "logp_accuracy": 0.5, "loss": 0.3818, "objective": 0.40217113494873047, "ranking_simple": 0.5, "rejected_logps": -94.84921264648438, "rejected_rewards": 0.10347142070531845, "reward_accuracy": 0.7166666388511658, "step": 60 }, { "chosen_logps": -92.75626373291016, "chosen_rewards": 0.16772286593914032, "epoch": 0.41755888650963596, "grad_norm": 23.68646057563372, "learning_rate": 9.954315409343168e-07, "log_diff_policy": 2.559760332107544, "logits": -1.313169240951538, "logp_accuracy": 0.5458333492279053, "loss": 0.3786, "objective": 0.3682008385658264, "ranking_simple": 0.5458333492279053, "rejected_logps": -95.31603240966797, "rejected_rewards": 0.061052825301885605, "reward_accuracy": 0.675000011920929, "step": 65 }, { "chosen_logps": -94.4072036743164, "chosen_rewards": -0.10250440984964371, "epoch": 0.44967880085653106, "grad_norm": 29.818812525073238, "learning_rate": 9.925482007867483e-07, "log_diff_policy": 3.321316719055176, "logits": -1.2431812286376953, "logp_accuracy": 0.6000000238418579, "loss": 0.3636, "objective": 0.3513278365135193, "ranking_simple": 0.6000000238418579, "rejected_logps": -97.72850799560547, "rejected_rewards": -0.24798235297203064, "reward_accuracy": 0.7124999761581421, "step": 70 }, { "chosen_logps": -93.59406280517578, "chosen_rewards": -0.07322915643453598, "epoch": 0.4817987152034261, "grad_norm": 24.02331472718552, "learning_rate": 9.889693803616791e-07, "log_diff_policy": 3.919586658477783, "logits": -1.3332768678665161, "logp_accuracy": 0.5583333373069763, "loss": 0.3587, "objective": 0.34834274649620056, "ranking_simple": 0.5583333373069763, "rejected_logps": -97.51364135742188, "rejected_rewards": -0.24331532418727875, "reward_accuracy": 0.7958333492279053, "step": 75 }, { "chosen_logps": -92.71011352539062, "chosen_rewards": 0.024246821179986, "epoch": 0.5139186295503212, "grad_norm": 25.560188870297008, "learning_rate": 9.847001329696652e-07, "log_diff_policy": 0.9480332136154175, "logits": -1.2388370037078857, "logp_accuracy": 0.4958333373069763, "loss": 0.3478, "objective": 0.3841496407985687, "ranking_simple": 0.4958333373069763, "rejected_logps": -93.65815734863281, "rejected_rewards": -0.11854672431945801, "reward_accuracy": 0.762499988079071, "step": 80 }, { "chosen_logps": -93.56654357910156, "chosen_rewards": 0.05095100402832031, "epoch": 0.5460385438972163, "grad_norm": 25.013988585006754, "learning_rate": 9.797464868072486e-07, "log_diff_policy": 1.3223497867584229, "logits": -1.3439558744430542, "logp_accuracy": 0.5375000238418579, "loss": 0.3491, "objective": 0.3220667243003845, "ranking_simple": 0.5375000238418579, "rejected_logps": -94.88890075683594, "rejected_rewards": -0.12381982803344727, "reward_accuracy": 0.7541666626930237, "step": 85 }, { "chosen_logps": -94.28719329833984, "chosen_rewards": -0.06435166299343109, "epoch": 0.5781584582441114, "grad_norm": 23.082981317904384, "learning_rate": 9.741154364451177e-07, "log_diff_policy": 1.6581075191497803, "logits": -1.259577989578247, "logp_accuracy": 0.5416666865348816, "loss": 0.3556, "objective": 0.36888834834098816, "ranking_simple": 0.5416666865348816, "rejected_logps": -95.9452896118164, "rejected_rewards": -0.2110510617494583, "reward_accuracy": 0.7083333134651184, "step": 90 }, { "chosen_logps": -95.14549255371094, "chosen_rewards": -0.0923268124461174, "epoch": 0.6102783725910065, "grad_norm": 26.53982653266266, "learning_rate": 9.678149329517409e-07, "log_diff_policy": 0.934082567691803, "logits": -1.2546571493148804, "logp_accuracy": 0.5208333134651184, "loss": 0.3581, "objective": 0.3683505058288574, "ranking_simple": 0.5208333134651184, "rejected_logps": -96.07955932617188, "rejected_rewards": -0.2480526566505432, "reward_accuracy": 0.7166666388511658, "step": 95 }, { "chosen_logps": -94.09268951416016, "chosen_rewards": -0.0445202998816967, "epoch": 0.6423982869379015, "grad_norm": 25.760380699024093, "learning_rate": 9.608538726664224e-07, "log_diff_policy": 3.1634230613708496, "logits": -1.2221773862838745, "logp_accuracy": 0.5625, "loss": 0.3564, "objective": 0.32819443941116333, "ranking_simple": 0.5625, "rejected_logps": -97.256103515625, "rejected_rewards": -0.2686512768268585, "reward_accuracy": 0.8083333373069763, "step": 100 }, { "epoch": 0.6423982869379015, "eval_chosen_logps": -96.44969940185547, "eval_chosen_rewards": -0.11114552617073059, "eval_log_diff_policy": 0.5638590455055237, "eval_logits": -1.2708278894424438, "eval_logp_accuracy": 0.49320653080940247, "eval_loss": 0.4331083297729492, "eval_objective": 0.4486199617385864, "eval_ranking_simple": 0.49320653080940247, "eval_rejected_logps": -97.01356506347656, "eval_rejected_rewards": -0.17341142892837524, "eval_reward_accuracy": 0.6114130616188049, "eval_runtime": 253.1355, "eval_samples_per_second": 17.41, "eval_steps_per_second": 0.727, "step": 100 }, { "chosen_logps": -92.7875747680664, "chosen_rewards": 0.08123679459095001, "epoch": 0.6745182012847966, "grad_norm": 22.248093384444704, "learning_rate": 9.532420846376315e-07, "log_diff_policy": 1.5082199573516846, "logits": -1.2687418460845947, "logp_accuracy": 0.5, "loss": 0.3513, "objective": 0.3515733480453491, "ranking_simple": 0.5, "rejected_logps": -94.29579162597656, "rejected_rewards": -0.09235174208879471, "reward_accuracy": 0.7791666388511658, "step": 105 }, { "chosen_logps": -93.75534057617188, "chosen_rewards": 0.0705016627907753, "epoch": 0.7066381156316917, "grad_norm": 24.597464895572983, "learning_rate": 9.449903167443414e-07, "log_diff_policy": 2.2167460918426514, "logits": -1.3001577854156494, "logp_accuracy": 0.5583333373069763, "loss": 0.3418, "objective": 0.32322683930397034, "ranking_simple": 0.5583333373069763, "rejected_logps": -95.97208404541016, "rejected_rewards": -0.13538284599781036, "reward_accuracy": 0.8125, "step": 110 }, { "chosen_logps": -94.81763458251953, "chosen_rewards": 0.0830930545926094, "epoch": 0.7387580299785867, "grad_norm": 25.648417393945536, "learning_rate": 9.36110220519976e-07, "log_diff_policy": 1.995680570602417, "logits": -1.2974680662155151, "logp_accuracy": 0.4958333373069763, "loss": 0.333, "objective": 0.35374876856803894, "ranking_simple": 0.4958333373069763, "rejected_logps": -96.81330871582031, "rejected_rewards": -0.09351801127195358, "reward_accuracy": 0.7541666626930237, "step": 115 }, { "chosen_logps": -93.62238311767578, "chosen_rewards": 0.14073045551776886, "epoch": 0.7708779443254818, "grad_norm": 23.42316403560204, "learning_rate": 9.266143347003918e-07, "log_diff_policy": 2.7447712421417236, "logits": -1.3252606391906738, "logp_accuracy": 0.5708333253860474, "loss": 0.346, "objective": 0.3560214042663574, "ranking_simple": 0.5708333253860474, "rejected_logps": -96.36715698242188, "rejected_rewards": -0.0354219414293766, "reward_accuracy": 0.7333333492279053, "step": 120 }, { "chosen_logps": -89.9226303100586, "chosen_rewards": 0.2193768471479416, "epoch": 0.8029978586723768, "grad_norm": 28.765365425565363, "learning_rate": 9.165160675191271e-07, "log_diff_policy": 3.3330423831939697, "logits": -1.3067100048065186, "logp_accuracy": 0.5583333373069763, "loss": 0.3289, "objective": 0.31955718994140625, "ranking_simple": 0.5583333373069763, "rejected_logps": -93.25568389892578, "rejected_rewards": 0.027603892609477043, "reward_accuracy": 0.7708333134651184, "step": 125 }, { "chosen_logps": -91.68359375, "chosen_rewards": 0.30248725414276123, "epoch": 0.8351177730192719, "grad_norm": 22.611256720596312, "learning_rate": 9.058296777749152e-07, "log_diff_policy": 2.5365095138549805, "logits": -1.338497519493103, "logp_accuracy": 0.5375000238418579, "loss": 0.3351, "objective": 0.33691561222076416, "ranking_simple": 0.5375000238418579, "rejected_logps": -94.22010040283203, "rejected_rewards": 0.09416310489177704, "reward_accuracy": 0.7958333492279053, "step": 130 }, { "chosen_logps": -91.21343231201172, "chosen_rewards": 0.29551607370376587, "epoch": 0.867237687366167, "grad_norm": 25.296472840693795, "learning_rate": 8.945702546981968e-07, "log_diff_policy": 3.09786057472229, "logits": -1.3471318483352661, "logp_accuracy": 0.5416666865348816, "loss": 0.3418, "objective": 0.310558021068573, "ranking_simple": 0.5416666865348816, "rejected_logps": -94.31129455566406, "rejected_rewards": 0.1065615564584732, "reward_accuracy": 0.75, "step": 135 }, { "chosen_logps": -90.44215393066406, "chosen_rewards": 0.34312593936920166, "epoch": 0.8993576017130621, "grad_norm": 22.68211418129166, "learning_rate": 8.827536966450583e-07, "log_diff_policy": 4.385303020477295, "logits": -1.3340150117874146, "logp_accuracy": 0.574999988079071, "loss": 0.3314, "objective": 0.32155176997184753, "ranking_simple": 0.574999988079071, "rejected_logps": -94.82746124267578, "rejected_rewards": 0.131618469953537, "reward_accuracy": 0.7916666865348816, "step": 140 }, { "chosen_logps": -89.7942123413086, "chosen_rewards": 0.3487202823162079, "epoch": 0.9314775160599572, "grad_norm": 22.71675078878125, "learning_rate": 8.703966886486818e-07, "log_diff_policy": 3.6614508628845215, "logits": -1.3244807720184326, "logp_accuracy": 0.5791666507720947, "loss": 0.3203, "objective": 0.29724279046058655, "ranking_simple": 0.5791666507720947, "rejected_logps": -93.45568084716797, "rejected_rewards": 0.10642104595899582, "reward_accuracy": 0.8291666507720947, "step": 145 }, { "chosen_logps": -91.82856750488281, "chosen_rewards": 0.3639621436595917, "epoch": 0.9635974304068522, "grad_norm": 23.50039406775209, "learning_rate": 8.57516678860003e-07, "log_diff_policy": 3.226095676422119, "logits": -1.2904103994369507, "logp_accuracy": 0.5249999761581421, "loss": 0.3184, "objective": 0.30414384603500366, "ranking_simple": 0.5249999761581421, "rejected_logps": -95.05464935302734, "rejected_rewards": 0.13737621903419495, "reward_accuracy": 0.8041666746139526, "step": 150 }, { "epoch": 0.9635974304068522, "eval_chosen_logps": -93.31350708007812, "eval_chosen_rewards": 0.20247410237789154, "eval_log_diff_policy": 0.6347798705101013, "eval_logits": -1.2473042011260986, "eval_logp_accuracy": 0.49728259444236755, "eval_loss": 0.4228622317314148, "eval_objective": 0.4380165934562683, "eval_ranking_simple": 0.49728259444236755, "eval_rejected_logps": -93.94828796386719, "eval_rejected_rewards": 0.13311611115932465, "eval_reward_accuracy": 0.623641312122345, "eval_runtime": 252.8222, "eval_samples_per_second": 17.431, "eval_steps_per_second": 0.728, "step": 150 }, { "chosen_logps": -89.85877227783203, "chosen_rewards": 0.4100703001022339, "epoch": 0.9957173447537473, "grad_norm": 24.044882848353204, "learning_rate": 8.441318539108432e-07, "log_diff_policy": 2.9472341537475586, "logits": -1.1912480592727661, "logp_accuracy": 0.5666666626930237, "loss": 0.3312, "objective": 0.3233080506324768, "ranking_simple": 0.5666666626930237, "rejected_logps": -92.80601501464844, "rejected_rewards": 0.15804290771484375, "reward_accuracy": 0.824999988079071, "step": 155 }, { "chosen_logps": -89.56094360351562, "chosen_rewards": 0.41586926579475403, "epoch": 1.0278372591006424, "grad_norm": 24.50044066310441, "learning_rate": 8.302611132343041e-07, "log_diff_policy": 2.900397777557373, "logits": -1.2766525745391846, "logp_accuracy": 0.5625, "loss": 0.2781, "objective": 0.301949679851532, "ranking_simple": 0.5625, "rejected_logps": -92.46134185791016, "rejected_rewards": 0.17179752886295319, "reward_accuracy": 0.8333333134651184, "step": 160 }, { "chosen_logps": -90.14144134521484, "chosen_rewards": 0.4032049775123596, "epoch": 1.0599571734475375, "grad_norm": 20.25292316832607, "learning_rate": 8.159240423786819e-07, "log_diff_policy": 3.3492562770843506, "logits": -1.2823920249938965, "logp_accuracy": 0.5541666746139526, "loss": 0.2616, "objective": 0.2580924332141876, "ranking_simple": 0.5541666746139526, "rejected_logps": -93.4906997680664, "rejected_rewards": 0.08329837024211884, "reward_accuracy": 0.8999999761581421, "step": 165 }, { "chosen_logps": -86.15848541259766, "chosen_rewards": 0.3804631531238556, "epoch": 1.0920770877944326, "grad_norm": 24.076827597953375, "learning_rate": 8.011408853525861e-07, "log_diff_policy": 4.773606300354004, "logits": -1.277355432510376, "logp_accuracy": 0.625, "loss": 0.2737, "objective": 0.24181896448135376, "ranking_simple": 0.625, "rejected_logps": -90.93209838867188, "rejected_rewards": 0.08991222083568573, "reward_accuracy": 0.8541666865348816, "step": 170 }, { "chosen_logps": -89.10989379882812, "chosen_rewards": 0.42462998628616333, "epoch": 1.1241970021413277, "grad_norm": 22.19365033726379, "learning_rate": 7.859325160403071e-07, "log_diff_policy": 6.029292106628418, "logits": -1.245682954788208, "logp_accuracy": 0.625, "loss": 0.2661, "objective": 0.26358726620674133, "ranking_simple": 0.625, "rejected_logps": -95.13919067382812, "rejected_rewards": 0.10703911632299423, "reward_accuracy": 0.9083333611488342, "step": 175 }, { "chosen_logps": -88.6956558227539, "chosen_rewards": 0.44047945737838745, "epoch": 1.1563169164882228, "grad_norm": 25.51019205806316, "learning_rate": 7.703204087277988e-07, "log_diff_policy": 1.8489243984222412, "logits": -1.2523506879806519, "logp_accuracy": 0.5625, "loss": 0.2561, "objective": 0.2906259596347809, "ranking_simple": 0.5625, "rejected_logps": -90.5445785522461, "rejected_rewards": 0.15187840163707733, "reward_accuracy": 0.862500011920929, "step": 180 }, { "chosen_logps": -89.39094543457031, "chosen_rewards": 0.4826006591320038, "epoch": 1.1884368308351179, "grad_norm": 22.73861838365072, "learning_rate": 7.543266077808892e-07, "log_diff_policy": 2.7822341918945312, "logits": -1.2815226316452026, "logp_accuracy": 0.5333333611488342, "loss": 0.2677, "objective": 0.2516113519668579, "ranking_simple": 0.5333333611488342, "rejected_logps": -92.17317962646484, "rejected_rewards": 0.193745955824852, "reward_accuracy": 0.8500000238418579, "step": 185 }, { "chosen_logps": -88.03435516357422, "chosen_rewards": 0.4610452651977539, "epoch": 1.2205567451820127, "grad_norm": 21.163830443635362, "learning_rate": 7.379736965185368e-07, "log_diff_policy": 4.70356559753418, "logits": -1.3062714338302612, "logp_accuracy": 0.5833333134651184, "loss": 0.2645, "objective": 0.2658800184726715, "ranking_simple": 0.5833333134651184, "rejected_logps": -92.7379150390625, "rejected_rewards": 0.17333266139030457, "reward_accuracy": 0.8416666388511658, "step": 190 }, { "chosen_logps": -88.67347717285156, "chosen_rewards": 0.4372544586658478, "epoch": 1.252676659528908, "grad_norm": 24.288282084732373, "learning_rate": 7.212847653250828e-07, "log_diff_policy": 4.752214431762695, "logits": -1.1994919776916504, "logp_accuracy": 0.5874999761581421, "loss": 0.2712, "objective": 0.25489628314971924, "ranking_simple": 0.5874999761581421, "rejected_logps": -93.42569732666016, "rejected_rewards": 0.17621363699436188, "reward_accuracy": 0.8083333373069763, "step": 195 }, { "chosen_logps": -89.87748718261719, "chosen_rewards": 0.39736318588256836, "epoch": 1.284796573875803, "grad_norm": 21.540888108656127, "learning_rate": 7.042833790465241e-07, "log_diff_policy": 1.4937970638275146, "logits": -1.1533904075622559, "logp_accuracy": 0.5375000238418579, "loss": 0.2504, "objective": 0.27253374457359314, "ranking_simple": 0.5375000238418579, "rejected_logps": -91.37129211425781, "rejected_rewards": 0.10253804177045822, "reward_accuracy": 0.8500000238418579, "step": 200 }, { "epoch": 1.284796573875803, "eval_chosen_logps": -93.76664733886719, "eval_chosen_rewards": 0.1571601778268814, "eval_log_diff_policy": 0.6747260689735413, "eval_logits": -1.2087041139602661, "eval_logp_accuracy": 0.49184781312942505, "eval_loss": 0.41806715726852417, "eval_objective": 0.4327898323535919, "eval_ranking_simple": 0.49184781312942505, "eval_rejected_logps": -94.44136047363281, "eval_rejected_rewards": 0.0838075801730156, "eval_reward_accuracy": 0.645380437374115, "eval_runtime": 252.0687, "eval_samples_per_second": 17.483, "eval_steps_per_second": 0.73, "step": 200 }, { "chosen_logps": -92.08726501464844, "chosen_rewards": 0.3464154005050659, "epoch": 1.316916488222698, "grad_norm": 21.70695609823354, "learning_rate": 6.869935437168449e-07, "log_diff_policy": 5.8689703941345215, "logits": -1.2797820568084717, "logp_accuracy": 0.612500011920929, "loss": 0.2613, "objective": 0.24516534805297852, "ranking_simple": 0.612500011920929, "rejected_logps": -97.95622253417969, "rejected_rewards": 0.01421588845551014, "reward_accuracy": 0.9041666388511658, "step": 205 }, { "chosen_logps": -90.03158569335938, "chosen_rewards": 0.27640679478645325, "epoch": 1.3490364025695931, "grad_norm": 21.229628621826144, "learning_rate": 6.694396726613882e-07, "log_diff_policy": 2.6674647331237793, "logits": -1.2098687887191772, "logp_accuracy": 0.5375000238418579, "loss": 0.2515, "objective": 0.2278667390346527, "ranking_simple": 0.5375000238418579, "rejected_logps": -92.69905853271484, "rejected_rewards": 0.0070241158828139305, "reward_accuracy": 0.8500000238418579, "step": 210 }, { "chosen_logps": -90.69125366210938, "chosen_rewards": 0.29854997992515564, "epoch": 1.3811563169164882, "grad_norm": 22.55437425298298, "learning_rate": 6.516465520251313e-07, "log_diff_policy": 2.911858558654785, "logits": -1.1753002405166626, "logp_accuracy": 0.5416666865348816, "loss": 0.2539, "objective": 0.2530354857444763, "ranking_simple": 0.5416666865348816, "rejected_logps": -93.60311889648438, "rejected_rewards": 0.012208380736410618, "reward_accuracy": 0.887499988079071, "step": 215 }, { "chosen_logps": -90.42436981201172, "chosen_rewards": 0.3347964882850647, "epoch": 1.4132762312633833, "grad_norm": 27.739707396423526, "learning_rate": 6.336393057745364e-07, "log_diff_policy": 4.839695453643799, "logits": -1.214857578277588, "logp_accuracy": 0.5958333611488342, "loss": 0.2568, "objective": 0.2565983235836029, "ranking_simple": 0.5958333611488342, "rejected_logps": -95.2640609741211, "rejected_rewards": 0.011433997191488743, "reward_accuracy": 0.8708333373069763, "step": 220 }, { "chosen_logps": -90.56299591064453, "chosen_rewards": 0.3776172697544098, "epoch": 1.4453961456102784, "grad_norm": 21.760281393589654, "learning_rate": 6.154433602223978e-07, "log_diff_policy": 2.292363166809082, "logits": -1.2873131036758423, "logp_accuracy": 0.5583333373069763, "loss": 0.2551, "objective": 0.23114843666553497, "ranking_simple": 0.5583333373069763, "rejected_logps": -92.85536193847656, "rejected_rewards": 0.0627303421497345, "reward_accuracy": 0.925000011920929, "step": 225 }, { "chosen_logps": -89.62771606445312, "chosen_rewards": 0.4355151057243347, "epoch": 1.4775160599571735, "grad_norm": 21.763063732173002, "learning_rate": 5.970844081257733e-07, "log_diff_policy": 2.696823835372925, "logits": -1.1932998895645142, "logp_accuracy": 0.5833333134651184, "loss": 0.259, "objective": 0.2560196816921234, "ranking_simple": 0.5833333134651184, "rejected_logps": -92.32455444335938, "rejected_rewards": 0.14893841743469238, "reward_accuracy": 0.8500000238418579, "step": 230 }, { "chosen_logps": -87.42562866210938, "chosen_rewards": 0.4877433478832245, "epoch": 1.5096359743040684, "grad_norm": 26.09605026265361, "learning_rate": 5.78588372407695e-07, "log_diff_policy": 3.6128320693969727, "logits": -1.272985577583313, "logp_accuracy": 0.5458333492279053, "loss": 0.2616, "objective": 0.2515460252761841, "ranking_simple": 0.5458333492279053, "rejected_logps": -91.03845977783203, "rejected_rewards": 0.15287213027477264, "reward_accuracy": 0.9083333611488342, "step": 235 }, { "chosen_logps": -88.16352081298828, "chosen_rewards": 0.4461686313152313, "epoch": 1.5417558886509637, "grad_norm": 26.348174118085442, "learning_rate": 5.599813695538865e-07, "log_diff_policy": 3.9842584133148193, "logits": -1.2812697887420654, "logp_accuracy": 0.5874999761581421, "loss": 0.2589, "objective": 0.26638758182525635, "ranking_simple": 0.5874999761581421, "rejected_logps": -92.14775848388672, "rejected_rewards": 0.1892513781785965, "reward_accuracy": 0.8125, "step": 240 }, { "chosen_logps": -89.65911102294922, "chosen_rewards": 0.5600944757461548, "epoch": 1.5738758029978586, "grad_norm": 26.608934442683772, "learning_rate": 5.412896727361662e-07, "log_diff_policy": 4.081780433654785, "logits": -1.2721847295761108, "logp_accuracy": 0.6083333492279053, "loss": 0.2494, "objective": 0.24282366037368774, "ranking_simple": 0.6083333492279053, "rejected_logps": -93.74089813232422, "rejected_rewards": 0.2519553005695343, "reward_accuracy": 0.8666666746139526, "step": 245 }, { "chosen_logps": -87.64533996582031, "chosen_rewards": 0.5675151348114014, "epoch": 1.6059957173447539, "grad_norm": 27.440611511048427, "learning_rate": 5.225396747146111e-07, "log_diff_policy": 5.388829231262207, "logits": -1.3017712831497192, "logp_accuracy": 0.6083333492279053, "loss": 0.2565, "objective": 0.2558087110519409, "ranking_simple": 0.6083333492279053, "rejected_logps": -93.03416442871094, "rejected_rewards": 0.23212392628192902, "reward_accuracy": 0.8999999761581421, "step": 250 }, { "epoch": 1.6059957173447539, "eval_chosen_logps": -92.09646606445312, "eval_chosen_rewards": 0.324177086353302, "eval_log_diff_policy": 0.6351930499076843, "eval_logits": -1.2578530311584473, "eval_logp_accuracy": 0.4945652186870575, "eval_loss": 0.42034390568733215, "eval_objective": 0.4386288523674011, "eval_ranking_simple": 0.4945652186870575, "eval_rejected_logps": -92.7316665649414, "eval_rejected_rewards": 0.25477781891822815, "eval_reward_accuracy": 0.6277173757553101, "eval_runtime": 251.3822, "eval_samples_per_second": 17.531, "eval_steps_per_second": 0.732, "step": 250 }, { "chosen_logps": -87.76354217529297, "chosen_rewards": 0.571170449256897, "epoch": 1.6381156316916488, "grad_norm": 23.82842886186071, "learning_rate": 5.03757850570861e-07, "log_diff_policy": 5.523890018463135, "logits": -1.3106282949447632, "logp_accuracy": 0.612500011920929, "loss": 0.2317, "objective": 0.2333315759897232, "ranking_simple": 0.612500011920929, "rejected_logps": -93.28743743896484, "rejected_rewards": 0.24962659180164337, "reward_accuracy": 0.9333333373069763, "step": 255 }, { "chosen_logps": -89.83170318603516, "chosen_rewards": 0.5567531585693359, "epoch": 1.6702355460385439, "grad_norm": 23.51594850165469, "learning_rate": 4.849707203251826e-07, "log_diff_policy": 1.3650144338607788, "logits": -1.3435307741165161, "logp_accuracy": 0.5083333253860474, "loss": 0.2477, "objective": 0.22092224657535553, "ranking_simple": 0.5083333253860474, "rejected_logps": -91.19673156738281, "rejected_rewards": 0.25820502638816833, "reward_accuracy": 0.9333333373069763, "step": 260 }, { "chosen_logps": -87.76793670654297, "chosen_rewards": 0.5400545597076416, "epoch": 1.702355460385439, "grad_norm": 27.58860935242549, "learning_rate": 4.6620481149008364e-07, "log_diff_policy": 5.315587043762207, "logits": -1.289357304573059, "logp_accuracy": 0.6166666746139526, "loss": 0.2577, "objective": 0.24521103501319885, "ranking_simple": 0.6166666746139526, "rejected_logps": -93.08353424072266, "rejected_rewards": 0.24422727525234222, "reward_accuracy": 0.8833333253860474, "step": 265 }, { "chosen_logps": -88.39920806884766, "chosen_rewards": 0.47542911767959595, "epoch": 1.734475374732334, "grad_norm": 23.041909074983945, "learning_rate": 4.474866216133433e-07, "log_diff_policy": 4.8622894287109375, "logits": -1.2672302722930908, "logp_accuracy": 0.574999988079071, "loss": 0.2556, "objective": 0.24352595210075378, "ranking_simple": 0.574999988079071, "rejected_logps": -93.2614974975586, "rejected_rewards": 0.1576487272977829, "reward_accuracy": 0.9041666388511658, "step": 270 }, { "chosen_logps": -90.29805755615234, "chosen_rewards": 0.49421098828315735, "epoch": 1.7665952890792291, "grad_norm": 22.326208670509114, "learning_rate": 4.2884258086335745e-07, "log_diff_policy": 0.5838987827301025, "logits": -1.2083064317703247, "logp_accuracy": 0.48750001192092896, "loss": 0.2457, "objective": 0.24729198217391968, "ranking_simple": 0.48750001192092896, "rejected_logps": -90.8819580078125, "rejected_rewards": 0.19118016958236694, "reward_accuracy": 0.9125000238418579, "step": 275 }, { "chosen_logps": -88.4253921508789, "chosen_rewards": 0.5429556369781494, "epoch": 1.7987152034261242, "grad_norm": 22.630252805144902, "learning_rate": 4.1029901470962105e-07, "log_diff_policy": 4.2560200691223145, "logits": -1.2564666271209717, "logp_accuracy": 0.574999988079071, "loss": 0.2439, "objective": 0.22673402726650238, "ranking_simple": 0.574999988079071, "rejected_logps": -92.6814193725586, "rejected_rewards": 0.24279657006263733, "reward_accuracy": 0.887499988079071, "step": 280 }, { "chosen_logps": -88.12873077392578, "chosen_rewards": 0.5024438500404358, "epoch": 1.8308351177730193, "grad_norm": 24.48637670779026, "learning_rate": 3.918821067510464e-07, "log_diff_policy": 3.8860011100769043, "logits": -1.2542107105255127, "logp_accuracy": 0.574999988079071, "loss": 0.2385, "objective": 0.2421007752418518, "ranking_simple": 0.574999988079071, "rejected_logps": -92.01472473144531, "rejected_rewards": 0.19595599174499512, "reward_accuracy": 0.8999999761581421, "step": 285 }, { "chosen_logps": -88.56678771972656, "chosen_rewards": 0.43759381771087646, "epoch": 1.8629550321199142, "grad_norm": 23.278909184081943, "learning_rate": 3.7361786174460406e-07, "log_diff_policy": 4.839540004730225, "logits": -1.2234069108963013, "logp_accuracy": 0.5874999761581421, "loss": 0.2499, "objective": 0.2379584163427353, "ranking_simple": 0.5874999761581421, "rejected_logps": -93.40633392333984, "rejected_rewards": 0.10591766983270645, "reward_accuracy": 0.9291666746139526, "step": 290 }, { "chosen_logps": -89.0399169921875, "chosen_rewards": 0.4158915877342224, "epoch": 1.8950749464668095, "grad_norm": 25.094867990393954, "learning_rate": 3.5553206888648885e-07, "log_diff_policy": 4.1703782081604, "logits": -1.16842520236969, "logp_accuracy": 0.6291666626930237, "loss": 0.2596, "objective": 0.2710338830947876, "ranking_simple": 0.6291666626930237, "rejected_logps": -93.21028900146484, "rejected_rewards": 0.09861068427562714, "reward_accuracy": 0.8958333134651184, "step": 295 }, { "chosen_logps": -90.71745300292969, "chosen_rewards": 0.37180519104003906, "epoch": 1.9271948608137044, "grad_norm": 25.115297616399094, "learning_rate": 3.3765026539765827e-07, "log_diff_policy": 2.186328649520874, "logits": -1.2334167957305908, "logp_accuracy": 0.550000011920929, "loss": 0.2468, "objective": 0.2339223474264145, "ranking_simple": 0.550000011920929, "rejected_logps": -92.9037857055664, "rejected_rewards": 0.09225444495677948, "reward_accuracy": 0.875, "step": 300 }, { "epoch": 1.9271948608137044, "eval_chosen_logps": -93.93910217285156, "eval_chosen_rewards": 0.13991419970989227, "eval_log_diff_policy": 0.6116326451301575, "eval_logits": -1.2023948431015015, "eval_logp_accuracy": 0.49184781312942505, "eval_loss": 0.41765785217285156, "eval_objective": 0.43168357014656067, "eval_ranking_simple": 0.49184781312942505, "eval_rejected_logps": -94.55073547363281, "eval_rejected_rewards": 0.07287092506885529, "eval_reward_accuracy": 0.625, "eval_runtime": 249.9114, "eval_samples_per_second": 17.634, "eval_steps_per_second": 0.736, "step": 300 }, { "chosen_logps": -90.62615203857422, "chosen_rewards": 0.4162503182888031, "epoch": 1.9593147751605997, "grad_norm": 22.08186105341806, "learning_rate": 3.1999770046516194e-07, "log_diff_policy": 3.878814935684204, "logits": -1.2082240581512451, "logp_accuracy": 0.5666666626930237, "loss": 0.2341, "objective": 0.23510180413722992, "ranking_simple": 0.5666666626930237, "rejected_logps": -94.5049819946289, "rejected_rewards": 0.07595311105251312, "reward_accuracy": 0.9333333373069763, "step": 305 }, { "chosen_logps": -92.52909851074219, "chosen_rewards": 0.3378387689590454, "epoch": 1.9914346895074946, "grad_norm": 25.92192901477547, "learning_rate": 3.0259929959017584e-07, "log_diff_policy": 1.9836726188659668, "logits": -1.1833205223083496, "logp_accuracy": 0.5291666388511658, "loss": 0.2465, "objective": 0.23329707980155945, "ranking_simple": 0.5291666388511658, "rejected_logps": -94.51278686523438, "rejected_rewards": 0.022924652323126793, "reward_accuracy": 0.8708333373069763, "step": 310 }, { "chosen_logps": -90.68253326416016, "chosen_rewards": 0.34999939799308777, "epoch": 2.02355460385439, "grad_norm": 22.748913052851215, "learning_rate": 2.8547962939308186e-07, "log_diff_policy": 5.066477298736572, "logits": -1.2325475215911865, "logp_accuracy": 0.5833333134651184, "loss": 0.2016, "objective": 0.19704188406467438, "ranking_simple": 0.5833333134651184, "rejected_logps": -95.7490234375, "rejected_rewards": 0.030457384884357452, "reward_accuracy": 0.9458333253860474, "step": 315 }, { "chosen_logps": -90.06270599365234, "chosen_rewards": 0.4109943211078644, "epoch": 2.0556745182012848, "grad_norm": 22.556229957362657, "learning_rate": 2.686628629252898e-07, "log_diff_policy": 3.9403927326202393, "logits": -1.2493319511413574, "logp_accuracy": 0.5625, "loss": 0.2152, "objective": 0.22448262572288513, "ranking_simple": 0.5625, "rejected_logps": -94.00309753417969, "rejected_rewards": 0.05938427150249481, "reward_accuracy": 0.9166666865348816, "step": 320 }, { "chosen_logps": -91.30533599853516, "chosen_rewards": 0.2977936565876007, "epoch": 2.08779443254818, "grad_norm": 25.49323725569356, "learning_rate": 2.521727455367797e-07, "log_diff_policy": 3.5190927982330322, "logits": -1.1692496538162231, "logp_accuracy": 0.5458333492279053, "loss": 0.1983, "objective": 0.20966817438602448, "ranking_simple": 0.5458333492279053, "rejected_logps": -94.82442474365234, "rejected_rewards": -0.040650624781847, "reward_accuracy": 0.9083333611488342, "step": 325 }, { "chosen_logps": -90.49507141113281, "chosen_rewards": 0.33394020795822144, "epoch": 2.119914346895075, "grad_norm": 30.205833714302745, "learning_rate": 2.3603256134756062e-07, "log_diff_policy": 2.7719154357910156, "logits": -1.2458268404006958, "logp_accuracy": 0.5458333492279053, "loss": 0.1989, "objective": 0.22537565231323242, "ranking_simple": 0.5458333492279053, "rejected_logps": -93.26697540283203, "rejected_rewards": 0.032294537872076035, "reward_accuracy": 0.875, "step": 330 }, { "chosen_logps": -92.18978881835938, "chosen_rewards": 0.351632684469223, "epoch": 2.15203426124197, "grad_norm": 22.97866369635426, "learning_rate": 2.202651003703885e-07, "log_diff_policy": 1.8995152711868286, "logits": -1.2143503427505493, "logp_accuracy": 0.5416666865348816, "loss": 0.1873, "objective": 0.171888530254364, "ranking_simple": 0.5416666865348816, "rejected_logps": -94.08928680419922, "rejected_rewards": 0.027240337803959846, "reward_accuracy": 0.9125000238418579, "step": 335 }, { "chosen_logps": -90.46728515625, "chosen_rewards": 0.3730458617210388, "epoch": 2.184154175588865, "grad_norm": 23.979447378441623, "learning_rate": 2.0489262633116532e-07, "log_diff_policy": 3.6224582195281982, "logits": -1.1927863359451294, "logp_accuracy": 0.5916666388511658, "loss": 0.2088, "objective": 0.18619689345359802, "ranking_simple": 0.5916666388511658, "rejected_logps": -94.08972930908203, "rejected_rewards": 0.035482097417116165, "reward_accuracy": 0.9333333373069763, "step": 340 }, { "chosen_logps": -89.31218719482422, "chosen_rewards": 0.4046629071235657, "epoch": 2.21627408993576, "grad_norm": 27.635580381342287, "learning_rate": 1.899368452324584e-07, "log_diff_policy": 5.521125316619873, "logits": -1.2124210596084595, "logp_accuracy": 0.5874999761581421, "loss": 0.2067, "objective": 0.2378748506307602, "ranking_simple": 0.5874999761581421, "rejected_logps": -94.83332824707031, "rejected_rewards": 0.06822724640369415, "reward_accuracy": 0.9166666865348816, "step": 345 }, { "chosen_logps": -89.45582580566406, "chosen_rewards": 0.40281572937965393, "epoch": 2.2483940042826553, "grad_norm": 22.829183036104794, "learning_rate": 1.7541887470452604e-07, "log_diff_policy": 3.7678110599517822, "logits": -1.207155704498291, "logp_accuracy": 0.5708333253860474, "loss": 0.1956, "objective": 0.20592695474624634, "ranking_simple": 0.5708333253860474, "rejected_logps": -93.22364807128906, "rejected_rewards": 0.05199350416660309, "reward_accuracy": 0.9416666626930237, "step": 350 }, { "epoch": 2.2483940042826553, "eval_chosen_logps": -93.90203857421875, "eval_chosen_rewards": 0.14362077414989471, "eval_log_diff_policy": 0.6461811661720276, "eval_logits": -1.2088615894317627, "eval_logp_accuracy": 0.49184781312942505, "eval_loss": 0.418244332075119, "eval_objective": 0.431539922952652, "eval_ranking_simple": 0.49184781312942505, "eval_rejected_logps": -94.54822540283203, "eval_rejected_rewards": 0.07312265783548355, "eval_reward_accuracy": 0.6304348111152649, "eval_runtime": 251.7686, "eval_samples_per_second": 17.504, "eval_steps_per_second": 0.731, "step": 350 }, { "chosen_logps": -91.58312225341797, "chosen_rewards": 0.4052427411079407, "epoch": 2.28051391862955, "grad_norm": 23.31116170327188, "learning_rate": 1.6135921418712955e-07, "log_diff_policy": 1.2887767553329468, "logits": -1.2191410064697266, "logp_accuracy": 0.5458333492279053, "loss": 0.2078, "objective": 0.2148849070072174, "ranking_simple": 0.5458333492279053, "rejected_logps": -92.87187957763672, "rejected_rewards": 0.07918362319469452, "reward_accuracy": 0.9458333253860474, "step": 355 }, { "chosen_logps": -90.15478515625, "chosen_rewards": 0.4006979763507843, "epoch": 2.3126338329764455, "grad_norm": 22.615717682916863, "learning_rate": 1.4777771598423144e-07, "log_diff_policy": 2.372527599334717, "logits": -1.217307209968567, "logp_accuracy": 0.5791666507720947, "loss": 0.1982, "objective": 0.18126362562179565, "ranking_simple": 0.5791666507720947, "rejected_logps": -92.5273208618164, "rejected_rewards": 0.06262332946062088, "reward_accuracy": 0.949999988079071, "step": 360 }, { "chosen_logps": -89.39469146728516, "chosen_rewards": 0.3872474730014801, "epoch": 2.3447537473233404, "grad_norm": 24.070852930137136, "learning_rate": 1.34693557232453e-07, "log_diff_policy": 3.9540040493011475, "logits": -1.2530477046966553, "logp_accuracy": 0.5666666626930237, "loss": 0.1924, "objective": 0.1832498162984848, "ranking_simple": 0.5666666626930237, "rejected_logps": -93.34870147705078, "rejected_rewards": 0.03228560462594032, "reward_accuracy": 0.9208333492279053, "step": 365 }, { "chosen_logps": -88.58394622802734, "chosen_rewards": 0.3574458062648773, "epoch": 2.3768736616702357, "grad_norm": 22.98134707468111, "learning_rate": 1.2212521282287093e-07, "log_diff_policy": 4.671655654907227, "logits": -1.227338194847107, "logp_accuracy": 0.5666666626930237, "loss": 0.1938, "objective": 0.1913188248872757, "ranking_simple": 0.5666666626930237, "rejected_logps": -93.2555923461914, "rejected_rewards": 0.008650337345898151, "reward_accuracy": 0.9375, "step": 370 }, { "chosen_logps": -90.09524536132812, "chosen_rewards": 0.35855621099472046, "epoch": 2.4089935760171306, "grad_norm": 24.612617872840463, "learning_rate": 1.1009042931438783e-07, "log_diff_policy": 3.6620209217071533, "logits": -1.280255913734436, "logp_accuracy": 0.5666666626930237, "loss": 0.2022, "objective": 0.195404514670372, "ranking_simple": 0.5666666626930237, "rejected_logps": -93.75727844238281, "rejected_rewards": 0.014202169142663479, "reward_accuracy": 0.9375, "step": 375 }, { "chosen_logps": -89.82514953613281, "chosen_rewards": 0.3849697709083557, "epoch": 2.4411134903640255, "grad_norm": 23.590016368760683, "learning_rate": 9.860619987551156e-08, "log_diff_policy": 5.329193592071533, "logits": -1.2197004556655884, "logp_accuracy": 0.612500011920929, "loss": 0.1915, "objective": 0.22053636610507965, "ranking_simple": 0.612500011920929, "rejected_logps": -95.15433502197266, "rejected_rewards": 0.023265231400728226, "reward_accuracy": 0.9375, "step": 380 }, { "chosen_logps": -92.0965576171875, "chosen_rewards": 0.3651116192340851, "epoch": 2.473233404710921, "grad_norm": 22.03122151196825, "learning_rate": 8.768874028992429e-08, "log_diff_policy": 2.3904306888580322, "logits": -1.218959093093872, "logp_accuracy": 0.5333333611488342, "loss": 0.2005, "objective": 0.22580821812152863, "ranking_simple": 0.5333333611488342, "rejected_logps": -94.48699188232422, "rejected_rewards": 0.058130040764808655, "reward_accuracy": 0.9083333611488342, "step": 385 }, { "chosen_logps": -89.82624816894531, "chosen_rewards": 0.4091958701610565, "epoch": 2.505353319057816, "grad_norm": 21.19785067339645, "learning_rate": 7.735346605972321e-08, "log_diff_policy": 6.387943744659424, "logits": -1.235397458076477, "logp_accuracy": 0.6416666507720947, "loss": 0.1936, "objective": 0.16518478095531464, "ranking_simple": 0.6416666507720947, "rejected_logps": -96.21418762207031, "rejected_rewards": 0.04784006625413895, "reward_accuracy": 0.9416666626930237, "step": 390 }, { "chosen_logps": -89.0674819946289, "chosen_rewards": 0.42163580656051636, "epoch": 2.537473233404711, "grad_norm": 21.39369406236078, "learning_rate": 6.761497063866206e-08, "log_diff_policy": 4.555739402770996, "logits": -1.271119236946106, "logp_accuracy": 0.5958333611488342, "loss": 0.2032, "objective": 0.18886464834213257, "ranking_simple": 0.5958333611488342, "rejected_logps": -93.62320709228516, "rejected_rewards": 0.10697247833013535, "reward_accuracy": 0.8958333134651184, "step": 395 }, { "chosen_logps": -90.2821273803711, "chosen_rewards": 0.44763919711112976, "epoch": 2.569593147751606, "grad_norm": 24.805353217910756, "learning_rate": 5.8487004826128724e-08, "log_diff_policy": 4.245726108551025, "logits": -1.208069920539856, "logp_accuracy": 0.5708333253860474, "loss": 0.1909, "objective": 0.2093334048986435, "ranking_simple": 0.5708333253860474, "rejected_logps": -94.5278549194336, "rejected_rewards": 0.07701942324638367, "reward_accuracy": 0.9041666388511658, "step": 400 }, { "epoch": 2.569593147751606, "eval_chosen_logps": -93.78236389160156, "eval_chosen_rewards": 0.15558815002441406, "eval_log_diff_policy": 0.6469355821609497, "eval_logits": -1.199602484703064, "eval_logp_accuracy": 0.49184781312942505, "eval_loss": 0.4186185598373413, "eval_objective": 0.4326099753379822, "eval_ranking_simple": 0.49184781312942505, "eval_rejected_logps": -94.4292984008789, "eval_rejected_rewards": 0.08501457422971725, "eval_reward_accuracy": 0.635869562625885, "eval_runtime": 252.1041, "eval_samples_per_second": 17.481, "eval_steps_per_second": 0.73, "step": 400 }, { "chosen_logps": -89.90658569335938, "chosen_rewards": 0.3862473666667938, "epoch": 2.601713062098501, "grad_norm": 24.505822084687995, "learning_rate": 4.9982457350954576e-08, "log_diff_policy": 2.7724380493164062, "logits": -1.2620965242385864, "logp_accuracy": 0.5625, "loss": 0.1883, "objective": 0.1867034137248993, "ranking_simple": 0.5625, "rejected_logps": -92.67900848388672, "rejected_rewards": 0.07545500248670578, "reward_accuracy": 0.9291666746139526, "step": 405 }, { "chosen_logps": -89.51068878173828, "chosen_rewards": 0.39734941720962524, "epoch": 2.633832976445396, "grad_norm": 21.878299647612078, "learning_rate": 4.2113336672471245e-08, "log_diff_policy": 3.5083513259887695, "logits": -1.1789612770080566, "logp_accuracy": 0.5791666507720947, "loss": 0.202, "objective": 0.20192624628543854, "ranking_simple": 0.5791666507720947, "rejected_logps": -93.01902770996094, "rejected_rewards": 0.060310911387205124, "reward_accuracy": 0.925000011920929, "step": 410 }, { "chosen_logps": -88.49069213867188, "chosen_rewards": 0.4038696885108948, "epoch": 2.6659528907922914, "grad_norm": 22.981148769436672, "learning_rate": 3.4890754024512246e-08, "log_diff_policy": 4.208141803741455, "logits": -1.2675896883010864, "logp_accuracy": 0.6166666746139526, "loss": 0.1981, "objective": 0.17464695870876312, "ranking_simple": 0.6166666746139526, "rejected_logps": -92.69884490966797, "rejected_rewards": 0.0687737911939621, "reward_accuracy": 0.9375, "step": 415 }, { "chosen_logps": -90.7087173461914, "chosen_rewards": 0.4322812259197235, "epoch": 2.6980728051391862, "grad_norm": 21.571624798366557, "learning_rate": 2.8324907726300362e-08, "log_diff_policy": 2.6288528442382812, "logits": -1.2604050636291504, "logp_accuracy": 0.5541666746139526, "loss": 0.186, "objective": 0.1804119050502777, "ranking_simple": 0.5541666746139526, "rejected_logps": -93.33757019042969, "rejected_rewards": 0.10248282551765442, "reward_accuracy": 0.925000011920929, "step": 420 }, { "chosen_logps": -89.65091705322266, "chosen_rewards": 0.42282071709632874, "epoch": 2.730192719486081, "grad_norm": 21.813226731137505, "learning_rate": 2.2425068782375378e-08, "log_diff_policy": 3.298468589782715, "logits": -1.2308186292648315, "logp_accuracy": 0.5208333134651184, "loss": 0.1969, "objective": 0.19682972133159637, "ranking_simple": 0.5208333134651184, "rejected_logps": -92.94937896728516, "rejected_rewards": 0.1043475791811943, "reward_accuracy": 0.9208333492279053, "step": 425 }, { "chosen_logps": -87.14006805419922, "chosen_rewards": 0.4283243417739868, "epoch": 2.7623126338329764, "grad_norm": 24.3840853234919, "learning_rate": 1.719956779189352e-08, "log_diff_policy": 5.845090389251709, "logits": -1.2345212697982788, "logp_accuracy": 0.574999988079071, "loss": 0.1882, "objective": 0.1731584668159485, "ranking_simple": 0.574999988079071, "rejected_logps": -92.98515319824219, "rejected_rewards": 0.09286811947822571, "reward_accuracy": 0.9375, "step": 430 }, { "chosen_logps": -90.48269653320312, "chosen_rewards": 0.4304358661174774, "epoch": 2.7944325481798717, "grad_norm": 24.61652527478587, "learning_rate": 1.2655783185784252e-08, "log_diff_policy": 2.5907907485961914, "logits": -1.157464861869812, "logp_accuracy": 0.5416666865348816, "loss": 0.1916, "objective": 0.17763207852840424, "ranking_simple": 0.5416666865348816, "rejected_logps": -93.073486328125, "rejected_rewards": 0.08815901726484299, "reward_accuracy": 0.925000011920929, "step": 435 }, { "chosen_logps": -89.2928695678711, "chosen_rewards": 0.4231608510017395, "epoch": 2.8265524625267666, "grad_norm": 21.941346190273187, "learning_rate": 8.800130808372551e-09, "log_diff_policy": 4.692712783813477, "logits": -1.16831636428833, "logp_accuracy": 0.5958333611488342, "loss": 0.2006, "objective": 0.19169165194034576, "ranking_simple": 0.5958333611488342, "rejected_logps": -93.98558044433594, "rejected_rewards": 0.07684887200593948, "reward_accuracy": 0.9291666746139526, "step": 440 }, { "chosen_logps": -90.35305786132812, "chosen_rewards": 0.423807293176651, "epoch": 2.8586723768736615, "grad_norm": 22.787892504020697, "learning_rate": 5.638054858177643e-09, "log_diff_policy": 5.714171886444092, "logits": -1.184297800064087, "logp_accuracy": 0.612500011920929, "loss": 0.1934, "objective": 0.19787992537021637, "ranking_simple": 0.612500011920929, "rejected_logps": -96.06722259521484, "rejected_rewards": 0.06335235387086868, "reward_accuracy": 0.9375, "step": 445 }, { "chosen_logps": -88.91342163085938, "chosen_rewards": 0.4381515383720398, "epoch": 2.890792291220557, "grad_norm": 27.56330730399253, "learning_rate": 3.1740202006804163e-09, "log_diff_policy": 4.057486057281494, "logits": -1.2223776578903198, "logp_accuracy": 0.5958333611488342, "loss": 0.1873, "objective": 0.16403119266033173, "ranking_simple": 0.5958333611488342, "rejected_logps": -92.97090911865234, "rejected_rewards": 0.06596081703901291, "reward_accuracy": 0.9458333253860474, "step": 450 }, { "epoch": 2.890792291220557, "eval_chosen_logps": -93.77582550048828, "eval_chosen_rewards": 0.15624161064624786, "eval_log_diff_policy": 0.6264697909355164, "eval_logits": -1.2007873058319092, "eval_logp_accuracy": 0.49184781312942505, "eval_loss": 0.4184601306915283, "eval_objective": 0.4334569573402405, "eval_ranking_simple": 0.49184781312942505, "eval_rejected_logps": -94.40229797363281, "eval_rejected_rewards": 0.08771461993455887, "eval_reward_accuracy": 0.626358687877655, "eval_runtime": 252.428, "eval_samples_per_second": 17.458, "eval_steps_per_second": 0.729, "step": 450 }, { "chosen_logps": -88.31222534179688, "chosen_rewards": 0.4052502512931824, "epoch": 2.9229122055674517, "grad_norm": 21.03087965897765, "learning_rate": 1.4115060639128818e-09, "log_diff_policy": 3.3760011196136475, "logits": -1.2099932432174683, "logp_accuracy": 0.5375000238418579, "loss": 0.198, "objective": 0.21335595846176147, "ranking_simple": 0.5375000238418579, "rejected_logps": -91.68822479248047, "rejected_rewards": 0.062255825847387314, "reward_accuracy": 0.9166666865348816, "step": 455 }, { "chosen_logps": -89.78349304199219, "chosen_rewards": 0.4195081293582916, "epoch": 2.955032119914347, "grad_norm": 22.336525693869355, "learning_rate": 3.5300112577302253e-10, "log_diff_policy": 4.523636817932129, "logits": -1.1891493797302246, "logp_accuracy": 0.5791666507720947, "loss": 0.1977, "objective": 0.2073889523744583, "ranking_simple": 0.5791666507720947, "rejected_logps": -94.30713653564453, "rejected_rewards": 0.0867224633693695, "reward_accuracy": 0.9291666746139526, "step": 460 }, { "chosen_logps": -87.48312377929688, "chosen_rewards": 0.4234972894191742, "epoch": 2.987152034261242, "grad_norm": 23.026406642027453, "learning_rate": 0.0, "log_diff_policy": 5.529599189758301, "logits": -1.207619071006775, "logp_accuracy": 0.5916666388511658, "loss": 0.1912, "objective": 0.19047409296035767, "ranking_simple": 0.5916666388511658, "rejected_logps": -93.01271057128906, "rejected_rewards": 0.05281840264797211, "reward_accuracy": 0.9333333373069763, "step": 465 }, { "epoch": 2.987152034261242, "step": 465, "total_flos": 0.0, "train_loss": 0.27528571787700856, "train_runtime": 19429.5857, "train_samples_per_second": 6.92, "train_steps_per_second": 0.024 } ], "logging_steps": 5, "max_steps": 465, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }