{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997803645947727, "eval_steps": 500, "global_step": 2276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 11.643586425942758, "learning_rate": 5.797101449275363e-07, "loss": 2.4907, "step": 1 }, { "epoch": 0.0, "grad_norm": 12.582549687588164, "learning_rate": 1.1594202898550726e-06, "loss": 2.7263, "step": 2 }, { "epoch": 0.0, "grad_norm": 12.13022524530988, "learning_rate": 1.7391304347826088e-06, "loss": 2.6405, "step": 3 }, { "epoch": 0.0, "grad_norm": 12.537529111849302, "learning_rate": 2.3188405797101453e-06, "loss": 2.7744, "step": 4 }, { "epoch": 0.0, "grad_norm": 11.91565755203887, "learning_rate": 2.8985507246376816e-06, "loss": 2.769, "step": 5 }, { "epoch": 0.0, "grad_norm": 10.825594519660413, "learning_rate": 3.4782608695652175e-06, "loss": 2.8101, "step": 6 }, { "epoch": 0.0, "grad_norm": 7.838612610420928, "learning_rate": 4.057971014492754e-06, "loss": 2.5242, "step": 7 }, { "epoch": 0.0, "grad_norm": 7.1332678759177135, "learning_rate": 4.637681159420291e-06, "loss": 2.3856, "step": 8 }, { "epoch": 0.0, "grad_norm": 6.5452520502922935, "learning_rate": 5.2173913043478265e-06, "loss": 2.1936, "step": 9 }, { "epoch": 0.0, "grad_norm": 6.974625236260318, "learning_rate": 5.797101449275363e-06, "loss": 2.0783, "step": 10 }, { "epoch": 0.0, "grad_norm": 6.214005236880991, "learning_rate": 6.376811594202898e-06, "loss": 2.1768, "step": 11 }, { "epoch": 0.01, "grad_norm": 4.14608784314517, "learning_rate": 6.956521739130435e-06, "loss": 2.113, "step": 12 }, { "epoch": 0.01, "grad_norm": 4.255599428442641, "learning_rate": 7.536231884057972e-06, "loss": 1.8455, "step": 13 }, { "epoch": 0.01, "grad_norm": 3.354503559387566, "learning_rate": 8.115942028985508e-06, "loss": 1.8267, "step": 14 }, { "epoch": 0.01, "grad_norm": 3.3810001302119415, "learning_rate": 8.695652173913044e-06, "loss": 1.8159, "step": 15 }, { "epoch": 0.01, "grad_norm": 2.807560376463269, "learning_rate": 9.275362318840581e-06, "loss": 1.6351, "step": 16 }, { "epoch": 0.01, "grad_norm": 3.872593901784333, "learning_rate": 9.855072463768118e-06, "loss": 1.7424, "step": 17 }, { "epoch": 0.01, "grad_norm": 2.8799533466692995, "learning_rate": 1.0434782608695653e-05, "loss": 1.5994, "step": 18 }, { "epoch": 0.01, "grad_norm": 2.5389160113992455, "learning_rate": 1.101449275362319e-05, "loss": 1.5188, "step": 19 }, { "epoch": 0.01, "grad_norm": 3.855058116746892, "learning_rate": 1.1594202898550726e-05, "loss": 1.6548, "step": 20 }, { "epoch": 0.01, "grad_norm": 3.5436634194429146, "learning_rate": 1.2173913043478263e-05, "loss": 1.7053, "step": 21 }, { "epoch": 0.01, "grad_norm": 2.358383374531348, "learning_rate": 1.2753623188405797e-05, "loss": 1.4753, "step": 22 }, { "epoch": 0.01, "grad_norm": 2.421991259337828, "learning_rate": 1.3333333333333333e-05, "loss": 1.522, "step": 23 }, { "epoch": 0.01, "grad_norm": 2.3307991798725456, "learning_rate": 1.391304347826087e-05, "loss": 1.5614, "step": 24 }, { "epoch": 0.01, "grad_norm": 1.7668583031247647, "learning_rate": 1.4492753623188407e-05, "loss": 1.4325, "step": 25 }, { "epoch": 0.01, "grad_norm": 1.973391737599878, "learning_rate": 1.5072463768115944e-05, "loss": 1.5118, "step": 26 }, { "epoch": 0.01, "grad_norm": 1.9894867785393882, "learning_rate": 1.565217391304348e-05, "loss": 1.3893, "step": 27 }, { "epoch": 0.01, "grad_norm": 1.7781160458304106, "learning_rate": 1.6231884057971015e-05, "loss": 1.3143, "step": 28 }, { "epoch": 0.01, "grad_norm": 1.9647883564779547, "learning_rate": 1.681159420289855e-05, "loss": 1.2442, "step": 29 }, { "epoch": 0.01, "grad_norm": 1.7723671123798062, "learning_rate": 1.739130434782609e-05, "loss": 1.2962, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.3308433689432833, "learning_rate": 1.7971014492753624e-05, "loss": 1.2922, "step": 31 }, { "epoch": 0.01, "grad_norm": 1.7216931200287824, "learning_rate": 1.8550724637681162e-05, "loss": 1.2208, "step": 32 }, { "epoch": 0.01, "grad_norm": 1.6221781952572192, "learning_rate": 1.9130434782608697e-05, "loss": 1.2673, "step": 33 }, { "epoch": 0.01, "grad_norm": 1.7879143908143995, "learning_rate": 1.9710144927536236e-05, "loss": 1.2991, "step": 34 }, { "epoch": 0.02, "grad_norm": 1.765667805110619, "learning_rate": 2.028985507246377e-05, "loss": 1.2743, "step": 35 }, { "epoch": 0.02, "grad_norm": 1.8259761603429736, "learning_rate": 2.0869565217391306e-05, "loss": 1.3467, "step": 36 }, { "epoch": 0.02, "grad_norm": 2.0685967011138566, "learning_rate": 2.1449275362318844e-05, "loss": 1.2503, "step": 37 }, { "epoch": 0.02, "grad_norm": 1.7059646870937393, "learning_rate": 2.202898550724638e-05, "loss": 1.3001, "step": 38 }, { "epoch": 0.02, "grad_norm": 2.0942894539570136, "learning_rate": 2.2608695652173914e-05, "loss": 1.2199, "step": 39 }, { "epoch": 0.02, "grad_norm": 1.7040157263706177, "learning_rate": 2.3188405797101453e-05, "loss": 1.1913, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.7267519471502861, "learning_rate": 2.3768115942028988e-05, "loss": 1.2496, "step": 41 }, { "epoch": 0.02, "grad_norm": 1.628832918341474, "learning_rate": 2.4347826086956526e-05, "loss": 1.2608, "step": 42 }, { "epoch": 0.02, "grad_norm": 1.3940197944385069, "learning_rate": 2.4927536231884058e-05, "loss": 1.1285, "step": 43 }, { "epoch": 0.02, "grad_norm": 1.649760072777097, "learning_rate": 2.5507246376811593e-05, "loss": 1.28, "step": 44 }, { "epoch": 0.02, "grad_norm": 1.7809285325538966, "learning_rate": 2.608695652173913e-05, "loss": 1.1782, "step": 45 }, { "epoch": 0.02, "grad_norm": 1.7290156271665695, "learning_rate": 2.6666666666666667e-05, "loss": 1.1365, "step": 46 }, { "epoch": 0.02, "grad_norm": 1.5915233540884302, "learning_rate": 2.7246376811594205e-05, "loss": 1.1961, "step": 47 }, { "epoch": 0.02, "grad_norm": 1.6667204927653672, "learning_rate": 2.782608695652174e-05, "loss": 1.0774, "step": 48 }, { "epoch": 0.02, "grad_norm": 1.749286846898628, "learning_rate": 2.840579710144928e-05, "loss": 1.1676, "step": 49 }, { "epoch": 0.02, "grad_norm": 1.5422308980445505, "learning_rate": 2.8985507246376814e-05, "loss": 1.124, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.989403308759273, "learning_rate": 2.956521739130435e-05, "loss": 1.1377, "step": 51 }, { "epoch": 0.02, "grad_norm": 1.4578673390237324, "learning_rate": 3.0144927536231887e-05, "loss": 0.9924, "step": 52 }, { "epoch": 0.02, "grad_norm": 2.732200499068133, "learning_rate": 3.072463768115942e-05, "loss": 1.0985, "step": 53 }, { "epoch": 0.02, "grad_norm": 1.8219101733939524, "learning_rate": 3.130434782608696e-05, "loss": 1.0965, "step": 54 }, { "epoch": 0.02, "grad_norm": 1.8236530088344272, "learning_rate": 3.188405797101449e-05, "loss": 1.0629, "step": 55 }, { "epoch": 0.02, "grad_norm": 1.8371106234754488, "learning_rate": 3.246376811594203e-05, "loss": 1.0983, "step": 56 }, { "epoch": 0.03, "grad_norm": 1.607395324858573, "learning_rate": 3.304347826086957e-05, "loss": 1.0815, "step": 57 }, { "epoch": 0.03, "grad_norm": 1.4931649407034289, "learning_rate": 3.36231884057971e-05, "loss": 1.042, "step": 58 }, { "epoch": 0.03, "grad_norm": 1.696013204734608, "learning_rate": 3.420289855072464e-05, "loss": 1.0488, "step": 59 }, { "epoch": 0.03, "grad_norm": 1.6631481542054256, "learning_rate": 3.478260869565218e-05, "loss": 1.0179, "step": 60 }, { "epoch": 0.03, "grad_norm": 1.5781069461809598, "learning_rate": 3.5362318840579716e-05, "loss": 1.0055, "step": 61 }, { "epoch": 0.03, "grad_norm": 1.9074186901196404, "learning_rate": 3.594202898550725e-05, "loss": 1.1451, "step": 62 }, { "epoch": 0.03, "grad_norm": 1.4542952716581865, "learning_rate": 3.6521739130434786e-05, "loss": 1.0055, "step": 63 }, { "epoch": 0.03, "grad_norm": 1.5741038694657559, "learning_rate": 3.7101449275362325e-05, "loss": 0.9604, "step": 64 }, { "epoch": 0.03, "grad_norm": 1.7502101363312144, "learning_rate": 3.7681159420289856e-05, "loss": 1.1369, "step": 65 }, { "epoch": 0.03, "grad_norm": 1.6359380712936265, "learning_rate": 3.8260869565217395e-05, "loss": 1.0084, "step": 66 }, { "epoch": 0.03, "grad_norm": 1.4785624689419108, "learning_rate": 3.884057971014493e-05, "loss": 0.9599, "step": 67 }, { "epoch": 0.03, "grad_norm": 1.5994874490887636, "learning_rate": 3.942028985507247e-05, "loss": 0.9929, "step": 68 }, { "epoch": 0.03, "grad_norm": 1.5869945893030168, "learning_rate": 4e-05, "loss": 0.9821, "step": 69 }, { "epoch": 0.03, "grad_norm": 1.6908520502013669, "learning_rate": 3.9999979737407726e-05, "loss": 1.0395, "step": 70 }, { "epoch": 0.03, "grad_norm": 1.827483105248714, "learning_rate": 3.999991894967193e-05, "loss": 0.95, "step": 71 }, { "epoch": 0.03, "grad_norm": 1.5326032303210693, "learning_rate": 3.999981763691582e-05, "loss": 0.9104, "step": 72 }, { "epoch": 0.03, "grad_norm": 1.7735780500601463, "learning_rate": 3.999967579934466e-05, "loss": 1.0167, "step": 73 }, { "epoch": 0.03, "grad_norm": 1.7791213316431005, "learning_rate": 3.999949343724586e-05, "loss": 0.9972, "step": 74 }, { "epoch": 0.03, "grad_norm": 1.7105565452607236, "learning_rate": 3.9999270550988915e-05, "loss": 0.9691, "step": 75 }, { "epoch": 0.03, "grad_norm": 1.592582050287772, "learning_rate": 3.999900714102548e-05, "loss": 1.0106, "step": 76 }, { "epoch": 0.03, "grad_norm": 2.0496283496685774, "learning_rate": 3.999870320788925e-05, "loss": 0.9734, "step": 77 }, { "epoch": 0.03, "grad_norm": 1.7495316151097917, "learning_rate": 3.999835875219611e-05, "loss": 1.0278, "step": 78 }, { "epoch": 0.03, "grad_norm": 1.5694329374823623, "learning_rate": 3.9997973774644006e-05, "loss": 0.9838, "step": 79 }, { "epoch": 0.04, "grad_norm": 1.8093647641961392, "learning_rate": 3.999754827601299e-05, "loss": 0.8479, "step": 80 }, { "epoch": 0.04, "grad_norm": 1.5546666436258079, "learning_rate": 3.999708225716525e-05, "loss": 0.8341, "step": 81 }, { "epoch": 0.04, "grad_norm": 1.7868753582265895, "learning_rate": 3.999657571904505e-05, "loss": 0.9546, "step": 82 }, { "epoch": 0.04, "grad_norm": 2.1873676804986264, "learning_rate": 3.9996028662678765e-05, "loss": 0.9807, "step": 83 }, { "epoch": 0.04, "grad_norm": 1.4470075410647711, "learning_rate": 3.999544108917489e-05, "loss": 0.9186, "step": 84 }, { "epoch": 0.04, "grad_norm": 1.5826707842424323, "learning_rate": 3.999481299972398e-05, "loss": 0.8792, "step": 85 }, { "epoch": 0.04, "grad_norm": 1.5520825823412963, "learning_rate": 3.999414439559872e-05, "loss": 0.8257, "step": 86 }, { "epoch": 0.04, "grad_norm": 1.6944116405098015, "learning_rate": 3.9993435278153875e-05, "loss": 0.9179, "step": 87 }, { "epoch": 0.04, "grad_norm": 1.4839096142884463, "learning_rate": 3.9992685648826295e-05, "loss": 0.8732, "step": 88 }, { "epoch": 0.04, "grad_norm": 1.6980176868303871, "learning_rate": 3.999189550913492e-05, "loss": 0.8802, "step": 89 }, { "epoch": 0.04, "grad_norm": 1.5237780068180864, "learning_rate": 3.9991064860680795e-05, "loss": 0.8969, "step": 90 }, { "epoch": 0.04, "grad_norm": 1.716642926184679, "learning_rate": 3.999019370514701e-05, "loss": 0.9384, "step": 91 }, { "epoch": 0.04, "grad_norm": 1.8095884286186736, "learning_rate": 3.9989282044298764e-05, "loss": 0.9436, "step": 92 }, { "epoch": 0.04, "grad_norm": 1.5327871742450585, "learning_rate": 3.998832987998332e-05, "loss": 0.8673, "step": 93 }, { "epoch": 0.04, "grad_norm": 1.7960201717934612, "learning_rate": 3.9987337214129994e-05, "loss": 0.8478, "step": 94 }, { "epoch": 0.04, "grad_norm": 1.6147399662294912, "learning_rate": 3.998630404875019e-05, "loss": 0.8802, "step": 95 }, { "epoch": 0.04, "grad_norm": 1.885572002046685, "learning_rate": 3.998523038593738e-05, "loss": 0.8739, "step": 96 }, { "epoch": 0.04, "grad_norm": 1.6785596742769209, "learning_rate": 3.9984116227867075e-05, "loss": 0.9109, "step": 97 }, { "epoch": 0.04, "grad_norm": 1.7484381381690777, "learning_rate": 3.998296157679686e-05, "loss": 0.8601, "step": 98 }, { "epoch": 0.04, "grad_norm": 1.476019380984956, "learning_rate": 3.9981766435066335e-05, "loss": 0.8548, "step": 99 }, { "epoch": 0.04, "grad_norm": 1.6881432190427084, "learning_rate": 3.998053080509718e-05, "loss": 0.8578, "step": 100 }, { "epoch": 0.04, "grad_norm": 2.071981311676995, "learning_rate": 3.99792546893931e-05, "loss": 0.8576, "step": 101 }, { "epoch": 0.04, "grad_norm": 1.6036226002256477, "learning_rate": 3.997793809053984e-05, "loss": 0.8477, "step": 102 }, { "epoch": 0.05, "grad_norm": 1.6181401934155382, "learning_rate": 3.997658101120517e-05, "loss": 0.7691, "step": 103 }, { "epoch": 0.05, "grad_norm": 1.6430742419482551, "learning_rate": 3.997518345413887e-05, "loss": 0.8982, "step": 104 }, { "epoch": 0.05, "grad_norm": 1.5890712501487516, "learning_rate": 3.997374542217277e-05, "loss": 0.7554, "step": 105 }, { "epoch": 0.05, "grad_norm": 1.6477850757373529, "learning_rate": 3.99722669182207e-05, "loss": 0.8222, "step": 106 }, { "epoch": 0.05, "grad_norm": 1.454441171803851, "learning_rate": 3.997074794527847e-05, "loss": 0.8149, "step": 107 }, { "epoch": 0.05, "grad_norm": 1.674755716302993, "learning_rate": 3.9969188506423934e-05, "loss": 0.8274, "step": 108 }, { "epoch": 0.05, "grad_norm": 1.660644890174726, "learning_rate": 3.9967588604816904e-05, "loss": 0.824, "step": 109 }, { "epoch": 0.05, "grad_norm": 1.6004205121337858, "learning_rate": 3.9965948243699206e-05, "loss": 0.8572, "step": 110 }, { "epoch": 0.05, "grad_norm": 1.682152292562396, "learning_rate": 3.996426742639463e-05, "loss": 0.8007, "step": 111 }, { "epoch": 0.05, "grad_norm": 1.5421096921267532, "learning_rate": 3.9962546156308954e-05, "loss": 0.8316, "step": 112 }, { "epoch": 0.05, "grad_norm": 1.6081055754975826, "learning_rate": 3.9960784436929906e-05, "loss": 0.816, "step": 113 }, { "epoch": 0.05, "grad_norm": 1.6461503492128977, "learning_rate": 3.9958982271827203e-05, "loss": 0.7729, "step": 114 }, { "epoch": 0.05, "grad_norm": 1.6626285589317855, "learning_rate": 3.995713966465249e-05, "loss": 0.8064, "step": 115 }, { "epoch": 0.05, "grad_norm": 1.5137577943105085, "learning_rate": 3.995525661913936e-05, "loss": 0.7203, "step": 116 }, { "epoch": 0.05, "grad_norm": 1.7525616017614758, "learning_rate": 3.995333313910337e-05, "loss": 0.7761, "step": 117 }, { "epoch": 0.05, "grad_norm": 1.7482896348542674, "learning_rate": 3.995136922844197e-05, "loss": 0.7725, "step": 118 }, { "epoch": 0.05, "grad_norm": 1.7252256301155913, "learning_rate": 3.994936489113455e-05, "loss": 0.7668, "step": 119 }, { "epoch": 0.05, "grad_norm": 1.696675047500886, "learning_rate": 3.994732013124243e-05, "loss": 0.8148, "step": 120 }, { "epoch": 0.05, "grad_norm": 1.5312765469488108, "learning_rate": 3.994523495290883e-05, "loss": 0.7378, "step": 121 }, { "epoch": 0.05, "grad_norm": 1.553461022361036, "learning_rate": 3.994310936035884e-05, "loss": 0.7919, "step": 122 }, { "epoch": 0.05, "grad_norm": 1.4289284447332717, "learning_rate": 3.994094335789948e-05, "loss": 0.7152, "step": 123 }, { "epoch": 0.05, "grad_norm": 2.0729613435106438, "learning_rate": 3.993873694991963e-05, "loss": 0.7881, "step": 124 }, { "epoch": 0.05, "grad_norm": 1.4427235277755215, "learning_rate": 3.9936490140890025e-05, "loss": 0.6667, "step": 125 }, { "epoch": 0.06, "grad_norm": 1.7236037643621684, "learning_rate": 3.993420293536331e-05, "loss": 0.7529, "step": 126 }, { "epoch": 0.06, "grad_norm": 1.57506793950399, "learning_rate": 3.993187533797394e-05, "loss": 0.8098, "step": 127 }, { "epoch": 0.06, "grad_norm": 1.9910058799476726, "learning_rate": 3.9929507353438236e-05, "loss": 0.8476, "step": 128 }, { "epoch": 0.06, "grad_norm": 1.5600141969670562, "learning_rate": 3.9927098986554345e-05, "loss": 0.7001, "step": 129 }, { "epoch": 0.06, "grad_norm": 1.733854335954115, "learning_rate": 3.9924650242202245e-05, "loss": 0.7535, "step": 130 }, { "epoch": 0.06, "grad_norm": 1.5496869970968161, "learning_rate": 3.9922161125343734e-05, "loss": 0.6662, "step": 131 }, { "epoch": 0.06, "grad_norm": 1.629556942151152, "learning_rate": 3.991963164102239e-05, "loss": 0.6608, "step": 132 }, { "epoch": 0.06, "grad_norm": 1.4499389832911362, "learning_rate": 3.9917061794363616e-05, "loss": 0.682, "step": 133 }, { "epoch": 0.06, "grad_norm": 1.7214026355315923, "learning_rate": 3.991445159057459e-05, "loss": 0.7314, "step": 134 }, { "epoch": 0.06, "grad_norm": 1.6929341170531762, "learning_rate": 3.991180103494426e-05, "loss": 0.7126, "step": 135 }, { "epoch": 0.06, "grad_norm": 1.5596277158297274, "learning_rate": 3.990911013284333e-05, "loss": 0.6811, "step": 136 }, { "epoch": 0.06, "grad_norm": 1.8330239410585047, "learning_rate": 3.9906378889724276e-05, "loss": 0.7415, "step": 137 }, { "epoch": 0.06, "grad_norm": 1.5514819966586175, "learning_rate": 3.99036073111213e-05, "loss": 0.6935, "step": 138 }, { "epoch": 0.06, "grad_norm": 1.660685735301817, "learning_rate": 3.9900795402650334e-05, "loss": 0.7095, "step": 139 }, { "epoch": 0.06, "grad_norm": 1.9265088298960584, "learning_rate": 3.989794317000904e-05, "loss": 0.8017, "step": 140 }, { "epoch": 0.06, "grad_norm": 1.652297215449007, "learning_rate": 3.989505061897679e-05, "loss": 0.6885, "step": 141 }, { "epoch": 0.06, "grad_norm": 1.8099428751661601, "learning_rate": 3.9892117755414615e-05, "loss": 0.6002, "step": 142 }, { "epoch": 0.06, "grad_norm": 1.7347345065491377, "learning_rate": 3.9889144585265287e-05, "loss": 0.6209, "step": 143 }, { "epoch": 0.06, "grad_norm": 1.8864377525326135, "learning_rate": 3.988613111455319e-05, "loss": 0.6692, "step": 144 }, { "epoch": 0.06, "grad_norm": 2.1766207871365357, "learning_rate": 3.988307734938443e-05, "loss": 0.752, "step": 145 }, { "epoch": 0.06, "grad_norm": 1.8199379335027797, "learning_rate": 3.9879983295946696e-05, "loss": 0.6722, "step": 146 }, { "epoch": 0.06, "grad_norm": 1.6455588131933245, "learning_rate": 3.987684896050936e-05, "loss": 0.6409, "step": 147 }, { "epoch": 0.07, "grad_norm": 1.9482356403594565, "learning_rate": 3.987367434942339e-05, "loss": 0.724, "step": 148 }, { "epoch": 0.07, "grad_norm": 1.6718755704219237, "learning_rate": 3.987045946912138e-05, "loss": 0.6621, "step": 149 }, { "epoch": 0.07, "grad_norm": 1.5506080511505307, "learning_rate": 3.9867204326117505e-05, "loss": 0.6559, "step": 150 }, { "epoch": 0.07, "grad_norm": 1.687721873638236, "learning_rate": 3.986390892700753e-05, "loss": 0.7215, "step": 151 }, { "epoch": 0.07, "grad_norm": 1.4547552172902753, "learning_rate": 3.9860573278468785e-05, "loss": 0.6295, "step": 152 }, { "epoch": 0.07, "grad_norm": 1.4227693700254418, "learning_rate": 3.9857197387260164e-05, "loss": 0.6362, "step": 153 }, { "epoch": 0.07, "grad_norm": 1.5663651534703993, "learning_rate": 3.985378126022209e-05, "loss": 0.6105, "step": 154 }, { "epoch": 0.07, "grad_norm": 1.8088687168645143, "learning_rate": 3.985032490427653e-05, "loss": 0.6775, "step": 155 }, { "epoch": 0.07, "grad_norm": 1.5891609694017539, "learning_rate": 3.984682832642695e-05, "loss": 0.6446, "step": 156 }, { "epoch": 0.07, "grad_norm": 1.7984891855881397, "learning_rate": 3.984329153375833e-05, "loss": 0.6963, "step": 157 }, { "epoch": 0.07, "grad_norm": 1.4446448356241686, "learning_rate": 3.983971453343713e-05, "loss": 0.5809, "step": 158 }, { "epoch": 0.07, "grad_norm": 1.9439171378839017, "learning_rate": 3.9836097332711263e-05, "loss": 0.6097, "step": 159 }, { "epoch": 0.07, "grad_norm": 1.7622528281620065, "learning_rate": 3.983243993891013e-05, "loss": 0.5964, "step": 160 }, { "epoch": 0.07, "grad_norm": 1.4720413178833622, "learning_rate": 3.982874235944456e-05, "loss": 0.6115, "step": 161 }, { "epoch": 0.07, "grad_norm": 1.5517645728298304, "learning_rate": 3.98250046018068e-05, "loss": 0.5692, "step": 162 }, { "epoch": 0.07, "grad_norm": 1.4860092803593412, "learning_rate": 3.982122667357053e-05, "loss": 0.6125, "step": 163 }, { "epoch": 0.07, "grad_norm": 1.4927280106171485, "learning_rate": 3.9817408582390796e-05, "loss": 0.6503, "step": 164 }, { "epoch": 0.07, "grad_norm": 1.490621334047428, "learning_rate": 3.981355033600405e-05, "loss": 0.5934, "step": 165 }, { "epoch": 0.07, "grad_norm": 1.4252651168752246, "learning_rate": 3.980965194222809e-05, "loss": 0.5791, "step": 166 }, { "epoch": 0.07, "grad_norm": 1.4824063547802098, "learning_rate": 3.980571340896208e-05, "loss": 0.5758, "step": 167 }, { "epoch": 0.07, "grad_norm": 1.4386123002721702, "learning_rate": 3.980173474418651e-05, "loss": 0.5562, "step": 168 }, { "epoch": 0.07, "grad_norm": 1.438472584643176, "learning_rate": 3.979771595596318e-05, "loss": 0.5503, "step": 169 }, { "epoch": 0.07, "grad_norm": 1.4746449447282879, "learning_rate": 3.97936570524352e-05, "loss": 0.6175, "step": 170 }, { "epoch": 0.08, "grad_norm": 1.4359533033030594, "learning_rate": 3.978955804182696e-05, "loss": 0.5663, "step": 171 }, { "epoch": 0.08, "grad_norm": 1.4310939520510761, "learning_rate": 3.978541893244413e-05, "loss": 0.5633, "step": 172 }, { "epoch": 0.08, "grad_norm": 1.476671487328186, "learning_rate": 3.97812397326736e-05, "loss": 0.6254, "step": 173 }, { "epoch": 0.08, "grad_norm": 1.473386224398599, "learning_rate": 3.977702045098352e-05, "loss": 0.5879, "step": 174 }, { "epoch": 0.08, "grad_norm": 1.4296032823449158, "learning_rate": 3.977276109592325e-05, "loss": 0.5742, "step": 175 }, { "epoch": 0.08, "grad_norm": 1.4296142059049426, "learning_rate": 3.976846167612334e-05, "loss": 0.623, "step": 176 }, { "epoch": 0.08, "grad_norm": 1.5873249167665515, "learning_rate": 3.976412220029554e-05, "loss": 0.5953, "step": 177 }, { "epoch": 0.08, "grad_norm": 1.5672553370085667, "learning_rate": 3.975974267723275e-05, "loss": 0.5937, "step": 178 }, { "epoch": 0.08, "grad_norm": 1.482896731268462, "learning_rate": 3.975532311580901e-05, "loss": 0.6129, "step": 179 }, { "epoch": 0.08, "grad_norm": 1.5258540622977488, "learning_rate": 3.9750863524979506e-05, "loss": 0.5779, "step": 180 }, { "epoch": 0.08, "grad_norm": 1.5583706268227155, "learning_rate": 3.974636391378053e-05, "loss": 0.6064, "step": 181 }, { "epoch": 0.08, "grad_norm": 1.5734345634001192, "learning_rate": 3.9741824291329446e-05, "loss": 0.519, "step": 182 }, { "epoch": 0.08, "grad_norm": 1.4985175754762539, "learning_rate": 3.973724466682471e-05, "loss": 0.5779, "step": 183 }, { "epoch": 0.08, "grad_norm": 1.3872915884669936, "learning_rate": 3.9732625049545845e-05, "loss": 0.5299, "step": 184 }, { "epoch": 0.08, "grad_norm": 1.491908901230542, "learning_rate": 3.972796544885337e-05, "loss": 0.5761, "step": 185 }, { "epoch": 0.08, "grad_norm": 1.5028352168621608, "learning_rate": 3.972326587418886e-05, "loss": 0.5707, "step": 186 }, { "epoch": 0.08, "grad_norm": 1.4173322778203221, "learning_rate": 3.971852633507487e-05, "loss": 0.5751, "step": 187 }, { "epoch": 0.08, "grad_norm": 1.3527899664334746, "learning_rate": 3.971374684111492e-05, "loss": 0.4824, "step": 188 }, { "epoch": 0.08, "grad_norm": 1.5635438102827768, "learning_rate": 3.970892740199352e-05, "loss": 0.5105, "step": 189 }, { "epoch": 0.08, "grad_norm": 1.528064918507103, "learning_rate": 3.97040680274761e-05, "loss": 0.5707, "step": 190 }, { "epoch": 0.08, "grad_norm": 1.523451741469892, "learning_rate": 3.9699168727409e-05, "loss": 0.5183, "step": 191 }, { "epoch": 0.08, "grad_norm": 1.5678279826825616, "learning_rate": 3.969422951171949e-05, "loss": 0.593, "step": 192 }, { "epoch": 0.08, "grad_norm": 1.694579638453741, "learning_rate": 3.968925039041569e-05, "loss": 0.5474, "step": 193 }, { "epoch": 0.09, "grad_norm": 1.5893592939902124, "learning_rate": 3.968423137358659e-05, "loss": 0.5652, "step": 194 }, { "epoch": 0.09, "grad_norm": 1.4373449158977396, "learning_rate": 3.9679172471402026e-05, "loss": 0.4914, "step": 195 }, { "epoch": 0.09, "grad_norm": 1.956987513740639, "learning_rate": 3.967407369411265e-05, "loss": 0.5752, "step": 196 }, { "epoch": 0.09, "grad_norm": 1.7723742419181443, "learning_rate": 3.966893505204989e-05, "loss": 0.5152, "step": 197 }, { "epoch": 0.09, "grad_norm": 1.4975371805456783, "learning_rate": 3.966375655562598e-05, "loss": 0.5415, "step": 198 }, { "epoch": 0.09, "grad_norm": 1.8531339391071417, "learning_rate": 3.9658538215333885e-05, "loss": 0.5603, "step": 199 }, { "epoch": 0.09, "grad_norm": 1.4906133367374061, "learning_rate": 3.965328004174733e-05, "loss": 0.5297, "step": 200 }, { "epoch": 0.09, "grad_norm": 1.8560693029956028, "learning_rate": 3.964798204552072e-05, "loss": 0.5125, "step": 201 }, { "epoch": 0.09, "grad_norm": 1.5240262972247456, "learning_rate": 3.964264423738918e-05, "loss": 0.5769, "step": 202 }, { "epoch": 0.09, "grad_norm": 1.4540963156805944, "learning_rate": 3.963726662816849e-05, "loss": 0.4629, "step": 203 }, { "epoch": 0.09, "grad_norm": 1.7843912768581929, "learning_rate": 3.963184922875509e-05, "loss": 0.5496, "step": 204 }, { "epoch": 0.09, "grad_norm": 1.5042545064003081, "learning_rate": 3.962639205012602e-05, "loss": 0.4965, "step": 205 }, { "epoch": 0.09, "grad_norm": 1.757512724004188, "learning_rate": 3.9620895103338946e-05, "loss": 0.5194, "step": 206 }, { "epoch": 0.09, "grad_norm": 1.7409125482636698, "learning_rate": 3.961535839953211e-05, "loss": 0.5833, "step": 207 }, { "epoch": 0.09, "grad_norm": 1.498008996119053, "learning_rate": 3.9609781949924305e-05, "loss": 0.4744, "step": 208 }, { "epoch": 0.09, "grad_norm": 1.4127462172490506, "learning_rate": 3.960416576581486e-05, "loss": 0.4871, "step": 209 }, { "epoch": 0.09, "grad_norm": 1.3262246841905174, "learning_rate": 3.959850985858364e-05, "loss": 0.4254, "step": 210 }, { "epoch": 0.09, "grad_norm": 1.564374485968316, "learning_rate": 3.959281423969095e-05, "loss": 0.5198, "step": 211 }, { "epoch": 0.09, "grad_norm": 1.4880741005259133, "learning_rate": 3.95870789206776e-05, "loss": 0.5068, "step": 212 }, { "epoch": 0.09, "grad_norm": 1.3876435111120127, "learning_rate": 3.958130391316484e-05, "loss": 0.4143, "step": 213 }, { "epoch": 0.09, "grad_norm": 1.4654552759994384, "learning_rate": 3.957548922885434e-05, "loss": 0.4459, "step": 214 }, { "epoch": 0.09, "grad_norm": 1.351469312332851, "learning_rate": 3.9569634879528134e-05, "loss": 0.5134, "step": 215 }, { "epoch": 0.09, "grad_norm": 1.6535047415869237, "learning_rate": 3.956374087704867e-05, "loss": 0.4659, "step": 216 }, { "epoch": 0.1, "grad_norm": 1.5877060073190123, "learning_rate": 3.9557807233358724e-05, "loss": 0.513, "step": 217 }, { "epoch": 0.1, "grad_norm": 2.0478279283403125, "learning_rate": 3.955183396048138e-05, "loss": 0.4653, "step": 218 }, { "epoch": 0.1, "grad_norm": 1.2808846208574596, "learning_rate": 3.9545821070520054e-05, "loss": 0.4143, "step": 219 }, { "epoch": 0.1, "grad_norm": 1.699522986525668, "learning_rate": 3.953976857565842e-05, "loss": 0.4228, "step": 220 }, { "epoch": 0.1, "grad_norm": 1.5302320716978073, "learning_rate": 3.953367648816039e-05, "loss": 0.4348, "step": 221 }, { "epoch": 0.1, "grad_norm": 1.8357551646491785, "learning_rate": 3.952754482037012e-05, "loss": 0.4533, "step": 222 }, { "epoch": 0.1, "grad_norm": 1.5728923804153456, "learning_rate": 3.9521373584711956e-05, "loss": 0.4577, "step": 223 }, { "epoch": 0.1, "grad_norm": 1.4386495057644866, "learning_rate": 3.9515162793690424e-05, "loss": 0.4417, "step": 224 }, { "epoch": 0.1, "grad_norm": 1.530220697853749, "learning_rate": 3.95089124598902e-05, "loss": 0.4666, "step": 225 }, { "epoch": 0.1, "grad_norm": 1.4379726959575616, "learning_rate": 3.9502622595976065e-05, "loss": 0.455, "step": 226 }, { "epoch": 0.1, "grad_norm": 1.530897372378452, "learning_rate": 3.949629321469293e-05, "loss": 0.4277, "step": 227 }, { "epoch": 0.1, "grad_norm": 1.4213464613952935, "learning_rate": 3.9489924328865755e-05, "loss": 0.4475, "step": 228 }, { "epoch": 0.1, "grad_norm": 1.3898811922824856, "learning_rate": 3.948351595139955e-05, "loss": 0.4526, "step": 229 }, { "epoch": 0.1, "grad_norm": 1.405342826694174, "learning_rate": 3.947706809527937e-05, "loss": 0.4653, "step": 230 }, { "epoch": 0.1, "grad_norm": 1.4465358196508509, "learning_rate": 3.947058077357021e-05, "loss": 0.4325, "step": 231 }, { "epoch": 0.1, "grad_norm": 1.360301348874714, "learning_rate": 3.9464053999417094e-05, "loss": 0.4683, "step": 232 }, { "epoch": 0.1, "grad_norm": 1.3241299849835622, "learning_rate": 3.945748778604494e-05, "loss": 0.4162, "step": 233 }, { "epoch": 0.1, "grad_norm": 1.4071083416277619, "learning_rate": 3.945088214675861e-05, "loss": 0.4076, "step": 234 }, { "epoch": 0.1, "grad_norm": 1.2919904805443891, "learning_rate": 3.944423709494284e-05, "loss": 0.3801, "step": 235 }, { "epoch": 0.1, "grad_norm": 1.3371747628621873, "learning_rate": 3.943755264406221e-05, "loss": 0.4428, "step": 236 }, { "epoch": 0.1, "grad_norm": 1.4199301865031417, "learning_rate": 3.9430828807661174e-05, "loss": 0.4646, "step": 237 }, { "epoch": 0.1, "grad_norm": 1.3140117476706048, "learning_rate": 3.942406559936396e-05, "loss": 0.4486, "step": 238 }, { "epoch": 0.1, "grad_norm": 1.3235671429148934, "learning_rate": 3.941726303287458e-05, "loss": 0.4166, "step": 239 }, { "epoch": 0.11, "grad_norm": 1.224336974102132, "learning_rate": 3.941042112197679e-05, "loss": 0.4063, "step": 240 }, { "epoch": 0.11, "grad_norm": 1.207706250332667, "learning_rate": 3.940353988053409e-05, "loss": 0.3896, "step": 241 }, { "epoch": 0.11, "grad_norm": 1.667011066457244, "learning_rate": 3.939661932248965e-05, "loss": 0.4363, "step": 242 }, { "epoch": 0.11, "grad_norm": 1.3563714443276547, "learning_rate": 3.938965946186631e-05, "loss": 0.3639, "step": 243 }, { "epoch": 0.11, "grad_norm": 1.2505178809246953, "learning_rate": 3.9382660312766566e-05, "loss": 0.4337, "step": 244 }, { "epoch": 0.11, "grad_norm": 1.3015391115283055, "learning_rate": 3.937562188937249e-05, "loss": 0.4025, "step": 245 }, { "epoch": 0.11, "grad_norm": 1.412095530166397, "learning_rate": 3.936854420594577e-05, "loss": 0.417, "step": 246 }, { "epoch": 0.11, "grad_norm": 1.480885469674138, "learning_rate": 3.9361427276827605e-05, "loss": 0.4092, "step": 247 }, { "epoch": 0.11, "grad_norm": 1.3031640209329376, "learning_rate": 3.9354271116438764e-05, "loss": 0.4081, "step": 248 }, { "epoch": 0.11, "grad_norm": 1.165213508687677, "learning_rate": 3.934707573927947e-05, "loss": 0.3516, "step": 249 }, { "epoch": 0.11, "grad_norm": 1.2722910769865, "learning_rate": 3.933984115992943e-05, "loss": 0.4069, "step": 250 }, { "epoch": 0.11, "grad_norm": 1.5458368998826553, "learning_rate": 3.933256739304776e-05, "loss": 0.4006, "step": 251 }, { "epoch": 0.11, "grad_norm": 1.3982579206531864, "learning_rate": 3.932525445337302e-05, "loss": 0.3591, "step": 252 }, { "epoch": 0.11, "grad_norm": 1.378690319066525, "learning_rate": 3.93179023557231e-05, "loss": 0.3864, "step": 253 }, { "epoch": 0.11, "grad_norm": 1.4200735732615941, "learning_rate": 3.9310511114995266e-05, "loss": 0.3732, "step": 254 }, { "epoch": 0.11, "grad_norm": 1.3485622732842326, "learning_rate": 3.9303080746166094e-05, "loss": 0.368, "step": 255 }, { "epoch": 0.11, "grad_norm": 1.4783819382571577, "learning_rate": 3.9295611264291424e-05, "loss": 0.4291, "step": 256 }, { "epoch": 0.11, "grad_norm": 1.322498655985562, "learning_rate": 3.928810268450637e-05, "loss": 0.4094, "step": 257 }, { "epoch": 0.11, "grad_norm": 1.2373705382011844, "learning_rate": 3.928055502202527e-05, "loss": 0.3542, "step": 258 }, { "epoch": 0.11, "grad_norm": 1.290945677370071, "learning_rate": 3.9272968292141624e-05, "loss": 0.3796, "step": 259 }, { "epoch": 0.11, "grad_norm": 1.3079370929248728, "learning_rate": 3.926534251022814e-05, "loss": 0.4196, "step": 260 }, { "epoch": 0.11, "grad_norm": 1.5288517914489537, "learning_rate": 3.92576776917366e-05, "loss": 0.4239, "step": 261 }, { "epoch": 0.12, "grad_norm": 1.3519946603641366, "learning_rate": 3.924997385219793e-05, "loss": 0.4066, "step": 262 }, { "epoch": 0.12, "grad_norm": 1.3436086713412843, "learning_rate": 3.92422310072221e-05, "loss": 0.3813, "step": 263 }, { "epoch": 0.12, "grad_norm": 1.3587019119947774, "learning_rate": 3.923444917249812e-05, "loss": 0.3944, "step": 264 }, { "epoch": 0.12, "grad_norm": 1.1984378620729061, "learning_rate": 3.922662836379402e-05, "loss": 0.3188, "step": 265 }, { "epoch": 0.12, "grad_norm": 1.4651457208281735, "learning_rate": 3.9218768596956767e-05, "loss": 0.4067, "step": 266 }, { "epoch": 0.12, "grad_norm": 1.4348335414134077, "learning_rate": 3.92108698879123e-05, "loss": 0.4437, "step": 267 }, { "epoch": 0.12, "grad_norm": 1.2745509048598174, "learning_rate": 3.920293225266543e-05, "loss": 0.3492, "step": 268 }, { "epoch": 0.12, "grad_norm": 1.354206079740881, "learning_rate": 3.919495570729989e-05, "loss": 0.3766, "step": 269 }, { "epoch": 0.12, "grad_norm": 1.504893981773322, "learning_rate": 3.918694026797822e-05, "loss": 0.4099, "step": 270 }, { "epoch": 0.12, "grad_norm": 1.53755943369549, "learning_rate": 3.917888595094176e-05, "loss": 0.4095, "step": 271 }, { "epoch": 0.12, "grad_norm": 1.3129746169304548, "learning_rate": 3.917079277251067e-05, "loss": 0.3807, "step": 272 }, { "epoch": 0.12, "grad_norm": 1.2599414791413532, "learning_rate": 3.916266074908381e-05, "loss": 0.3736, "step": 273 }, { "epoch": 0.12, "grad_norm": 1.3430217721357323, "learning_rate": 3.915448989713878e-05, "loss": 0.3339, "step": 274 }, { "epoch": 0.12, "grad_norm": 1.535744124974932, "learning_rate": 3.914628023323184e-05, "loss": 0.4252, "step": 275 }, { "epoch": 0.12, "grad_norm": 1.2637973351673997, "learning_rate": 3.9138031773997886e-05, "loss": 0.3344, "step": 276 }, { "epoch": 0.12, "grad_norm": 1.3793033103825154, "learning_rate": 3.912974453615045e-05, "loss": 0.4435, "step": 277 }, { "epoch": 0.12, "grad_norm": 1.4145029973105872, "learning_rate": 3.9121418536481616e-05, "loss": 0.4119, "step": 278 }, { "epoch": 0.12, "grad_norm": 1.3393693534529123, "learning_rate": 3.911305379186201e-05, "loss": 0.3634, "step": 279 }, { "epoch": 0.12, "grad_norm": 1.1899988084674928, "learning_rate": 3.91046503192408e-05, "loss": 0.3564, "step": 280 }, { "epoch": 0.12, "grad_norm": 1.4946243600112579, "learning_rate": 3.9096208135645566e-05, "loss": 0.3717, "step": 281 }, { "epoch": 0.12, "grad_norm": 1.2705807148148822, "learning_rate": 3.908772725818239e-05, "loss": 0.3582, "step": 282 }, { "epoch": 0.12, "grad_norm": 1.5254443131106863, "learning_rate": 3.907920770403571e-05, "loss": 0.351, "step": 283 }, { "epoch": 0.12, "grad_norm": 1.5058121927392691, "learning_rate": 3.9070649490468356e-05, "loss": 0.3745, "step": 284 }, { "epoch": 0.13, "grad_norm": 1.4343497546266086, "learning_rate": 3.906205263482148e-05, "loss": 0.3257, "step": 285 }, { "epoch": 0.13, "grad_norm": 1.366455492514063, "learning_rate": 3.905341715451456e-05, "loss": 0.3248, "step": 286 }, { "epoch": 0.13, "grad_norm": 1.1434263553164885, "learning_rate": 3.90447430670453e-05, "loss": 0.3288, "step": 287 }, { "epoch": 0.13, "grad_norm": 1.5259294522878097, "learning_rate": 3.9036030389989655e-05, "loss": 0.4251, "step": 288 }, { "epoch": 0.13, "grad_norm": 1.253431331264597, "learning_rate": 3.9027279141001774e-05, "loss": 0.3278, "step": 289 }, { "epoch": 0.13, "grad_norm": 1.5103855612171766, "learning_rate": 3.901848933781394e-05, "loss": 0.4149, "step": 290 }, { "epoch": 0.13, "grad_norm": 1.2752271842508422, "learning_rate": 3.9009660998236586e-05, "loss": 0.3146, "step": 291 }, { "epoch": 0.13, "grad_norm": 1.3252211746019849, "learning_rate": 3.9000794140158214e-05, "loss": 0.3679, "step": 292 }, { "epoch": 0.13, "grad_norm": 1.323813812547691, "learning_rate": 3.8991888781545377e-05, "loss": 0.3341, "step": 293 }, { "epoch": 0.13, "grad_norm": 1.4391402961834538, "learning_rate": 3.898294494044263e-05, "loss": 0.3856, "step": 294 }, { "epoch": 0.13, "grad_norm": 1.573579720033174, "learning_rate": 3.8973962634972536e-05, "loss": 0.3501, "step": 295 }, { "epoch": 0.13, "grad_norm": 1.2057989552062927, "learning_rate": 3.896494188333555e-05, "loss": 0.3532, "step": 296 }, { "epoch": 0.13, "grad_norm": 1.4378390534305105, "learning_rate": 3.8955882703810066e-05, "loss": 0.3745, "step": 297 }, { "epoch": 0.13, "grad_norm": 1.3295663977946774, "learning_rate": 3.894678511475233e-05, "loss": 0.3682, "step": 298 }, { "epoch": 0.13, "grad_norm": 1.3174730726671662, "learning_rate": 3.893764913459641e-05, "loss": 0.3035, "step": 299 }, { "epoch": 0.13, "grad_norm": 1.4066219261852482, "learning_rate": 3.8928474781854176e-05, "loss": 0.34, "step": 300 }, { "epoch": 0.13, "grad_norm": 1.36352689874441, "learning_rate": 3.891926207511524e-05, "loss": 0.3408, "step": 301 }, { "epoch": 0.13, "grad_norm": 1.3532528312508314, "learning_rate": 3.8910011033046945e-05, "loss": 0.3309, "step": 302 }, { "epoch": 0.13, "grad_norm": 1.2442904730942472, "learning_rate": 3.8900721674394286e-05, "loss": 0.3224, "step": 303 }, { "epoch": 0.13, "grad_norm": 1.3731458039546098, "learning_rate": 3.889139401797992e-05, "loss": 0.3633, "step": 304 }, { "epoch": 0.13, "grad_norm": 1.2868412647392378, "learning_rate": 3.8882028082704095e-05, "loss": 0.3269, "step": 305 }, { "epoch": 0.13, "grad_norm": 1.2552574696822865, "learning_rate": 3.887262388754462e-05, "loss": 0.352, "step": 306 }, { "epoch": 0.13, "grad_norm": 1.1787584195690892, "learning_rate": 3.886318145155684e-05, "loss": 0.3282, "step": 307 }, { "epoch": 0.14, "grad_norm": 1.5028988325038972, "learning_rate": 3.8853700793873577e-05, "loss": 0.3218, "step": 308 }, { "epoch": 0.14, "grad_norm": 1.4457465267598126, "learning_rate": 3.88441819337051e-05, "loss": 0.3495, "step": 309 }, { "epoch": 0.14, "grad_norm": 1.2803238568536672, "learning_rate": 3.8834624890339074e-05, "loss": 0.3021, "step": 310 }, { "epoch": 0.14, "grad_norm": 1.3793732714069633, "learning_rate": 3.8825029683140564e-05, "loss": 0.3691, "step": 311 }, { "epoch": 0.14, "grad_norm": 1.1771198143271118, "learning_rate": 3.8815396331551935e-05, "loss": 0.3166, "step": 312 }, { "epoch": 0.14, "grad_norm": 1.2809326895451079, "learning_rate": 3.8805724855092865e-05, "loss": 0.3556, "step": 313 }, { "epoch": 0.14, "grad_norm": 1.0392000494527474, "learning_rate": 3.879601527336027e-05, "loss": 0.3344, "step": 314 }, { "epoch": 0.14, "grad_norm": 1.3418154874840607, "learning_rate": 3.878626760602828e-05, "loss": 0.2777, "step": 315 }, { "epoch": 0.14, "grad_norm": 1.2469284466830728, "learning_rate": 3.8776481872848186e-05, "loss": 0.3214, "step": 316 }, { "epoch": 0.14, "grad_norm": 1.6506490471217423, "learning_rate": 3.876665809364843e-05, "loss": 0.3546, "step": 317 }, { "epoch": 0.14, "grad_norm": 1.4417098224479656, "learning_rate": 3.875679628833453e-05, "loss": 0.3954, "step": 318 }, { "epoch": 0.14, "grad_norm": 1.1255518831168245, "learning_rate": 3.8746896476889066e-05, "loss": 0.2804, "step": 319 }, { "epoch": 0.14, "grad_norm": 1.3029310544086434, "learning_rate": 3.8736958679371615e-05, "loss": 0.3353, "step": 320 }, { "epoch": 0.14, "grad_norm": 1.1219865387535743, "learning_rate": 3.8726982915918736e-05, "loss": 0.318, "step": 321 }, { "epoch": 0.14, "grad_norm": 1.1713268269488624, "learning_rate": 3.8716969206743914e-05, "loss": 0.295, "step": 322 }, { "epoch": 0.14, "grad_norm": 1.3670028125814933, "learning_rate": 3.870691757213751e-05, "loss": 0.3785, "step": 323 }, { "epoch": 0.14, "grad_norm": 1.1377812823775164, "learning_rate": 3.869682803246676e-05, "loss": 0.3254, "step": 324 }, { "epoch": 0.14, "grad_norm": 1.1251498758299443, "learning_rate": 3.8686700608175663e-05, "loss": 0.3172, "step": 325 }, { "epoch": 0.14, "grad_norm": 1.161359375072263, "learning_rate": 3.867653531978502e-05, "loss": 0.3308, "step": 326 }, { "epoch": 0.14, "grad_norm": 1.1779195915071607, "learning_rate": 3.866633218789234e-05, "loss": 0.3321, "step": 327 }, { "epoch": 0.14, "grad_norm": 1.315962719554472, "learning_rate": 3.865609123317181e-05, "loss": 0.3385, "step": 328 }, { "epoch": 0.14, "grad_norm": 1.0851299109671326, "learning_rate": 3.864581247637426e-05, "loss": 0.2739, "step": 329 }, { "epoch": 0.14, "grad_norm": 1.172619646633328, "learning_rate": 3.863549593832711e-05, "loss": 0.3194, "step": 330 }, { "epoch": 0.15, "grad_norm": 1.1606662027649133, "learning_rate": 3.862514163993435e-05, "loss": 0.3648, "step": 331 }, { "epoch": 0.15, "grad_norm": 1.1320180803984803, "learning_rate": 3.861474960217647e-05, "loss": 0.3088, "step": 332 }, { "epoch": 0.15, "grad_norm": 1.0673327770180079, "learning_rate": 3.860431984611043e-05, "loss": 0.2737, "step": 333 }, { "epoch": 0.15, "grad_norm": 1.2381158471575533, "learning_rate": 3.859385239286963e-05, "loss": 0.3793, "step": 334 }, { "epoch": 0.15, "grad_norm": 1.125034119830359, "learning_rate": 3.858334726366383e-05, "loss": 0.3238, "step": 335 }, { "epoch": 0.15, "grad_norm": 1.116579170262754, "learning_rate": 3.857280447977914e-05, "loss": 0.3064, "step": 336 }, { "epoch": 0.15, "grad_norm": 1.0993821229427436, "learning_rate": 3.856222406257799e-05, "loss": 0.2711, "step": 337 }, { "epoch": 0.15, "grad_norm": 1.116732097720498, "learning_rate": 3.855160603349904e-05, "loss": 0.303, "step": 338 }, { "epoch": 0.15, "grad_norm": 1.3069757279883663, "learning_rate": 3.854095041405717e-05, "loss": 0.3512, "step": 339 }, { "epoch": 0.15, "grad_norm": 1.1307121009657524, "learning_rate": 3.853025722584342e-05, "loss": 0.2749, "step": 340 }, { "epoch": 0.15, "grad_norm": 1.4255837499662112, "learning_rate": 3.851952649052498e-05, "loss": 0.344, "step": 341 }, { "epoch": 0.15, "grad_norm": 1.218830399428904, "learning_rate": 3.8508758229845085e-05, "loss": 0.3474, "step": 342 }, { "epoch": 0.15, "grad_norm": 1.1672123019074119, "learning_rate": 3.849795246562302e-05, "loss": 0.2967, "step": 343 }, { "epoch": 0.15, "grad_norm": 1.4804156855028707, "learning_rate": 3.848710921975408e-05, "loss": 0.3159, "step": 344 }, { "epoch": 0.15, "grad_norm": 1.0446826696669325, "learning_rate": 3.847622851420948e-05, "loss": 0.2685, "step": 345 }, { "epoch": 0.15, "grad_norm": 1.232711008436584, "learning_rate": 3.8465310371036365e-05, "loss": 0.297, "step": 346 }, { "epoch": 0.15, "grad_norm": 1.1944439793125006, "learning_rate": 3.8454354812357705e-05, "loss": 0.3089, "step": 347 }, { "epoch": 0.15, "grad_norm": 1.1477671113297683, "learning_rate": 3.8443361860372305e-05, "loss": 0.2866, "step": 348 }, { "epoch": 0.15, "grad_norm": 1.2192700083633536, "learning_rate": 3.843233153735475e-05, "loss": 0.3052, "step": 349 }, { "epoch": 0.15, "grad_norm": 1.1044731374593606, "learning_rate": 3.8421263865655315e-05, "loss": 0.3017, "step": 350 }, { "epoch": 0.15, "grad_norm": 1.1420365052409571, "learning_rate": 3.841015886769998e-05, "loss": 0.2848, "step": 351 }, { "epoch": 0.15, "grad_norm": 1.059363616612133, "learning_rate": 3.839901656599035e-05, "loss": 0.2901, "step": 352 }, { "epoch": 0.16, "grad_norm": 1.0813821436225282, "learning_rate": 3.8387836983103624e-05, "loss": 0.2597, "step": 353 }, { "epoch": 0.16, "grad_norm": 1.2676930426890327, "learning_rate": 3.837662014169252e-05, "loss": 0.3066, "step": 354 }, { "epoch": 0.16, "grad_norm": 1.2128007889309735, "learning_rate": 3.836536606448527e-05, "loss": 0.303, "step": 355 }, { "epoch": 0.16, "grad_norm": 1.0413378705228298, "learning_rate": 3.8354074774285565e-05, "loss": 0.2779, "step": 356 }, { "epoch": 0.16, "grad_norm": 1.0795171291340453, "learning_rate": 3.8342746293972475e-05, "loss": 0.2873, "step": 357 }, { "epoch": 0.16, "grad_norm": 1.1244222959078172, "learning_rate": 3.833138064650044e-05, "loss": 0.3038, "step": 358 }, { "epoch": 0.16, "grad_norm": 1.133428577086121, "learning_rate": 3.831997785489921e-05, "loss": 0.2788, "step": 359 }, { "epoch": 0.16, "grad_norm": 1.3072909118325942, "learning_rate": 3.830853794227379e-05, "loss": 0.3311, "step": 360 }, { "epoch": 0.16, "grad_norm": 1.044447917938093, "learning_rate": 3.829706093180441e-05, "loss": 0.278, "step": 361 }, { "epoch": 0.16, "grad_norm": 1.1015452390360023, "learning_rate": 3.828554684674648e-05, "loss": 0.2939, "step": 362 }, { "epoch": 0.16, "grad_norm": 0.9740645848760728, "learning_rate": 3.82739957104305e-05, "loss": 0.2693, "step": 363 }, { "epoch": 0.16, "grad_norm": 1.1857088031822605, "learning_rate": 3.8262407546262085e-05, "loss": 0.3307, "step": 364 }, { "epoch": 0.16, "grad_norm": 0.9740399855115789, "learning_rate": 3.8250782377721855e-05, "loss": 0.2708, "step": 365 }, { "epoch": 0.16, "grad_norm": 1.2282173024986183, "learning_rate": 3.8239120228365415e-05, "loss": 0.3189, "step": 366 }, { "epoch": 0.16, "grad_norm": 1.225159054764995, "learning_rate": 3.82274211218233e-05, "loss": 0.2871, "step": 367 }, { "epoch": 0.16, "grad_norm": 1.2034539602813672, "learning_rate": 3.821568508180093e-05, "loss": 0.3173, "step": 368 }, { "epoch": 0.16, "grad_norm": 1.3040131979315388, "learning_rate": 3.8203912132078574e-05, "loss": 0.3157, "step": 369 }, { "epoch": 0.16, "grad_norm": 1.190223782438712, "learning_rate": 3.819210229651127e-05, "loss": 0.2923, "step": 370 }, { "epoch": 0.16, "grad_norm": 1.2465006005675532, "learning_rate": 3.818025559902881e-05, "loss": 0.3249, "step": 371 }, { "epoch": 0.16, "grad_norm": 1.1393751324995156, "learning_rate": 3.8168372063635677e-05, "loss": 0.3132, "step": 372 }, { "epoch": 0.16, "grad_norm": 1.2594163513967358, "learning_rate": 3.815645171441099e-05, "loss": 0.3078, "step": 373 }, { "epoch": 0.16, "grad_norm": 0.9654625209663587, "learning_rate": 3.814449457550847e-05, "loss": 0.274, "step": 374 }, { "epoch": 0.16, "grad_norm": 1.1190706400933415, "learning_rate": 3.813250067115638e-05, "loss": 0.2627, "step": 375 }, { "epoch": 0.17, "grad_norm": 1.1523361464427377, "learning_rate": 3.8120470025657475e-05, "loss": 0.2711, "step": 376 }, { "epoch": 0.17, "grad_norm": 1.1577231842474236, "learning_rate": 3.810840266338897e-05, "loss": 0.3119, "step": 377 }, { "epoch": 0.17, "grad_norm": 1.3914208867086608, "learning_rate": 3.8096298608802455e-05, "loss": 0.3533, "step": 378 }, { "epoch": 0.17, "grad_norm": 1.0676526062851714, "learning_rate": 3.80841578864239e-05, "loss": 0.2576, "step": 379 }, { "epoch": 0.17, "grad_norm": 1.0875564692075579, "learning_rate": 3.8071980520853545e-05, "loss": 0.2686, "step": 380 }, { "epoch": 0.17, "grad_norm": 1.165603999544428, "learning_rate": 3.805976653676588e-05, "loss": 0.2832, "step": 381 }, { "epoch": 0.17, "grad_norm": 1.1856258560845196, "learning_rate": 3.804751595890962e-05, "loss": 0.2861, "step": 382 }, { "epoch": 0.17, "grad_norm": 1.196255530062562, "learning_rate": 3.803522881210761e-05, "loss": 0.3129, "step": 383 }, { "epoch": 0.17, "grad_norm": 1.114296273078427, "learning_rate": 3.802290512125679e-05, "loss": 0.3003, "step": 384 }, { "epoch": 0.17, "grad_norm": 1.1285528666054545, "learning_rate": 3.801054491132815e-05, "loss": 0.2474, "step": 385 }, { "epoch": 0.17, "grad_norm": 1.0182947136090013, "learning_rate": 3.799814820736668e-05, "loss": 0.2405, "step": 386 }, { "epoch": 0.17, "grad_norm": 1.1977321095588573, "learning_rate": 3.798571503449132e-05, "loss": 0.3049, "step": 387 }, { "epoch": 0.17, "grad_norm": 1.105739654668564, "learning_rate": 3.79732454178949e-05, "loss": 0.2949, "step": 388 }, { "epoch": 0.17, "grad_norm": 1.249379862498696, "learning_rate": 3.796073938284408e-05, "loss": 0.3035, "step": 389 }, { "epoch": 0.17, "grad_norm": 1.1986092376006745, "learning_rate": 3.794819695467936e-05, "loss": 0.2816, "step": 390 }, { "epoch": 0.17, "grad_norm": 1.1251436247803976, "learning_rate": 3.7935618158814936e-05, "loss": 0.2729, "step": 391 }, { "epoch": 0.17, "grad_norm": 1.1071454061311834, "learning_rate": 3.79230030207387e-05, "loss": 0.2993, "step": 392 }, { "epoch": 0.17, "grad_norm": 1.084245859860781, "learning_rate": 3.79103515660122e-05, "loss": 0.2703, "step": 393 }, { "epoch": 0.17, "grad_norm": 1.1499400786669434, "learning_rate": 3.7897663820270555e-05, "loss": 0.2996, "step": 394 }, { "epoch": 0.17, "grad_norm": 1.3333723191681095, "learning_rate": 3.788493980922245e-05, "loss": 0.3108, "step": 395 }, { "epoch": 0.17, "grad_norm": 1.0842750503253293, "learning_rate": 3.787217955865e-05, "loss": 0.2888, "step": 396 }, { "epoch": 0.17, "grad_norm": 1.069443398567338, "learning_rate": 3.78593830944088e-05, "loss": 0.2557, "step": 397 }, { "epoch": 0.17, "grad_norm": 1.0504052719734938, "learning_rate": 3.78465504424278e-05, "loss": 0.2608, "step": 398 }, { "epoch": 0.18, "grad_norm": 1.0615119828182573, "learning_rate": 3.7833681628709275e-05, "loss": 0.3042, "step": 399 }, { "epoch": 0.18, "grad_norm": 1.0314263857647374, "learning_rate": 3.782077667932878e-05, "loss": 0.283, "step": 400 }, { "epoch": 0.18, "grad_norm": 1.008082449784267, "learning_rate": 3.780783562043509e-05, "loss": 0.2444, "step": 401 }, { "epoch": 0.18, "grad_norm": 1.1010002703480124, "learning_rate": 3.779485847825014e-05, "loss": 0.2773, "step": 402 }, { "epoch": 0.18, "grad_norm": 1.1032017084094645, "learning_rate": 3.778184527906899e-05, "loss": 0.2847, "step": 403 }, { "epoch": 0.18, "grad_norm": 1.0539286956772262, "learning_rate": 3.776879604925975e-05, "loss": 0.2573, "step": 404 }, { "epoch": 0.18, "grad_norm": 1.07200911325168, "learning_rate": 3.775571081526355e-05, "loss": 0.2422, "step": 405 }, { "epoch": 0.18, "grad_norm": 1.2122354749807307, "learning_rate": 3.7742589603594455e-05, "loss": 0.2742, "step": 406 }, { "epoch": 0.18, "grad_norm": 1.1206726453006293, "learning_rate": 3.772943244083944e-05, "loss": 0.2701, "step": 407 }, { "epoch": 0.18, "grad_norm": 1.1312261589446488, "learning_rate": 3.771623935365834e-05, "loss": 0.2651, "step": 408 }, { "epoch": 0.18, "grad_norm": 1.163046322907833, "learning_rate": 3.770301036878377e-05, "loss": 0.2613, "step": 409 }, { "epoch": 0.18, "grad_norm": 1.0666758996345351, "learning_rate": 3.768974551302107e-05, "loss": 0.2711, "step": 410 }, { "epoch": 0.18, "grad_norm": 1.186691008614609, "learning_rate": 3.7676444813248284e-05, "loss": 0.3083, "step": 411 }, { "epoch": 0.18, "grad_norm": 1.2157248585929474, "learning_rate": 3.766310829641608e-05, "loss": 0.3066, "step": 412 }, { "epoch": 0.18, "grad_norm": 0.9884229113707944, "learning_rate": 3.764973598954769e-05, "loss": 0.2187, "step": 413 }, { "epoch": 0.18, "grad_norm": 1.0300407255324875, "learning_rate": 3.763632791973888e-05, "loss": 0.275, "step": 414 }, { "epoch": 0.18, "grad_norm": 1.1276072912488584, "learning_rate": 3.762288411415788e-05, "loss": 0.2851, "step": 415 }, { "epoch": 0.18, "grad_norm": 0.9673372242146259, "learning_rate": 3.760940460004531e-05, "loss": 0.2093, "step": 416 }, { "epoch": 0.18, "grad_norm": 1.1101539992994867, "learning_rate": 3.759588940471417e-05, "loss": 0.3021, "step": 417 }, { "epoch": 0.18, "grad_norm": 0.9762734862859238, "learning_rate": 3.758233855554976e-05, "loss": 0.2002, "step": 418 }, { "epoch": 0.18, "grad_norm": 1.0405400032408276, "learning_rate": 3.756875208000959e-05, "loss": 0.276, "step": 419 }, { "epoch": 0.18, "grad_norm": 1.0972222933118665, "learning_rate": 3.7555130005623395e-05, "loss": 0.2581, "step": 420 }, { "epoch": 0.18, "grad_norm": 1.1366455474040047, "learning_rate": 3.754147235999303e-05, "loss": 0.2445, "step": 421 }, { "epoch": 0.19, "grad_norm": 1.051253315560899, "learning_rate": 3.752777917079242e-05, "loss": 0.2498, "step": 422 }, { "epoch": 0.19, "grad_norm": 1.17841204477928, "learning_rate": 3.751405046576752e-05, "loss": 0.2512, "step": 423 }, { "epoch": 0.19, "grad_norm": 1.1179544977096696, "learning_rate": 3.7500286272736246e-05, "loss": 0.2582, "step": 424 }, { "epoch": 0.19, "grad_norm": 1.04526544074401, "learning_rate": 3.748648661958841e-05, "loss": 0.2523, "step": 425 }, { "epoch": 0.19, "grad_norm": 1.1933543802959101, "learning_rate": 3.74726515342857e-05, "loss": 0.2516, "step": 426 }, { "epoch": 0.19, "grad_norm": 1.0787631924066758, "learning_rate": 3.7458781044861585e-05, "loss": 0.2513, "step": 427 }, { "epoch": 0.19, "grad_norm": 1.4117152084703044, "learning_rate": 3.7444875179421266e-05, "loss": 0.2869, "step": 428 }, { "epoch": 0.19, "grad_norm": 1.0976022727428405, "learning_rate": 3.7430933966141634e-05, "loss": 0.2742, "step": 429 }, { "epoch": 0.19, "grad_norm": 1.0772573877675737, "learning_rate": 3.741695743327119e-05, "loss": 0.2654, "step": 430 }, { "epoch": 0.19, "grad_norm": 1.057470704252306, "learning_rate": 3.740294560913003e-05, "loss": 0.2411, "step": 431 }, { "epoch": 0.19, "grad_norm": 1.0107676037527527, "learning_rate": 3.738889852210974e-05, "loss": 0.2272, "step": 432 }, { "epoch": 0.19, "grad_norm": 1.0865904128247288, "learning_rate": 3.737481620067335e-05, "loss": 0.2419, "step": 433 }, { "epoch": 0.19, "grad_norm": 0.9974252216733845, "learning_rate": 3.73606986733553e-05, "loss": 0.2322, "step": 434 }, { "epoch": 0.19, "grad_norm": 1.2264896055295063, "learning_rate": 3.7346545968761355e-05, "loss": 0.2731, "step": 435 }, { "epoch": 0.19, "grad_norm": 1.0181320937666516, "learning_rate": 3.7332358115568566e-05, "loss": 0.2334, "step": 436 }, { "epoch": 0.19, "grad_norm": 1.1237513712534106, "learning_rate": 3.73181351425252e-05, "loss": 0.2665, "step": 437 }, { "epoch": 0.19, "grad_norm": 0.9584264122639083, "learning_rate": 3.730387707845069e-05, "loss": 0.2211, "step": 438 }, { "epoch": 0.19, "grad_norm": 0.9906235030009514, "learning_rate": 3.7289583952235574e-05, "loss": 0.2114, "step": 439 }, { "epoch": 0.19, "grad_norm": 0.9576445191085456, "learning_rate": 3.727525579284143e-05, "loss": 0.2362, "step": 440 }, { "epoch": 0.19, "grad_norm": 0.9841432147148322, "learning_rate": 3.726089262930081e-05, "loss": 0.2269, "step": 441 }, { "epoch": 0.19, "grad_norm": 1.3381411575418058, "learning_rate": 3.7246494490717215e-05, "loss": 0.3072, "step": 442 }, { "epoch": 0.19, "grad_norm": 0.9631139716742949, "learning_rate": 3.723206140626501e-05, "loss": 0.2094, "step": 443 }, { "epoch": 0.2, "grad_norm": 1.1884460696067658, "learning_rate": 3.721759340518937e-05, "loss": 0.2494, "step": 444 }, { "epoch": 0.2, "grad_norm": 1.0299418851512645, "learning_rate": 3.7203090516806205e-05, "loss": 0.2527, "step": 445 }, { "epoch": 0.2, "grad_norm": 1.0059035208190226, "learning_rate": 3.7188552770502125e-05, "loss": 0.2638, "step": 446 }, { "epoch": 0.2, "grad_norm": 1.1036220262932213, "learning_rate": 3.717398019573438e-05, "loss": 0.268, "step": 447 }, { "epoch": 0.2, "grad_norm": 1.015189268101601, "learning_rate": 3.715937282203078e-05, "loss": 0.2406, "step": 448 }, { "epoch": 0.2, "grad_norm": 1.0255573827385827, "learning_rate": 3.714473067898965e-05, "loss": 0.2591, "step": 449 }, { "epoch": 0.2, "grad_norm": 0.9396164845259255, "learning_rate": 3.713005379627978e-05, "loss": 0.239, "step": 450 }, { "epoch": 0.2, "grad_norm": 1.051183290394776, "learning_rate": 3.7115342203640315e-05, "loss": 0.2523, "step": 451 }, { "epoch": 0.2, "grad_norm": 1.038513554862486, "learning_rate": 3.710059593088078e-05, "loss": 0.2576, "step": 452 }, { "epoch": 0.2, "grad_norm": 0.9896430181607495, "learning_rate": 3.708581500788093e-05, "loss": 0.2382, "step": 453 }, { "epoch": 0.2, "grad_norm": 0.8625211174079745, "learning_rate": 3.7070999464590757e-05, "loss": 0.2096, "step": 454 }, { "epoch": 0.2, "grad_norm": 1.1498975231288286, "learning_rate": 3.705614933103038e-05, "loss": 0.2688, "step": 455 }, { "epoch": 0.2, "grad_norm": 1.0249613708102063, "learning_rate": 3.704126463729003e-05, "loss": 0.244, "step": 456 }, { "epoch": 0.2, "grad_norm": 1.0453786261702158, "learning_rate": 3.702634541352995e-05, "loss": 0.279, "step": 457 }, { "epoch": 0.2, "grad_norm": 1.0838779156618934, "learning_rate": 3.701139168998036e-05, "loss": 0.2494, "step": 458 }, { "epoch": 0.2, "grad_norm": 0.9884263486230128, "learning_rate": 3.6996403496941374e-05, "loss": 0.2226, "step": 459 }, { "epoch": 0.2, "grad_norm": 1.08298073434645, "learning_rate": 3.698138086478296e-05, "loss": 0.2469, "step": 460 }, { "epoch": 0.2, "grad_norm": 1.0519171605582838, "learning_rate": 3.6966323823944864e-05, "loss": 0.2239, "step": 461 }, { "epoch": 0.2, "grad_norm": 1.1029922189308377, "learning_rate": 3.6951232404936556e-05, "loss": 0.2342, "step": 462 }, { "epoch": 0.2, "grad_norm": 1.1254803373838418, "learning_rate": 3.693610663833716e-05, "loss": 0.2596, "step": 463 }, { "epoch": 0.2, "grad_norm": 0.9576330044751333, "learning_rate": 3.69209465547954e-05, "loss": 0.2467, "step": 464 }, { "epoch": 0.2, "grad_norm": 0.9853701329820604, "learning_rate": 3.690575218502953e-05, "loss": 0.2304, "step": 465 }, { "epoch": 0.2, "grad_norm": 0.9146891019368586, "learning_rate": 3.68905235598273e-05, "loss": 0.2019, "step": 466 }, { "epoch": 0.21, "grad_norm": 1.0186222160096274, "learning_rate": 3.687526071004583e-05, "loss": 0.2336, "step": 467 }, { "epoch": 0.21, "grad_norm": 0.9485099563699457, "learning_rate": 3.685996366661163e-05, "loss": 0.1945, "step": 468 }, { "epoch": 0.21, "grad_norm": 0.9449915380452257, "learning_rate": 3.684463246052047e-05, "loss": 0.222, "step": 469 }, { "epoch": 0.21, "grad_norm": 1.1220747214712932, "learning_rate": 3.6829267122837334e-05, "loss": 0.2297, "step": 470 }, { "epoch": 0.21, "grad_norm": 1.2165172371918984, "learning_rate": 3.681386768469639e-05, "loss": 0.2306, "step": 471 }, { "epoch": 0.21, "grad_norm": 1.060383877250389, "learning_rate": 3.679843417730089e-05, "loss": 0.2489, "step": 472 }, { "epoch": 0.21, "grad_norm": 1.136996834944293, "learning_rate": 3.6782966631923134e-05, "loss": 0.2116, "step": 473 }, { "epoch": 0.21, "grad_norm": 0.919677507538713, "learning_rate": 3.676746507990435e-05, "loss": 0.2395, "step": 474 }, { "epoch": 0.21, "grad_norm": 1.0035377152544445, "learning_rate": 3.675192955265473e-05, "loss": 0.2274, "step": 475 }, { "epoch": 0.21, "grad_norm": 1.023605565838665, "learning_rate": 3.6736360081653265e-05, "loss": 0.2067, "step": 476 }, { "epoch": 0.21, "grad_norm": 1.029190086891063, "learning_rate": 3.672075669844774e-05, "loss": 0.2444, "step": 477 }, { "epoch": 0.21, "grad_norm": 1.0984810007958705, "learning_rate": 3.670511943465465e-05, "loss": 0.2158, "step": 478 }, { "epoch": 0.21, "grad_norm": 0.9800967236392774, "learning_rate": 3.668944832195916e-05, "loss": 0.234, "step": 479 }, { "epoch": 0.21, "grad_norm": 1.1748577559384978, "learning_rate": 3.667374339211499e-05, "loss": 0.2251, "step": 480 }, { "epoch": 0.21, "grad_norm": 0.9632607574634446, "learning_rate": 3.6658004676944406e-05, "loss": 0.188, "step": 481 }, { "epoch": 0.21, "grad_norm": 1.006796154342539, "learning_rate": 3.664223220833813e-05, "loss": 0.2396, "step": 482 }, { "epoch": 0.21, "grad_norm": 1.0880725099018904, "learning_rate": 3.662642601825526e-05, "loss": 0.2121, "step": 483 }, { "epoch": 0.21, "grad_norm": 1.1921721951163484, "learning_rate": 3.661058613872324e-05, "loss": 0.267, "step": 484 }, { "epoch": 0.21, "grad_norm": 1.2046385818227212, "learning_rate": 3.6594712601837776e-05, "loss": 0.293, "step": 485 }, { "epoch": 0.21, "grad_norm": 1.0186355573346053, "learning_rate": 3.657880543976277e-05, "loss": 0.2075, "step": 486 }, { "epoch": 0.21, "grad_norm": 0.9130081878035919, "learning_rate": 3.656286468473025e-05, "loss": 0.227, "step": 487 }, { "epoch": 0.21, "grad_norm": 0.9447548222253443, "learning_rate": 3.654689036904032e-05, "loss": 0.2147, "step": 488 }, { "epoch": 0.21, "grad_norm": 1.0260829949486971, "learning_rate": 3.6530882525061084e-05, "loss": 0.2244, "step": 489 }, { "epoch": 0.22, "grad_norm": 1.0201861152828826, "learning_rate": 3.6514841185228586e-05, "loss": 0.2292, "step": 490 }, { "epoch": 0.22, "grad_norm": 1.0835662860319046, "learning_rate": 3.649876638204674e-05, "loss": 0.2319, "step": 491 }, { "epoch": 0.22, "grad_norm": 1.09507192018423, "learning_rate": 3.648265814808726e-05, "loss": 0.2326, "step": 492 }, { "epoch": 0.22, "grad_norm": 1.150397567857604, "learning_rate": 3.64665165159896e-05, "loss": 0.2447, "step": 493 }, { "epoch": 0.22, "grad_norm": 1.0567461620375729, "learning_rate": 3.6450341518460896e-05, "loss": 0.2626, "step": 494 }, { "epoch": 0.22, "grad_norm": 1.059242190697995, "learning_rate": 3.643413318827589e-05, "loss": 0.2112, "step": 495 }, { "epoch": 0.22, "grad_norm": 0.9625368841956355, "learning_rate": 3.641789155827685e-05, "loss": 0.2303, "step": 496 }, { "epoch": 0.22, "grad_norm": 1.010145636058018, "learning_rate": 3.640161666137354e-05, "loss": 0.2281, "step": 497 }, { "epoch": 0.22, "grad_norm": 0.9188257964651, "learning_rate": 3.638530853054312e-05, "loss": 0.2018, "step": 498 }, { "epoch": 0.22, "grad_norm": 1.1172769684079353, "learning_rate": 3.636896719883009e-05, "loss": 0.2312, "step": 499 }, { "epoch": 0.22, "grad_norm": 0.979813984032668, "learning_rate": 3.63525926993462e-05, "loss": 0.2031, "step": 500 }, { "epoch": 0.22, "grad_norm": 0.9048496147206796, "learning_rate": 3.633618506527047e-05, "loss": 0.2114, "step": 501 }, { "epoch": 0.22, "grad_norm": 0.907442754931394, "learning_rate": 3.631974432984899e-05, "loss": 0.2011, "step": 502 }, { "epoch": 0.22, "grad_norm": 0.9607513061264416, "learning_rate": 3.630327052639497e-05, "loss": 0.2157, "step": 503 }, { "epoch": 0.22, "grad_norm": 1.0137465491747655, "learning_rate": 3.62867636882886e-05, "loss": 0.3061, "step": 504 }, { "epoch": 0.22, "grad_norm": 0.8813433617991808, "learning_rate": 3.6270223848977e-05, "loss": 0.1999, "step": 505 }, { "epoch": 0.22, "grad_norm": 0.8778965214966409, "learning_rate": 3.62536510419742e-05, "loss": 0.2069, "step": 506 }, { "epoch": 0.22, "grad_norm": 1.043491361175589, "learning_rate": 3.623704530086097e-05, "loss": 0.237, "step": 507 }, { "epoch": 0.22, "grad_norm": 0.9455442538584001, "learning_rate": 3.622040665928488e-05, "loss": 0.2071, "step": 508 }, { "epoch": 0.22, "grad_norm": 0.9565253652887441, "learning_rate": 3.6203735150960105e-05, "loss": 0.2113, "step": 509 }, { "epoch": 0.22, "grad_norm": 0.8681064239008521, "learning_rate": 3.618703080966745e-05, "loss": 0.1914, "step": 510 }, { "epoch": 0.22, "grad_norm": 0.849550758181329, "learning_rate": 3.6170293669254246e-05, "loss": 0.1981, "step": 511 }, { "epoch": 0.22, "grad_norm": 0.9091903407274792, "learning_rate": 3.6153523763634275e-05, "loss": 0.24, "step": 512 }, { "epoch": 0.23, "grad_norm": 0.9166735663298832, "learning_rate": 3.613672112678771e-05, "loss": 0.2088, "step": 513 }, { "epoch": 0.23, "grad_norm": 1.0044473458935552, "learning_rate": 3.611988579276106e-05, "loss": 0.2533, "step": 514 }, { "epoch": 0.23, "grad_norm": 0.9388376864993713, "learning_rate": 3.6103017795667056e-05, "loss": 0.2236, "step": 515 }, { "epoch": 0.23, "grad_norm": 0.9569632058379927, "learning_rate": 3.608611716968465e-05, "loss": 0.2428, "step": 516 }, { "epoch": 0.23, "grad_norm": 1.0188841538357947, "learning_rate": 3.606918394905889e-05, "loss": 0.2015, "step": 517 }, { "epoch": 0.23, "grad_norm": 0.9166374454753642, "learning_rate": 3.605221816810086e-05, "loss": 0.2259, "step": 518 }, { "epoch": 0.23, "grad_norm": 1.07660651040884, "learning_rate": 3.603521986118764e-05, "loss": 0.2473, "step": 519 }, { "epoch": 0.23, "grad_norm": 1.0202600205669439, "learning_rate": 3.6018189062762204e-05, "loss": 0.2203, "step": 520 }, { "epoch": 0.23, "grad_norm": 0.9325915907298034, "learning_rate": 3.6001125807333365e-05, "loss": 0.2101, "step": 521 }, { "epoch": 0.23, "grad_norm": 0.9625129500669749, "learning_rate": 3.5984030129475696e-05, "loss": 0.225, "step": 522 }, { "epoch": 0.23, "grad_norm": 0.9470126413953757, "learning_rate": 3.596690206382948e-05, "loss": 0.2284, "step": 523 }, { "epoch": 0.23, "grad_norm": 0.9094865815702144, "learning_rate": 3.594974164510061e-05, "loss": 0.1807, "step": 524 }, { "epoch": 0.23, "grad_norm": 0.9776737147071879, "learning_rate": 3.5932548908060546e-05, "loss": 0.1986, "step": 525 }, { "epoch": 0.23, "grad_norm": 1.0676964301709624, "learning_rate": 3.5915323887546236e-05, "loss": 0.2058, "step": 526 }, { "epoch": 0.23, "grad_norm": 1.1713760330965826, "learning_rate": 3.5898066618460024e-05, "loss": 0.2464, "step": 527 }, { "epoch": 0.23, "grad_norm": 1.0268891347607694, "learning_rate": 3.5880777135769624e-05, "loss": 0.2332, "step": 528 }, { "epoch": 0.23, "grad_norm": 0.9472452063178686, "learning_rate": 3.586345547450801e-05, "loss": 0.2268, "step": 529 }, { "epoch": 0.23, "grad_norm": 1.060030142157625, "learning_rate": 3.5846101669773344e-05, "loss": 0.2054, "step": 530 }, { "epoch": 0.23, "grad_norm": 0.909191651885311, "learning_rate": 3.582871575672895e-05, "loss": 0.2347, "step": 531 }, { "epoch": 0.23, "grad_norm": 0.9464062125143163, "learning_rate": 3.581129777060318e-05, "loss": 0.238, "step": 532 }, { "epoch": 0.23, "grad_norm": 0.8790508236128243, "learning_rate": 3.579384774668939e-05, "loss": 0.1902, "step": 533 }, { "epoch": 0.23, "grad_norm": 1.0166100053683993, "learning_rate": 3.5776365720345864e-05, "loss": 0.2128, "step": 534 }, { "epoch": 0.24, "grad_norm": 0.9199037651336612, "learning_rate": 3.575885172699571e-05, "loss": 0.1841, "step": 535 }, { "epoch": 0.24, "grad_norm": 1.211957538817766, "learning_rate": 3.574130580212682e-05, "loss": 0.2271, "step": 536 }, { "epoch": 0.24, "grad_norm": 0.936776517815941, "learning_rate": 3.5723727981291795e-05, "loss": 0.2115, "step": 537 }, { "epoch": 0.24, "grad_norm": 0.8647613954832651, "learning_rate": 3.570611830010784e-05, "loss": 0.1884, "step": 538 }, { "epoch": 0.24, "grad_norm": 0.9915423961853181, "learning_rate": 3.568847679425675e-05, "loss": 0.249, "step": 539 }, { "epoch": 0.24, "grad_norm": 0.9597729395027575, "learning_rate": 3.567080349948478e-05, "loss": 0.1964, "step": 540 }, { "epoch": 0.24, "grad_norm": 0.9235205430646206, "learning_rate": 3.56530984516026e-05, "loss": 0.2012, "step": 541 }, { "epoch": 0.24, "grad_norm": 1.0321526478384588, "learning_rate": 3.5635361686485246e-05, "loss": 0.2264, "step": 542 }, { "epoch": 0.24, "grad_norm": 0.8940557690406719, "learning_rate": 3.561759324007199e-05, "loss": 0.1878, "step": 543 }, { "epoch": 0.24, "grad_norm": 1.1196711216590276, "learning_rate": 3.559979314836631e-05, "loss": 0.2264, "step": 544 }, { "epoch": 0.24, "grad_norm": 0.9037187676948977, "learning_rate": 3.558196144743581e-05, "loss": 0.1885, "step": 545 }, { "epoch": 0.24, "grad_norm": 1.0222066801785479, "learning_rate": 3.556409817341213e-05, "loss": 0.1974, "step": 546 }, { "epoch": 0.24, "grad_norm": 1.0032283408866247, "learning_rate": 3.554620336249091e-05, "loss": 0.2054, "step": 547 }, { "epoch": 0.24, "grad_norm": 0.971332399807911, "learning_rate": 3.552827705093166e-05, "loss": 0.219, "step": 548 }, { "epoch": 0.24, "grad_norm": 0.918790927935508, "learning_rate": 3.551031927505774e-05, "loss": 0.2193, "step": 549 }, { "epoch": 0.24, "grad_norm": 0.8990064395190178, "learning_rate": 3.5492330071256255e-05, "loss": 0.2033, "step": 550 }, { "epoch": 0.24, "grad_norm": 0.8833911316034189, "learning_rate": 3.5474309475978006e-05, "loss": 0.2147, "step": 551 }, { "epoch": 0.24, "grad_norm": 0.8402183584391892, "learning_rate": 3.545625752573738e-05, "loss": 0.1837, "step": 552 }, { "epoch": 0.24, "grad_norm": 0.9287072150539883, "learning_rate": 3.5438174257112316e-05, "loss": 0.1939, "step": 553 }, { "epoch": 0.24, "grad_norm": 0.9189688246796976, "learning_rate": 3.54200597067442e-05, "loss": 0.207, "step": 554 }, { "epoch": 0.24, "grad_norm": 0.9214621767197787, "learning_rate": 3.540191391133781e-05, "loss": 0.1912, "step": 555 }, { "epoch": 0.24, "grad_norm": 0.9719815959306592, "learning_rate": 3.538373690766122e-05, "loss": 0.2167, "step": 556 }, { "epoch": 0.24, "grad_norm": 0.8379328079097955, "learning_rate": 3.536552873254576e-05, "loss": 0.1798, "step": 557 }, { "epoch": 0.25, "grad_norm": 0.891367652853158, "learning_rate": 3.534728942288591e-05, "loss": 0.1863, "step": 558 }, { "epoch": 0.25, "grad_norm": 0.9057465832209125, "learning_rate": 3.532901901563925e-05, "loss": 0.1914, "step": 559 }, { "epoch": 0.25, "grad_norm": 0.9102070020679166, "learning_rate": 3.5310717547826345e-05, "loss": 0.1931, "step": 560 }, { "epoch": 0.25, "grad_norm": 0.9135805799547209, "learning_rate": 3.529238505653073e-05, "loss": 0.1905, "step": 561 }, { "epoch": 0.25, "grad_norm": 0.8379458251412093, "learning_rate": 3.527402157889877e-05, "loss": 0.1763, "step": 562 }, { "epoch": 0.25, "grad_norm": 1.1500055934935831, "learning_rate": 3.5255627152139635e-05, "loss": 0.2295, "step": 563 }, { "epoch": 0.25, "grad_norm": 1.008528935294881, "learning_rate": 3.523720181352521e-05, "loss": 0.2403, "step": 564 }, { "epoch": 0.25, "grad_norm": 1.02612063625215, "learning_rate": 3.5218745600389996e-05, "loss": 0.1888, "step": 565 }, { "epoch": 0.25, "grad_norm": 0.8265578358163461, "learning_rate": 3.5200258550131075e-05, "loss": 0.1848, "step": 566 }, { "epoch": 0.25, "grad_norm": 0.964718582744954, "learning_rate": 3.518174070020799e-05, "loss": 0.1845, "step": 567 }, { "epoch": 0.25, "grad_norm": 0.8695204344262746, "learning_rate": 3.516319208814272e-05, "loss": 0.1906, "step": 568 }, { "epoch": 0.25, "grad_norm": 0.8755074119711626, "learning_rate": 3.5144612751519556e-05, "loss": 0.1962, "step": 569 }, { "epoch": 0.25, "grad_norm": 0.9099243589026551, "learning_rate": 3.512600272798504e-05, "loss": 0.1888, "step": 570 }, { "epoch": 0.25, "grad_norm": 0.9525515910817286, "learning_rate": 3.5107362055247925e-05, "loss": 0.1858, "step": 571 }, { "epoch": 0.25, "grad_norm": 0.9827843484652299, "learning_rate": 3.508869077107903e-05, "loss": 0.1962, "step": 572 }, { "epoch": 0.25, "grad_norm": 0.9241324441682974, "learning_rate": 3.506998891331122e-05, "loss": 0.1844, "step": 573 }, { "epoch": 0.25, "grad_norm": 0.8764414154576801, "learning_rate": 3.5051256519839306e-05, "loss": 0.1983, "step": 574 }, { "epoch": 0.25, "grad_norm": 0.9962240694852259, "learning_rate": 3.503249362861998e-05, "loss": 0.2186, "step": 575 }, { "epoch": 0.25, "grad_norm": 0.8395124979798536, "learning_rate": 3.501370027767172e-05, "loss": 0.1824, "step": 576 }, { "epoch": 0.25, "grad_norm": 0.9075988399915986, "learning_rate": 3.499487650507472e-05, "loss": 0.2032, "step": 577 }, { "epoch": 0.25, "grad_norm": 0.857906890845557, "learning_rate": 3.497602234897084e-05, "loss": 0.1774, "step": 578 }, { "epoch": 0.25, "grad_norm": 0.9308436281550754, "learning_rate": 3.495713784756346e-05, "loss": 0.2007, "step": 579 }, { "epoch": 0.25, "grad_norm": 0.8480004521494686, "learning_rate": 3.49382230391175e-05, "loss": 0.1735, "step": 580 }, { "epoch": 0.26, "grad_norm": 0.9062430282850774, "learning_rate": 3.491927796195926e-05, "loss": 0.1786, "step": 581 }, { "epoch": 0.26, "grad_norm": 0.8980274757523379, "learning_rate": 3.490030265447637e-05, "loss": 0.2008, "step": 582 }, { "epoch": 0.26, "grad_norm": 0.8593206561852432, "learning_rate": 3.4881297155117726e-05, "loss": 0.1996, "step": 583 }, { "epoch": 0.26, "grad_norm": 0.7993542866711273, "learning_rate": 3.4862261502393395e-05, "loss": 0.1677, "step": 584 }, { "epoch": 0.26, "grad_norm": 0.890627141581855, "learning_rate": 3.484319573487455e-05, "loss": 0.188, "step": 585 }, { "epoch": 0.26, "grad_norm": 0.913789431220837, "learning_rate": 3.482409989119337e-05, "loss": 0.1887, "step": 586 }, { "epoch": 0.26, "grad_norm": 0.9284292730572319, "learning_rate": 3.480497401004299e-05, "loss": 0.2047, "step": 587 }, { "epoch": 0.26, "grad_norm": 0.9620746923801781, "learning_rate": 3.47858181301774e-05, "loss": 0.2045, "step": 588 }, { "epoch": 0.26, "grad_norm": 0.8004033383225002, "learning_rate": 3.476663229041138e-05, "loss": 0.1542, "step": 589 }, { "epoch": 0.26, "grad_norm": 0.8783926247147279, "learning_rate": 3.474741652962042e-05, "loss": 0.2007, "step": 590 }, { "epoch": 0.26, "grad_norm": 0.914624717807244, "learning_rate": 3.472817088674062e-05, "loss": 0.2104, "step": 591 }, { "epoch": 0.26, "grad_norm": 0.8761573018168645, "learning_rate": 3.470889540076865e-05, "loss": 0.1809, "step": 592 }, { "epoch": 0.26, "grad_norm": 0.8555532614015259, "learning_rate": 3.468959011076165e-05, "loss": 0.1966, "step": 593 }, { "epoch": 0.26, "grad_norm": 0.8697497687080429, "learning_rate": 3.467025505583712e-05, "loss": 0.1738, "step": 594 }, { "epoch": 0.26, "grad_norm": 0.8410179623625824, "learning_rate": 3.465089027517291e-05, "loss": 0.1637, "step": 595 }, { "epoch": 0.26, "grad_norm": 0.814020568002007, "learning_rate": 3.463149580800708e-05, "loss": 0.1786, "step": 596 }, { "epoch": 0.26, "grad_norm": 0.8878159309919769, "learning_rate": 3.461207169363785e-05, "loss": 0.1914, "step": 597 }, { "epoch": 0.26, "grad_norm": 0.8606687344344771, "learning_rate": 3.4592617971423515e-05, "loss": 0.1919, "step": 598 }, { "epoch": 0.26, "grad_norm": 0.8137093859966277, "learning_rate": 3.4573134680782344e-05, "loss": 0.1725, "step": 599 }, { "epoch": 0.26, "grad_norm": 0.8808367741089609, "learning_rate": 3.455362186119255e-05, "loss": 0.1993, "step": 600 }, { "epoch": 0.26, "grad_norm": 0.8430402030747981, "learning_rate": 3.4534079552192164e-05, "loss": 0.1629, "step": 601 }, { "epoch": 0.26, "grad_norm": 0.852939856704248, "learning_rate": 3.4514507793378955e-05, "loss": 0.1944, "step": 602 }, { "epoch": 0.26, "grad_norm": 0.7620741577458139, "learning_rate": 3.44949066244104e-05, "loss": 0.1724, "step": 603 }, { "epoch": 0.27, "grad_norm": 0.9159360415897062, "learning_rate": 3.447527608500353e-05, "loss": 0.1885, "step": 604 }, { "epoch": 0.27, "grad_norm": 0.856956338173056, "learning_rate": 3.4455616214934915e-05, "loss": 0.161, "step": 605 }, { "epoch": 0.27, "grad_norm": 0.9936750179768626, "learning_rate": 3.4435927054040554e-05, "loss": 0.2053, "step": 606 }, { "epoch": 0.27, "grad_norm": 0.8779144794984237, "learning_rate": 3.441620864221578e-05, "loss": 0.1775, "step": 607 }, { "epoch": 0.27, "grad_norm": 0.8143580539020531, "learning_rate": 3.439646101941521e-05, "loss": 0.1748, "step": 608 }, { "epoch": 0.27, "grad_norm": 0.8948211866371334, "learning_rate": 3.4376684225652656e-05, "loss": 0.1974, "step": 609 }, { "epoch": 0.27, "grad_norm": 0.826447713630002, "learning_rate": 3.435687830100101e-05, "loss": 0.1955, "step": 610 }, { "epoch": 0.27, "grad_norm": 0.9214160229218168, "learning_rate": 3.4337043285592226e-05, "loss": 0.1848, "step": 611 }, { "epoch": 0.27, "grad_norm": 0.9043672340655432, "learning_rate": 3.4317179219617185e-05, "loss": 0.1961, "step": 612 }, { "epoch": 0.27, "grad_norm": 0.845523030969567, "learning_rate": 3.429728614332563e-05, "loss": 0.1884, "step": 613 }, { "epoch": 0.27, "grad_norm": 0.8251503475059726, "learning_rate": 3.4277364097026086e-05, "loss": 0.1795, "step": 614 }, { "epoch": 0.27, "grad_norm": 0.9071464051476555, "learning_rate": 3.4257413121085796e-05, "loss": 0.1932, "step": 615 }, { "epoch": 0.27, "grad_norm": 0.8535124824518449, "learning_rate": 3.4237433255930594e-05, "loss": 0.1789, "step": 616 }, { "epoch": 0.27, "grad_norm": 0.8227535078008122, "learning_rate": 3.421742454204488e-05, "loss": 0.1738, "step": 617 }, { "epoch": 0.27, "grad_norm": 0.8632492128529344, "learning_rate": 3.419738701997148e-05, "loss": 0.1975, "step": 618 }, { "epoch": 0.27, "grad_norm": 0.8256026306702783, "learning_rate": 3.4177320730311635e-05, "loss": 0.1813, "step": 619 }, { "epoch": 0.27, "grad_norm": 0.8267008213196483, "learning_rate": 3.4157225713724816e-05, "loss": 0.1628, "step": 620 }, { "epoch": 0.27, "grad_norm": 0.8115819733185233, "learning_rate": 3.413710201092876e-05, "loss": 0.1671, "step": 621 }, { "epoch": 0.27, "grad_norm": 0.8239011830709978, "learning_rate": 3.411694966269929e-05, "loss": 0.1689, "step": 622 }, { "epoch": 0.27, "grad_norm": 0.777690330955787, "learning_rate": 3.4096768709870294e-05, "loss": 0.1662, "step": 623 }, { "epoch": 0.27, "grad_norm": 0.8780589109373628, "learning_rate": 3.4076559193333616e-05, "loss": 0.1746, "step": 624 }, { "epoch": 0.27, "grad_norm": 0.9174633826450884, "learning_rate": 3.405632115403898e-05, "loss": 0.1977, "step": 625 }, { "epoch": 0.27, "grad_norm": 0.8634308560226575, "learning_rate": 3.403605463299389e-05, "loss": 0.1895, "step": 626 }, { "epoch": 0.28, "grad_norm": 0.8931077516891043, "learning_rate": 3.401575967126357e-05, "loss": 0.1798, "step": 627 }, { "epoch": 0.28, "grad_norm": 0.8010424676369448, "learning_rate": 3.399543630997089e-05, "loss": 0.1477, "step": 628 }, { "epoch": 0.28, "grad_norm": 0.9239869575983687, "learning_rate": 3.3975084590296226e-05, "loss": 0.1847, "step": 629 }, { "epoch": 0.28, "grad_norm": 0.7747688010126524, "learning_rate": 3.395470455347745e-05, "loss": 0.176, "step": 630 }, { "epoch": 0.28, "grad_norm": 0.8432738408739845, "learning_rate": 3.39342962408098e-05, "loss": 0.1904, "step": 631 }, { "epoch": 0.28, "grad_norm": 0.8330663412267283, "learning_rate": 3.3913859693645806e-05, "loss": 0.1721, "step": 632 }, { "epoch": 0.28, "grad_norm": 0.9028114658982273, "learning_rate": 3.3893394953395207e-05, "loss": 0.1768, "step": 633 }, { "epoch": 0.28, "grad_norm": 0.936036079577417, "learning_rate": 3.387290206152488e-05, "loss": 0.1819, "step": 634 }, { "epoch": 0.28, "grad_norm": 0.7997923909454233, "learning_rate": 3.385238105955873e-05, "loss": 0.1617, "step": 635 }, { "epoch": 0.28, "grad_norm": 0.838153966677634, "learning_rate": 3.383183198907762e-05, "loss": 0.1757, "step": 636 }, { "epoch": 0.28, "grad_norm": 0.8511320303510185, "learning_rate": 3.3811254891719316e-05, "loss": 0.1542, "step": 637 }, { "epoch": 0.28, "grad_norm": 0.928234182544082, "learning_rate": 3.379064980917833e-05, "loss": 0.1839, "step": 638 }, { "epoch": 0.28, "grad_norm": 0.8644306591922478, "learning_rate": 3.377001678320591e-05, "loss": 0.173, "step": 639 }, { "epoch": 0.28, "grad_norm": 1.0989583935971114, "learning_rate": 3.374935585560991e-05, "loss": 0.2206, "step": 640 }, { "epoch": 0.28, "grad_norm": 0.855108629486809, "learning_rate": 3.3728667068254726e-05, "loss": 0.1838, "step": 641 }, { "epoch": 0.28, "grad_norm": 0.8880591323663043, "learning_rate": 3.370795046306121e-05, "loss": 0.1911, "step": 642 }, { "epoch": 0.28, "grad_norm": 0.877533956071781, "learning_rate": 3.3687206082006574e-05, "loss": 0.1651, "step": 643 }, { "epoch": 0.28, "grad_norm": 0.8884789225944658, "learning_rate": 3.36664339671243e-05, "loss": 0.1665, "step": 644 }, { "epoch": 0.28, "grad_norm": 0.7805994376890839, "learning_rate": 3.364563416050409e-05, "loss": 0.1692, "step": 645 }, { "epoch": 0.28, "grad_norm": 0.9238214472394257, "learning_rate": 3.362480670429174e-05, "loss": 0.1832, "step": 646 }, { "epoch": 0.28, "grad_norm": 0.8753496561332241, "learning_rate": 3.360395164068908e-05, "loss": 0.1972, "step": 647 }, { "epoch": 0.28, "grad_norm": 0.8634936041316233, "learning_rate": 3.3583069011953867e-05, "loss": 0.1887, "step": 648 }, { "epoch": 0.29, "grad_norm": 0.9033310018370924, "learning_rate": 3.3562158860399724e-05, "loss": 0.1943, "step": 649 }, { "epoch": 0.29, "grad_norm": 0.8596500303422075, "learning_rate": 3.354122122839604e-05, "loss": 0.185, "step": 650 }, { "epoch": 0.29, "grad_norm": 0.7448447993250906, "learning_rate": 3.352025615836788e-05, "loss": 0.1507, "step": 651 }, { "epoch": 0.29, "grad_norm": 0.8868896050893823, "learning_rate": 3.349926369279591e-05, "loss": 0.1632, "step": 652 }, { "epoch": 0.29, "grad_norm": 0.8939868653141144, "learning_rate": 3.347824387421632e-05, "loss": 0.1841, "step": 653 }, { "epoch": 0.29, "grad_norm": 0.8342787069126242, "learning_rate": 3.3457196745220696e-05, "loss": 0.1703, "step": 654 }, { "epoch": 0.29, "grad_norm": 0.9383196109041612, "learning_rate": 3.343612234845599e-05, "loss": 0.1924, "step": 655 }, { "epoch": 0.29, "grad_norm": 0.9063778491108919, "learning_rate": 3.341502072662438e-05, "loss": 0.1816, "step": 656 }, { "epoch": 0.29, "grad_norm": 0.8679331761030854, "learning_rate": 3.339389192248323e-05, "loss": 0.1935, "step": 657 }, { "epoch": 0.29, "grad_norm": 0.7709548098411078, "learning_rate": 3.337273597884498e-05, "loss": 0.1741, "step": 658 }, { "epoch": 0.29, "grad_norm": 1.0336136173752253, "learning_rate": 3.335155293857704e-05, "loss": 0.2197, "step": 659 }, { "epoch": 0.29, "grad_norm": 0.8769566933064409, "learning_rate": 3.333034284460175e-05, "loss": 0.2149, "step": 660 }, { "epoch": 0.29, "grad_norm": 0.7532764630738268, "learning_rate": 3.330910573989626e-05, "loss": 0.1657, "step": 661 }, { "epoch": 0.29, "grad_norm": 0.9623150139934566, "learning_rate": 3.3287841667492444e-05, "loss": 0.1962, "step": 662 }, { "epoch": 0.29, "grad_norm": 0.7773502292075494, "learning_rate": 3.326655067047684e-05, "loss": 0.1588, "step": 663 }, { "epoch": 0.29, "grad_norm": 0.9144693756940887, "learning_rate": 3.324523279199051e-05, "loss": 0.1655, "step": 664 }, { "epoch": 0.29, "grad_norm": 0.9959496487372362, "learning_rate": 3.322388807522901e-05, "loss": 0.1863, "step": 665 }, { "epoch": 0.29, "grad_norm": 0.8241620157250089, "learning_rate": 3.320251656344226e-05, "loss": 0.1584, "step": 666 }, { "epoch": 0.29, "grad_norm": 0.7951753104547714, "learning_rate": 3.318111829993449e-05, "loss": 0.1368, "step": 667 }, { "epoch": 0.29, "grad_norm": 0.8740572619153356, "learning_rate": 3.3159693328064135e-05, "loss": 0.1614, "step": 668 }, { "epoch": 0.29, "grad_norm": 0.7250793660230017, "learning_rate": 3.313824169124374e-05, "loss": 0.171, "step": 669 }, { "epoch": 0.29, "grad_norm": 0.8324659044999058, "learning_rate": 3.3116763432939864e-05, "loss": 0.1539, "step": 670 }, { "epoch": 0.29, "grad_norm": 0.7799322841056281, "learning_rate": 3.3095258596673054e-05, "loss": 0.1704, "step": 671 }, { "epoch": 0.3, "grad_norm": 0.7818016774706777, "learning_rate": 3.307372722601766e-05, "loss": 0.1581, "step": 672 }, { "epoch": 0.3, "grad_norm": 0.8141255623376077, "learning_rate": 3.305216936460184e-05, "loss": 0.1511, "step": 673 }, { "epoch": 0.3, "grad_norm": 1.003367832536999, "learning_rate": 3.3030585056107395e-05, "loss": 0.2201, "step": 674 }, { "epoch": 0.3, "grad_norm": 0.8126768873308454, "learning_rate": 3.300897434426973e-05, "loss": 0.1805, "step": 675 }, { "epoch": 0.3, "grad_norm": 0.7670482718932947, "learning_rate": 3.298733727287775e-05, "loss": 0.1635, "step": 676 }, { "epoch": 0.3, "grad_norm": 0.7531733137280455, "learning_rate": 3.296567388577378e-05, "loss": 0.1669, "step": 677 }, { "epoch": 0.3, "grad_norm": 0.7680295437007963, "learning_rate": 3.2943984226853446e-05, "loss": 0.1489, "step": 678 }, { "epoch": 0.3, "grad_norm": 0.9280006106987128, "learning_rate": 3.292226834006563e-05, "loss": 0.1826, "step": 679 }, { "epoch": 0.3, "grad_norm": 0.9777549482399351, "learning_rate": 3.2900526269412346e-05, "loss": 0.1798, "step": 680 }, { "epoch": 0.3, "grad_norm": 0.7860890251901782, "learning_rate": 3.2878758058948654e-05, "loss": 0.1803, "step": 681 }, { "epoch": 0.3, "grad_norm": 0.8440248606826026, "learning_rate": 3.28569637527826e-05, "loss": 0.1549, "step": 682 }, { "epoch": 0.3, "grad_norm": 0.7582718941319468, "learning_rate": 3.2835143395075104e-05, "loss": 0.1467, "step": 683 }, { "epoch": 0.3, "grad_norm": 0.8127636115049969, "learning_rate": 3.281329703003986e-05, "loss": 0.1712, "step": 684 }, { "epoch": 0.3, "grad_norm": 0.7942888143203024, "learning_rate": 3.279142470194327e-05, "loss": 0.1638, "step": 685 }, { "epoch": 0.3, "grad_norm": 0.8982682441835546, "learning_rate": 3.276952645510433e-05, "loss": 0.1823, "step": 686 }, { "epoch": 0.3, "grad_norm": 0.8789540934786302, "learning_rate": 3.274760233389457e-05, "loss": 0.1952, "step": 687 }, { "epoch": 0.3, "grad_norm": 0.7048044281098408, "learning_rate": 3.272565238273795e-05, "loss": 0.1344, "step": 688 }, { "epoch": 0.3, "grad_norm": 0.8803039965288278, "learning_rate": 3.270367664611076e-05, "loss": 0.1708, "step": 689 }, { "epoch": 0.3, "grad_norm": 0.7607782671186942, "learning_rate": 3.268167516854153e-05, "loss": 0.1537, "step": 690 }, { "epoch": 0.3, "grad_norm": 0.8208625765656624, "learning_rate": 3.265964799461097e-05, "loss": 0.1755, "step": 691 }, { "epoch": 0.3, "grad_norm": 0.7864386111853919, "learning_rate": 3.2637595168951835e-05, "loss": 0.1656, "step": 692 }, { "epoch": 0.3, "grad_norm": 0.7913649552954614, "learning_rate": 3.2615516736248875e-05, "loss": 0.1724, "step": 693 }, { "epoch": 0.3, "grad_norm": 0.8472965725513705, "learning_rate": 3.25934127412387e-05, "loss": 0.1779, "step": 694 }, { "epoch": 0.31, "grad_norm": 0.7119593560105105, "learning_rate": 3.2571283228709756e-05, "loss": 0.145, "step": 695 }, { "epoch": 0.31, "grad_norm": 0.8530111677080492, "learning_rate": 3.254912824350215e-05, "loss": 0.1697, "step": 696 }, { "epoch": 0.31, "grad_norm": 0.7472697471748235, "learning_rate": 3.252694783050765e-05, "loss": 0.1427, "step": 697 }, { "epoch": 0.31, "grad_norm": 0.85207428558197, "learning_rate": 3.25047420346695e-05, "loss": 0.172, "step": 698 }, { "epoch": 0.31, "grad_norm": 0.8074700845426631, "learning_rate": 3.2482510900982414e-05, "loss": 0.1531, "step": 699 }, { "epoch": 0.31, "grad_norm": 0.8530323746810717, "learning_rate": 3.246025447449242e-05, "loss": 0.1576, "step": 700 }, { "epoch": 0.31, "grad_norm": 0.8108663274149222, "learning_rate": 3.243797280029682e-05, "loss": 0.1505, "step": 701 }, { "epoch": 0.31, "grad_norm": 0.805226562083107, "learning_rate": 3.241566592354405e-05, "loss": 0.1525, "step": 702 }, { "epoch": 0.31, "grad_norm": 0.8589037209845359, "learning_rate": 3.239333388943363e-05, "loss": 0.1689, "step": 703 }, { "epoch": 0.31, "grad_norm": 0.7649383190321135, "learning_rate": 3.2370976743216056e-05, "loss": 0.1393, "step": 704 }, { "epoch": 0.31, "grad_norm": 0.7951053716896215, "learning_rate": 3.23485945301927e-05, "loss": 0.1594, "step": 705 }, { "epoch": 0.31, "grad_norm": 0.7813154193191056, "learning_rate": 3.2326187295715716e-05, "loss": 0.1393, "step": 706 }, { "epoch": 0.31, "grad_norm": 1.036795990861394, "learning_rate": 3.230375508518798e-05, "loss": 0.1797, "step": 707 }, { "epoch": 0.31, "grad_norm": 0.8517732971785257, "learning_rate": 3.228129794406296e-05, "loss": 0.1742, "step": 708 }, { "epoch": 0.31, "grad_norm": 0.8998548523101574, "learning_rate": 3.2258815917844656e-05, "loss": 0.1742, "step": 709 }, { "epoch": 0.31, "grad_norm": 0.7688202259893332, "learning_rate": 3.2236309052087464e-05, "loss": 0.15, "step": 710 }, { "epoch": 0.31, "grad_norm": 0.7189473834807215, "learning_rate": 3.221377739239615e-05, "loss": 0.1335, "step": 711 }, { "epoch": 0.31, "grad_norm": 0.7666260539230648, "learning_rate": 3.2191220984425675e-05, "loss": 0.1457, "step": 712 }, { "epoch": 0.31, "grad_norm": 0.8950432057847292, "learning_rate": 3.2168639873881184e-05, "loss": 0.1768, "step": 713 }, { "epoch": 0.31, "grad_norm": 0.7427837185545626, "learning_rate": 3.2146034106517856e-05, "loss": 0.1163, "step": 714 }, { "epoch": 0.31, "grad_norm": 0.8843322433542129, "learning_rate": 3.212340372814084e-05, "loss": 0.1521, "step": 715 }, { "epoch": 0.31, "grad_norm": 0.9735488472778555, "learning_rate": 3.210074878460514e-05, "loss": 0.1593, "step": 716 }, { "epoch": 0.31, "grad_norm": 0.9517240390308087, "learning_rate": 3.207806932181555e-05, "loss": 0.16, "step": 717 }, { "epoch": 0.32, "grad_norm": 0.8113180146117721, "learning_rate": 3.2055365385726535e-05, "loss": 0.1504, "step": 718 }, { "epoch": 0.32, "grad_norm": 0.7888817532944971, "learning_rate": 3.2032637022342164e-05, "loss": 0.1569, "step": 719 }, { "epoch": 0.32, "grad_norm": 0.8848932102666681, "learning_rate": 3.2009884277716e-05, "loss": 0.1889, "step": 720 }, { "epoch": 0.32, "grad_norm": 0.7770112443132435, "learning_rate": 3.198710719795098e-05, "loss": 0.1473, "step": 721 }, { "epoch": 0.32, "grad_norm": 0.7808600406157274, "learning_rate": 3.1964305829199375e-05, "loss": 0.1606, "step": 722 }, { "epoch": 0.32, "grad_norm": 0.7533787675747198, "learning_rate": 3.1941480217662684e-05, "loss": 0.1506, "step": 723 }, { "epoch": 0.32, "grad_norm": 0.7285312108885933, "learning_rate": 3.191863040959151e-05, "loss": 0.1427, "step": 724 }, { "epoch": 0.32, "grad_norm": 0.7977518978927162, "learning_rate": 3.189575645128548e-05, "loss": 0.1642, "step": 725 }, { "epoch": 0.32, "grad_norm": 0.8286319476550068, "learning_rate": 3.1872858389093166e-05, "loss": 0.149, "step": 726 }, { "epoch": 0.32, "grad_norm": 0.7830163062213732, "learning_rate": 3.184993626941198e-05, "loss": 0.1348, "step": 727 }, { "epoch": 0.32, "grad_norm": 1.1129168533121283, "learning_rate": 3.182699013868807e-05, "loss": 0.1727, "step": 728 }, { "epoch": 0.32, "grad_norm": 0.9018938007893289, "learning_rate": 3.1804020043416256e-05, "loss": 0.1508, "step": 729 }, { "epoch": 0.32, "grad_norm": 0.8716560567413127, "learning_rate": 3.17810260301399e-05, "loss": 0.1533, "step": 730 }, { "epoch": 0.32, "grad_norm": 0.8637757071990735, "learning_rate": 3.175800814545084e-05, "loss": 0.1802, "step": 731 }, { "epoch": 0.32, "grad_norm": 0.8189126923221619, "learning_rate": 3.173496643598927e-05, "loss": 0.1348, "step": 732 }, { "epoch": 0.32, "grad_norm": 0.8638432257285374, "learning_rate": 3.171190094844367e-05, "loss": 0.1778, "step": 733 }, { "epoch": 0.32, "grad_norm": 0.8045935344460922, "learning_rate": 3.168881172955069e-05, "loss": 0.1521, "step": 734 }, { "epoch": 0.32, "grad_norm": 0.7732737204030372, "learning_rate": 3.1665698826095084e-05, "loss": 0.1469, "step": 735 }, { "epoch": 0.32, "grad_norm": 0.9353430093823056, "learning_rate": 3.164256228490958e-05, "loss": 0.1555, "step": 736 }, { "epoch": 0.32, "grad_norm": 0.7613936684248256, "learning_rate": 3.16194021528748e-05, "loss": 0.1447, "step": 737 }, { "epoch": 0.32, "grad_norm": 0.8484292087887458, "learning_rate": 3.1596218476919194e-05, "loss": 0.1498, "step": 738 }, { "epoch": 0.32, "grad_norm": 0.8891837521752443, "learning_rate": 3.157301130401888e-05, "loss": 0.1443, "step": 739 }, { "epoch": 0.33, "grad_norm": 0.855410569533594, "learning_rate": 3.1549780681197624e-05, "loss": 0.1461, "step": 740 }, { "epoch": 0.33, "grad_norm": 0.8307208635798177, "learning_rate": 3.152652665552668e-05, "loss": 0.1513, "step": 741 }, { "epoch": 0.33, "grad_norm": 0.9114330651550024, "learning_rate": 3.1503249274124725e-05, "loss": 0.1611, "step": 742 }, { "epoch": 0.33, "grad_norm": 0.9309984148935987, "learning_rate": 3.1479948584157784e-05, "loss": 0.1806, "step": 743 }, { "epoch": 0.33, "grad_norm": 0.8076009137544757, "learning_rate": 3.145662463283908e-05, "loss": 0.1679, "step": 744 }, { "epoch": 0.33, "grad_norm": 0.8041275075822677, "learning_rate": 3.1433277467429e-05, "loss": 0.1414, "step": 745 }, { "epoch": 0.33, "grad_norm": 0.7351171517591297, "learning_rate": 3.1409907135234934e-05, "loss": 0.1322, "step": 746 }, { "epoch": 0.33, "grad_norm": 0.7375565458700186, "learning_rate": 3.138651368361126e-05, "loss": 0.1425, "step": 747 }, { "epoch": 0.33, "grad_norm": 0.7972956276555913, "learning_rate": 3.1363097159959146e-05, "loss": 0.1477, "step": 748 }, { "epoch": 0.33, "grad_norm": 0.80248644636123, "learning_rate": 3.1339657611726554e-05, "loss": 0.1586, "step": 749 }, { "epoch": 0.33, "grad_norm": 0.7364574288484522, "learning_rate": 3.13161950864081e-05, "loss": 0.1409, "step": 750 }, { "epoch": 0.33, "grad_norm": 0.8199432631961112, "learning_rate": 3.12927096315449e-05, "loss": 0.1479, "step": 751 }, { "epoch": 0.33, "grad_norm": 0.7549162113356621, "learning_rate": 3.126920129472462e-05, "loss": 0.1256, "step": 752 }, { "epoch": 0.33, "grad_norm": 0.789232292414065, "learning_rate": 3.1245670123581215e-05, "loss": 0.1454, "step": 753 }, { "epoch": 0.33, "grad_norm": 0.7642248156799555, "learning_rate": 3.122211616579495e-05, "loss": 0.1324, "step": 754 }, { "epoch": 0.33, "grad_norm": 0.6690622937461824, "learning_rate": 3.119853946909224e-05, "loss": 0.1202, "step": 755 }, { "epoch": 0.33, "grad_norm": 0.7066089359429955, "learning_rate": 3.11749400812456e-05, "loss": 0.132, "step": 756 }, { "epoch": 0.33, "grad_norm": 0.6601409233165001, "learning_rate": 3.1151318050073495e-05, "loss": 0.1262, "step": 757 }, { "epoch": 0.33, "grad_norm": 0.8159214752505441, "learning_rate": 3.112767342344028e-05, "loss": 0.1552, "step": 758 }, { "epoch": 0.33, "grad_norm": 0.7359463539319858, "learning_rate": 3.1104006249256116e-05, "loss": 0.1372, "step": 759 }, { "epoch": 0.33, "grad_norm": 0.713538891492213, "learning_rate": 3.108031657547681e-05, "loss": 0.1264, "step": 760 }, { "epoch": 0.33, "grad_norm": 0.78657593174205, "learning_rate": 3.1056604450103795e-05, "loss": 0.1552, "step": 761 }, { "epoch": 0.33, "grad_norm": 0.6902611943697757, "learning_rate": 3.103286992118399e-05, "loss": 0.1291, "step": 762 }, { "epoch": 0.34, "grad_norm": 0.7483528646824505, "learning_rate": 3.1009113036809683e-05, "loss": 0.1358, "step": 763 }, { "epoch": 0.34, "grad_norm": 0.7321755770772028, "learning_rate": 3.0985333845118504e-05, "loss": 0.152, "step": 764 }, { "epoch": 0.34, "grad_norm": 0.7913849521795088, "learning_rate": 3.0961532394293247e-05, "loss": 0.1336, "step": 765 }, { "epoch": 0.34, "grad_norm": 0.8415377543338738, "learning_rate": 3.093770873256182e-05, "loss": 0.1508, "step": 766 }, { "epoch": 0.34, "grad_norm": 0.7934337436486444, "learning_rate": 3.091386290819714e-05, "loss": 0.1633, "step": 767 }, { "epoch": 0.34, "grad_norm": 0.9167651788200313, "learning_rate": 3.0889994969517036e-05, "loss": 0.1647, "step": 768 }, { "epoch": 0.34, "grad_norm": 0.7443318759524127, "learning_rate": 3.086610496488412e-05, "loss": 0.1308, "step": 769 }, { "epoch": 0.34, "grad_norm": 0.8029143788282512, "learning_rate": 3.084219294270575e-05, "loss": 0.1434, "step": 770 }, { "epoch": 0.34, "grad_norm": 0.7528049944625801, "learning_rate": 3.0818258951433886e-05, "loss": 0.14, "step": 771 }, { "epoch": 0.34, "grad_norm": 0.747303167218664, "learning_rate": 3.0794303039564976e-05, "loss": 0.1513, "step": 772 }, { "epoch": 0.34, "grad_norm": 0.7917166158746384, "learning_rate": 3.077032525563993e-05, "loss": 0.1404, "step": 773 }, { "epoch": 0.34, "grad_norm": 0.8634192930297651, "learning_rate": 3.074632564824395e-05, "loss": 0.1309, "step": 774 }, { "epoch": 0.34, "grad_norm": 0.6454482981706743, "learning_rate": 3.072230426600646e-05, "loss": 0.1246, "step": 775 }, { "epoch": 0.34, "grad_norm": 0.8566554995809639, "learning_rate": 3.0698261157600994e-05, "loss": 0.1659, "step": 776 }, { "epoch": 0.34, "grad_norm": 0.8141364344082779, "learning_rate": 3.067419637174514e-05, "loss": 0.1385, "step": 777 }, { "epoch": 0.34, "grad_norm": 0.7537719132510283, "learning_rate": 3.065010995720038e-05, "loss": 0.1507, "step": 778 }, { "epoch": 0.34, "grad_norm": 0.6849172586394466, "learning_rate": 3.0626001962772055e-05, "loss": 0.1401, "step": 779 }, { "epoch": 0.34, "grad_norm": 0.811479771583481, "learning_rate": 3.0601872437309185e-05, "loss": 0.1381, "step": 780 }, { "epoch": 0.34, "grad_norm": 0.8430149620760162, "learning_rate": 3.057772142970446e-05, "loss": 0.1436, "step": 781 }, { "epoch": 0.34, "grad_norm": 0.8202310430864365, "learning_rate": 3.055354898889406e-05, "loss": 0.1535, "step": 782 }, { "epoch": 0.34, "grad_norm": 0.8394376970793199, "learning_rate": 3.052935516385765e-05, "loss": 0.1643, "step": 783 }, { "epoch": 0.34, "grad_norm": 0.719912501091454, "learning_rate": 3.050514000361817e-05, "loss": 0.1404, "step": 784 }, { "epoch": 0.34, "grad_norm": 0.779314542871267, "learning_rate": 3.0480903557241813e-05, "loss": 0.1632, "step": 785 }, { "epoch": 0.35, "grad_norm": 0.7677215364546679, "learning_rate": 3.045664587383791e-05, "loss": 0.1385, "step": 786 }, { "epoch": 0.35, "grad_norm": 0.7584765562760926, "learning_rate": 3.0432367002558805e-05, "loss": 0.146, "step": 787 }, { "epoch": 0.35, "grad_norm": 0.8284478907615218, "learning_rate": 3.040806699259979e-05, "loss": 0.1424, "step": 788 }, { "epoch": 0.35, "grad_norm": 0.7694274909848291, "learning_rate": 3.0383745893199e-05, "loss": 0.1369, "step": 789 }, { "epoch": 0.35, "grad_norm": 0.9038402483820561, "learning_rate": 3.0359403753637252e-05, "loss": 0.1734, "step": 790 }, { "epoch": 0.35, "grad_norm": 0.7739693710614837, "learning_rate": 3.0335040623238067e-05, "loss": 0.1672, "step": 791 }, { "epoch": 0.35, "grad_norm": 0.7688615469911471, "learning_rate": 3.031065655136744e-05, "loss": 0.1453, "step": 792 }, { "epoch": 0.35, "grad_norm": 0.7248734561057111, "learning_rate": 3.028625158743383e-05, "loss": 0.1303, "step": 793 }, { "epoch": 0.35, "grad_norm": 0.7247339434626443, "learning_rate": 3.026182578088802e-05, "loss": 0.14, "step": 794 }, { "epoch": 0.35, "grad_norm": 0.7682610111856408, "learning_rate": 3.0237379181223026e-05, "loss": 0.1558, "step": 795 }, { "epoch": 0.35, "grad_norm": 0.646923688079038, "learning_rate": 3.0212911837973994e-05, "loss": 0.1377, "step": 796 }, { "epoch": 0.35, "grad_norm": 0.7995169358335456, "learning_rate": 3.01884238007181e-05, "loss": 0.1401, "step": 797 }, { "epoch": 0.35, "grad_norm": 0.7581022829330121, "learning_rate": 3.016391511907447e-05, "loss": 0.1346, "step": 798 }, { "epoch": 0.35, "grad_norm": 0.8803800644843321, "learning_rate": 3.0139385842704033e-05, "loss": 0.1589, "step": 799 }, { "epoch": 0.35, "grad_norm": 0.8624498670631088, "learning_rate": 3.0114836021309465e-05, "loss": 0.158, "step": 800 }, { "epoch": 0.35, "grad_norm": 0.8444801985619583, "learning_rate": 3.0090265704635074e-05, "loss": 0.1595, "step": 801 }, { "epoch": 0.35, "grad_norm": 0.6836386311647834, "learning_rate": 3.0065674942466674e-05, "loss": 0.1169, "step": 802 }, { "epoch": 0.35, "grad_norm": 0.7374715249981557, "learning_rate": 3.0041063784631547e-05, "loss": 0.1309, "step": 803 }, { "epoch": 0.35, "grad_norm": 0.7126235670722028, "learning_rate": 3.001643228099826e-05, "loss": 0.1143, "step": 804 }, { "epoch": 0.35, "grad_norm": 0.8231158719610282, "learning_rate": 2.9991780481476636e-05, "loss": 0.1543, "step": 805 }, { "epoch": 0.35, "grad_norm": 0.7054179210242355, "learning_rate": 2.9967108436017604e-05, "loss": 0.127, "step": 806 }, { "epoch": 0.35, "grad_norm": 0.8315210345114313, "learning_rate": 2.9942416194613128e-05, "loss": 0.1516, "step": 807 }, { "epoch": 0.35, "grad_norm": 0.7327079112688298, "learning_rate": 2.9917703807296092e-05, "loss": 0.1298, "step": 808 }, { "epoch": 0.36, "grad_norm": 0.8540197843162837, "learning_rate": 2.989297132414019e-05, "loss": 0.1506, "step": 809 }, { "epoch": 0.36, "grad_norm": 0.7768032552746177, "learning_rate": 2.986821879525986e-05, "loss": 0.1442, "step": 810 }, { "epoch": 0.36, "grad_norm": 0.7570870063019419, "learning_rate": 2.984344627081012e-05, "loss": 0.1206, "step": 811 }, { "epoch": 0.36, "grad_norm": 0.7718909242663053, "learning_rate": 2.9818653800986547e-05, "loss": 0.1373, "step": 812 }, { "epoch": 0.36, "grad_norm": 0.6636661019615838, "learning_rate": 2.97938414360251e-05, "loss": 0.1271, "step": 813 }, { "epoch": 0.36, "grad_norm": 0.7018454671337928, "learning_rate": 2.9769009226202063e-05, "loss": 0.1075, "step": 814 }, { "epoch": 0.36, "grad_norm": 0.7599196094107776, "learning_rate": 2.9744157221833932e-05, "loss": 0.1219, "step": 815 }, { "epoch": 0.36, "grad_norm": 0.6972674415201412, "learning_rate": 2.9719285473277316e-05, "loss": 0.1363, "step": 816 }, { "epoch": 0.36, "grad_norm": 0.7929863387539015, "learning_rate": 2.9694394030928814e-05, "loss": 0.1373, "step": 817 }, { "epoch": 0.36, "grad_norm": 0.834398367908158, "learning_rate": 2.9669482945224956e-05, "loss": 0.1363, "step": 818 }, { "epoch": 0.36, "grad_norm": 0.6571138690617476, "learning_rate": 2.9644552266642038e-05, "loss": 0.1099, "step": 819 }, { "epoch": 0.36, "grad_norm": 0.7372468433047368, "learning_rate": 2.9619602045696098e-05, "loss": 0.1208, "step": 820 }, { "epoch": 0.36, "grad_norm": 0.7640394024000332, "learning_rate": 2.959463233294274e-05, "loss": 0.1206, "step": 821 }, { "epoch": 0.36, "grad_norm": 0.6839667038565588, "learning_rate": 2.9569643178977076e-05, "loss": 0.1038, "step": 822 }, { "epoch": 0.36, "grad_norm": 0.9116877494189791, "learning_rate": 2.9544634634433618e-05, "loss": 0.1612, "step": 823 }, { "epoch": 0.36, "grad_norm": 0.8082061059527138, "learning_rate": 2.951960674998615e-05, "loss": 0.1477, "step": 824 }, { "epoch": 0.36, "grad_norm": 0.818057129611401, "learning_rate": 2.949455957634766e-05, "loss": 0.1346, "step": 825 }, { "epoch": 0.36, "grad_norm": 0.6291183919470518, "learning_rate": 2.946949316427021e-05, "loss": 0.1176, "step": 826 }, { "epoch": 0.36, "grad_norm": 0.7277561759857029, "learning_rate": 2.9444407564544852e-05, "loss": 0.1153, "step": 827 }, { "epoch": 0.36, "grad_norm": 0.7424433618515311, "learning_rate": 2.9419302828001513e-05, "loss": 0.1374, "step": 828 }, { "epoch": 0.36, "grad_norm": 0.720834942266473, "learning_rate": 2.9394179005508895e-05, "loss": 0.1285, "step": 829 }, { "epoch": 0.36, "grad_norm": 0.8145851010029403, "learning_rate": 2.936903614797438e-05, "loss": 0.1522, "step": 830 }, { "epoch": 0.37, "grad_norm": 0.81038927131642, "learning_rate": 2.9343874306343907e-05, "loss": 0.1431, "step": 831 }, { "epoch": 0.37, "grad_norm": 0.7735480797835099, "learning_rate": 2.9318693531601905e-05, "loss": 0.1236, "step": 832 }, { "epoch": 0.37, "grad_norm": 0.7683574030685526, "learning_rate": 2.9293493874771136e-05, "loss": 0.1308, "step": 833 }, { "epoch": 0.37, "grad_norm": 0.7813556218274641, "learning_rate": 2.926827538691264e-05, "loss": 0.1435, "step": 834 }, { "epoch": 0.37, "grad_norm": 0.7423966765520504, "learning_rate": 2.924303811912562e-05, "loss": 0.1311, "step": 835 }, { "epoch": 0.37, "grad_norm": 0.6694957612468274, "learning_rate": 2.921778212254731e-05, "loss": 0.1095, "step": 836 }, { "epoch": 0.37, "grad_norm": 0.7334214370717455, "learning_rate": 2.9192507448352917e-05, "loss": 0.1432, "step": 837 }, { "epoch": 0.37, "grad_norm": 0.7515627553705428, "learning_rate": 2.9167214147755478e-05, "loss": 0.1272, "step": 838 }, { "epoch": 0.37, "grad_norm": 0.9109592415519928, "learning_rate": 2.9141902272005774e-05, "loss": 0.1554, "step": 839 }, { "epoch": 0.37, "grad_norm": 0.6598695888785192, "learning_rate": 2.911657187239223e-05, "loss": 0.1396, "step": 840 }, { "epoch": 0.37, "grad_norm": 0.8282125174863597, "learning_rate": 2.90912230002408e-05, "loss": 0.118, "step": 841 }, { "epoch": 0.37, "grad_norm": 0.6680308753676077, "learning_rate": 2.906585570691488e-05, "loss": 0.1212, "step": 842 }, { "epoch": 0.37, "grad_norm": 0.6459468900665176, "learning_rate": 2.904047004381516e-05, "loss": 0.116, "step": 843 }, { "epoch": 0.37, "grad_norm": 0.6488813466860507, "learning_rate": 2.90150660623796e-05, "loss": 0.1224, "step": 844 }, { "epoch": 0.37, "grad_norm": 0.8029709441865217, "learning_rate": 2.8989643814083232e-05, "loss": 0.1262, "step": 845 }, { "epoch": 0.37, "grad_norm": 0.7177592788649255, "learning_rate": 2.8964203350438135e-05, "loss": 0.1372, "step": 846 }, { "epoch": 0.37, "grad_norm": 0.7422093036862062, "learning_rate": 2.8938744722993278e-05, "loss": 0.1506, "step": 847 }, { "epoch": 0.37, "grad_norm": 0.7694927535134141, "learning_rate": 2.8913267983334436e-05, "loss": 0.1205, "step": 848 }, { "epoch": 0.37, "grad_norm": 0.7552628245371327, "learning_rate": 2.8887773183084092e-05, "loss": 0.1164, "step": 849 }, { "epoch": 0.37, "grad_norm": 0.6471595120053745, "learning_rate": 2.886226037390132e-05, "loss": 0.1148, "step": 850 }, { "epoch": 0.37, "grad_norm": 0.7207758590261785, "learning_rate": 2.8836729607481676e-05, "loss": 0.1289, "step": 851 }, { "epoch": 0.37, "grad_norm": 0.7020792920442479, "learning_rate": 2.881118093555713e-05, "loss": 0.1063, "step": 852 }, { "epoch": 0.37, "grad_norm": 0.6452344122093477, "learning_rate": 2.8785614409895895e-05, "loss": 0.1174, "step": 853 }, { "epoch": 0.38, "grad_norm": 0.8085626421118355, "learning_rate": 2.8760030082302398e-05, "loss": 0.1438, "step": 854 }, { "epoch": 0.38, "grad_norm": 0.777996804887835, "learning_rate": 2.8734428004617103e-05, "loss": 0.1249, "step": 855 }, { "epoch": 0.38, "grad_norm": 0.6389715442425666, "learning_rate": 2.8708808228716464e-05, "loss": 0.12, "step": 856 }, { "epoch": 0.38, "grad_norm": 0.6935095825825918, "learning_rate": 2.868317080651278e-05, "loss": 0.11, "step": 857 }, { "epoch": 0.38, "grad_norm": 0.7829672822194377, "learning_rate": 2.865751578995413e-05, "loss": 0.1485, "step": 858 }, { "epoch": 0.38, "grad_norm": 0.7669515211856608, "learning_rate": 2.8631843231024215e-05, "loss": 0.1363, "step": 859 }, { "epoch": 0.38, "grad_norm": 0.6497163226920917, "learning_rate": 2.8606153181742295e-05, "loss": 0.124, "step": 860 }, { "epoch": 0.38, "grad_norm": 0.7477111461420333, "learning_rate": 2.858044569416308e-05, "loss": 0.1255, "step": 861 }, { "epoch": 0.38, "grad_norm": 0.6516835646824686, "learning_rate": 2.8554720820376594e-05, "loss": 0.1029, "step": 862 }, { "epoch": 0.38, "grad_norm": 0.7483747675015885, "learning_rate": 2.8528978612508095e-05, "loss": 0.1379, "step": 863 }, { "epoch": 0.38, "grad_norm": 0.7538550166772345, "learning_rate": 2.8503219122717988e-05, "loss": 0.1163, "step": 864 }, { "epoch": 0.38, "grad_norm": 0.7868341385925391, "learning_rate": 2.8477442403201653e-05, "loss": 0.1183, "step": 865 }, { "epoch": 0.38, "grad_norm": 0.750951997233645, "learning_rate": 2.8451648506189432e-05, "loss": 0.121, "step": 866 }, { "epoch": 0.38, "grad_norm": 0.6821493092504979, "learning_rate": 2.8425837483946422e-05, "loss": 0.1113, "step": 867 }, { "epoch": 0.38, "grad_norm": 0.8199113500754667, "learning_rate": 2.8400009388772463e-05, "loss": 0.1419, "step": 868 }, { "epoch": 0.38, "grad_norm": 0.7881118032755761, "learning_rate": 2.8374164273001966e-05, "loss": 0.1369, "step": 869 }, { "epoch": 0.38, "grad_norm": 0.741292554460466, "learning_rate": 2.834830218900383e-05, "loss": 0.1089, "step": 870 }, { "epoch": 0.38, "grad_norm": 0.7126504153711384, "learning_rate": 2.8322423189181347e-05, "loss": 0.1388, "step": 871 }, { "epoch": 0.38, "grad_norm": 0.6718269818911939, "learning_rate": 2.8296527325972078e-05, "loss": 0.1169, "step": 872 }, { "epoch": 0.38, "grad_norm": 0.6647092698894337, "learning_rate": 2.8270614651847755e-05, "loss": 0.1174, "step": 873 }, { "epoch": 0.38, "grad_norm": 0.7853268110714692, "learning_rate": 2.824468521931417e-05, "loss": 0.1554, "step": 874 }, { "epoch": 0.38, "grad_norm": 0.7291078589203538, "learning_rate": 2.8218739080911072e-05, "loss": 0.1471, "step": 875 }, { "epoch": 0.38, "grad_norm": 0.8219140177700853, "learning_rate": 2.8192776289212078e-05, "loss": 0.1235, "step": 876 }, { "epoch": 0.39, "grad_norm": 0.753535124989672, "learning_rate": 2.8166796896824514e-05, "loss": 0.1461, "step": 877 }, { "epoch": 0.39, "grad_norm": 0.6674885501665359, "learning_rate": 2.8140800956389383e-05, "loss": 0.127, "step": 878 }, { "epoch": 0.39, "grad_norm": 0.6346260269424218, "learning_rate": 2.8114788520581192e-05, "loss": 0.1129, "step": 879 }, { "epoch": 0.39, "grad_norm": 0.7160031448167328, "learning_rate": 2.8088759642107873e-05, "loss": 0.1246, "step": 880 }, { "epoch": 0.39, "grad_norm": 0.7171430453483147, "learning_rate": 2.8062714373710692e-05, "loss": 0.1261, "step": 881 }, { "epoch": 0.39, "grad_norm": 0.6122144909488045, "learning_rate": 2.80366527681641e-05, "loss": 0.1051, "step": 882 }, { "epoch": 0.39, "grad_norm": 0.755152014024513, "learning_rate": 2.801057487827568e-05, "loss": 0.1302, "step": 883 }, { "epoch": 0.39, "grad_norm": 0.6795281848325211, "learning_rate": 2.7984480756885987e-05, "loss": 0.1212, "step": 884 }, { "epoch": 0.39, "grad_norm": 0.7171913745251882, "learning_rate": 2.795837045686848e-05, "loss": 0.1356, "step": 885 }, { "epoch": 0.39, "grad_norm": 0.7017976099286674, "learning_rate": 2.79322440311294e-05, "loss": 0.1122, "step": 886 }, { "epoch": 0.39, "grad_norm": 0.7817640520717055, "learning_rate": 2.7906101532607642e-05, "loss": 0.1209, "step": 887 }, { "epoch": 0.39, "grad_norm": 0.7967063126880966, "learning_rate": 2.7879943014274712e-05, "loss": 0.1217, "step": 888 }, { "epoch": 0.39, "grad_norm": 0.7113535575363211, "learning_rate": 2.785376852913452e-05, "loss": 0.1269, "step": 889 }, { "epoch": 0.39, "grad_norm": 0.6435408872939462, "learning_rate": 2.7827578130223375e-05, "loss": 0.1083, "step": 890 }, { "epoch": 0.39, "grad_norm": 0.6967028743135424, "learning_rate": 2.780137187060981e-05, "loss": 0.1212, "step": 891 }, { "epoch": 0.39, "grad_norm": 0.779965145237099, "learning_rate": 2.7775149803394503e-05, "loss": 0.1292, "step": 892 }, { "epoch": 0.39, "grad_norm": 0.6872085907368043, "learning_rate": 2.7748911981710157e-05, "loss": 0.1046, "step": 893 }, { "epoch": 0.39, "grad_norm": 0.736336219803814, "learning_rate": 2.7722658458721402e-05, "loss": 0.1361, "step": 894 }, { "epoch": 0.39, "grad_norm": 0.6899188794604603, "learning_rate": 2.7696389287624675e-05, "loss": 0.1182, "step": 895 }, { "epoch": 0.39, "grad_norm": 0.7300089240508489, "learning_rate": 2.7670104521648133e-05, "loss": 0.1213, "step": 896 }, { "epoch": 0.39, "grad_norm": 0.6335757562267121, "learning_rate": 2.7643804214051523e-05, "loss": 0.1023, "step": 897 }, { "epoch": 0.39, "grad_norm": 0.7491012592497073, "learning_rate": 2.76174884181261e-05, "loss": 0.1247, "step": 898 }, { "epoch": 0.39, "grad_norm": 0.7029525545369714, "learning_rate": 2.759115718719446e-05, "loss": 0.1101, "step": 899 }, { "epoch": 0.4, "grad_norm": 0.625744709753031, "learning_rate": 2.756481057461052e-05, "loss": 0.1051, "step": 900 }, { "epoch": 0.4, "grad_norm": 0.6477576449070811, "learning_rate": 2.753844863375935e-05, "loss": 0.1143, "step": 901 }, { "epoch": 0.4, "grad_norm": 0.8371712679996318, "learning_rate": 2.751207141805707e-05, "loss": 0.1344, "step": 902 }, { "epoch": 0.4, "grad_norm": 0.7627986979534956, "learning_rate": 2.7485678980950757e-05, "loss": 0.1372, "step": 903 }, { "epoch": 0.4, "grad_norm": 0.6740036887738626, "learning_rate": 2.7459271375918332e-05, "loss": 0.1188, "step": 904 }, { "epoch": 0.4, "grad_norm": 0.702098075323288, "learning_rate": 2.7432848656468448e-05, "loss": 0.1136, "step": 905 }, { "epoch": 0.4, "grad_norm": 0.6554515613580089, "learning_rate": 2.740641087614038e-05, "loss": 0.1204, "step": 906 }, { "epoch": 0.4, "grad_norm": 0.6738753371353735, "learning_rate": 2.7379958088503925e-05, "loss": 0.1136, "step": 907 }, { "epoch": 0.4, "grad_norm": 0.6240101844204077, "learning_rate": 2.7353490347159297e-05, "loss": 0.1135, "step": 908 }, { "epoch": 0.4, "grad_norm": 0.6315753523223393, "learning_rate": 2.7327007705736993e-05, "loss": 0.1061, "step": 909 }, { "epoch": 0.4, "grad_norm": 0.634683691752441, "learning_rate": 2.7300510217897716e-05, "loss": 0.1126, "step": 910 }, { "epoch": 0.4, "grad_norm": 0.7486097643275307, "learning_rate": 2.7273997937332225e-05, "loss": 0.1375, "step": 911 }, { "epoch": 0.4, "grad_norm": 0.7236784021016701, "learning_rate": 2.7247470917761304e-05, "loss": 0.1287, "step": 912 }, { "epoch": 0.4, "grad_norm": 0.683391410309628, "learning_rate": 2.7220929212935553e-05, "loss": 0.0972, "step": 913 }, { "epoch": 0.4, "grad_norm": 0.6730191004198, "learning_rate": 2.7194372876635348e-05, "loss": 0.1179, "step": 914 }, { "epoch": 0.4, "grad_norm": 0.7176709160281026, "learning_rate": 2.7167801962670712e-05, "loss": 0.1144, "step": 915 }, { "epoch": 0.4, "grad_norm": 0.6927474639471709, "learning_rate": 2.7141216524881202e-05, "loss": 0.121, "step": 916 }, { "epoch": 0.4, "grad_norm": 0.623602926922255, "learning_rate": 2.7114616617135815e-05, "loss": 0.0931, "step": 917 }, { "epoch": 0.4, "grad_norm": 0.6724854291166966, "learning_rate": 2.708800229333285e-05, "loss": 0.1115, "step": 918 }, { "epoch": 0.4, "grad_norm": 0.6659962715625931, "learning_rate": 2.7061373607399826e-05, "loss": 0.1163, "step": 919 }, { "epoch": 0.4, "grad_norm": 0.7253966463216031, "learning_rate": 2.7034730613293375e-05, "loss": 0.1196, "step": 920 }, { "epoch": 0.4, "grad_norm": 0.6542830736213483, "learning_rate": 2.7008073364999097e-05, "loss": 0.1056, "step": 921 }, { "epoch": 0.41, "grad_norm": 0.6407240814253602, "learning_rate": 2.69814019165315e-05, "loss": 0.1086, "step": 922 }, { "epoch": 0.41, "grad_norm": 0.664862072845685, "learning_rate": 2.695471632193383e-05, "loss": 0.1018, "step": 923 }, { "epoch": 0.41, "grad_norm": 0.8081797401433884, "learning_rate": 2.692801663527805e-05, "loss": 0.1295, "step": 924 }, { "epoch": 0.41, "grad_norm": 0.6726220878760635, "learning_rate": 2.6901302910664622e-05, "loss": 0.1024, "step": 925 }, { "epoch": 0.41, "grad_norm": 0.6570735486869748, "learning_rate": 2.687457520222249e-05, "loss": 0.1016, "step": 926 }, { "epoch": 0.41, "grad_norm": 0.7433040051027372, "learning_rate": 2.6847833564108917e-05, "loss": 0.1153, "step": 927 }, { "epoch": 0.41, "grad_norm": 0.7370587677758836, "learning_rate": 2.6821078050509387e-05, "loss": 0.1229, "step": 928 }, { "epoch": 0.41, "grad_norm": 0.7605901720509913, "learning_rate": 2.6794308715637517e-05, "loss": 0.1364, "step": 929 }, { "epoch": 0.41, "grad_norm": 0.695874640576897, "learning_rate": 2.676752561373491e-05, "loss": 0.1141, "step": 930 }, { "epoch": 0.41, "grad_norm": 0.8150185650776328, "learning_rate": 2.6740728799071075e-05, "loss": 0.1321, "step": 931 }, { "epoch": 0.41, "grad_norm": 0.7192294139017621, "learning_rate": 2.6713918325943317e-05, "loss": 0.1333, "step": 932 }, { "epoch": 0.41, "grad_norm": 0.5962368434678064, "learning_rate": 2.6687094248676585e-05, "loss": 0.0956, "step": 933 }, { "epoch": 0.41, "grad_norm": 0.6374914711026393, "learning_rate": 2.666025662162343e-05, "loss": 0.113, "step": 934 }, { "epoch": 0.41, "grad_norm": 0.5996217508933775, "learning_rate": 2.6633405499163833e-05, "loss": 0.1064, "step": 935 }, { "epoch": 0.41, "grad_norm": 0.6901131083550958, "learning_rate": 2.6606540935705126e-05, "loss": 0.1192, "step": 936 }, { "epoch": 0.41, "grad_norm": 0.6456953229392426, "learning_rate": 2.657966298568189e-05, "loss": 0.12, "step": 937 }, { "epoch": 0.41, "grad_norm": 0.6364974591038909, "learning_rate": 2.6552771703555803e-05, "loss": 0.0982, "step": 938 }, { "epoch": 0.41, "grad_norm": 0.7295961114478638, "learning_rate": 2.652586714381559e-05, "loss": 0.1343, "step": 939 }, { "epoch": 0.41, "grad_norm": 0.6651386292883386, "learning_rate": 2.6498949360976852e-05, "loss": 0.1112, "step": 940 }, { "epoch": 0.41, "grad_norm": 0.7379593727797032, "learning_rate": 2.6472018409581998e-05, "loss": 0.1284, "step": 941 }, { "epoch": 0.41, "grad_norm": 0.6218432097240708, "learning_rate": 2.644507434420012e-05, "loss": 0.1082, "step": 942 }, { "epoch": 0.41, "grad_norm": 0.6172894019494004, "learning_rate": 2.6418117219426877e-05, "loss": 0.0947, "step": 943 }, { "epoch": 0.41, "grad_norm": 0.72954574449642, "learning_rate": 2.6391147089884396e-05, "loss": 0.1248, "step": 944 }, { "epoch": 0.42, "grad_norm": 0.618458609892955, "learning_rate": 2.6364164010221135e-05, "loss": 0.0996, "step": 945 }, { "epoch": 0.42, "grad_norm": 0.7006104115405929, "learning_rate": 2.6337168035111828e-05, "loss": 0.114, "step": 946 }, { "epoch": 0.42, "grad_norm": 0.6945359254557372, "learning_rate": 2.6310159219257306e-05, "loss": 0.1048, "step": 947 }, { "epoch": 0.42, "grad_norm": 0.7773464337019864, "learning_rate": 2.628313761738444e-05, "loss": 0.1291, "step": 948 }, { "epoch": 0.42, "grad_norm": 0.7253606967781274, "learning_rate": 2.6256103284245998e-05, "loss": 0.1137, "step": 949 }, { "epoch": 0.42, "grad_norm": 0.7063158325300324, "learning_rate": 2.622905627462054e-05, "loss": 0.1168, "step": 950 }, { "epoch": 0.42, "grad_norm": 0.703613620098808, "learning_rate": 2.6201996643312325e-05, "loss": 0.1054, "step": 951 }, { "epoch": 0.42, "grad_norm": 0.6220124845110456, "learning_rate": 2.6174924445151177e-05, "loss": 0.0898, "step": 952 }, { "epoch": 0.42, "grad_norm": 0.5963310809157757, "learning_rate": 2.614783973499239e-05, "loss": 0.105, "step": 953 }, { "epoch": 0.42, "grad_norm": 0.6278995488885266, "learning_rate": 2.6120742567716613e-05, "loss": 0.0979, "step": 954 }, { "epoch": 0.42, "grad_norm": 0.6956828770376385, "learning_rate": 2.6093632998229715e-05, "loss": 0.1177, "step": 955 }, { "epoch": 0.42, "grad_norm": 0.7507328188515211, "learning_rate": 2.606651108146273e-05, "loss": 0.1233, "step": 956 }, { "epoch": 0.42, "grad_norm": 0.7343453746258051, "learning_rate": 2.6039376872371685e-05, "loss": 0.1291, "step": 957 }, { "epoch": 0.42, "grad_norm": 0.6562470481442866, "learning_rate": 2.6012230425937518e-05, "loss": 0.0982, "step": 958 }, { "epoch": 0.42, "grad_norm": 0.6908890034026155, "learning_rate": 2.598507179716597e-05, "loss": 0.1237, "step": 959 }, { "epoch": 0.42, "grad_norm": 0.7015995856935241, "learning_rate": 2.5957901041087465e-05, "loss": 0.1234, "step": 960 }, { "epoch": 0.42, "grad_norm": 0.6693674801957058, "learning_rate": 2.5930718212756992e-05, "loss": 0.0939, "step": 961 }, { "epoch": 0.42, "grad_norm": 0.6930692683689317, "learning_rate": 2.5903523367254018e-05, "loss": 0.1144, "step": 962 }, { "epoch": 0.42, "grad_norm": 0.6291866743514901, "learning_rate": 2.5876316559682335e-05, "loss": 0.0876, "step": 963 }, { "epoch": 0.42, "grad_norm": 0.6276015259197811, "learning_rate": 2.584909784517e-05, "loss": 0.105, "step": 964 }, { "epoch": 0.42, "grad_norm": 0.6202195691274213, "learning_rate": 2.5821867278869183e-05, "loss": 0.0937, "step": 965 }, { "epoch": 0.42, "grad_norm": 0.6332828986849761, "learning_rate": 2.5794624915956067e-05, "loss": 0.1042, "step": 966 }, { "epoch": 0.42, "grad_norm": 0.8202691566278918, "learning_rate": 2.5767370811630736e-05, "loss": 0.1292, "step": 967 }, { "epoch": 0.43, "grad_norm": 0.7050649583596309, "learning_rate": 2.5740105021117082e-05, "loss": 0.1362, "step": 968 }, { "epoch": 0.43, "grad_norm": 0.6686731160612875, "learning_rate": 2.5712827599662656e-05, "loss": 0.108, "step": 969 }, { "epoch": 0.43, "grad_norm": 0.6393001627450011, "learning_rate": 2.5685538602538592e-05, "loss": 0.1087, "step": 970 }, { "epoch": 0.43, "grad_norm": 0.6352830391961439, "learning_rate": 2.5658238085039467e-05, "loss": 0.1055, "step": 971 }, { "epoch": 0.43, "grad_norm": 0.6605342708131813, "learning_rate": 2.5630926102483207e-05, "loss": 0.1127, "step": 972 }, { "epoch": 0.43, "grad_norm": 0.6681353935596595, "learning_rate": 2.5603602710210972e-05, "loss": 0.0975, "step": 973 }, { "epoch": 0.43, "grad_norm": 0.7090121177100518, "learning_rate": 2.5576267963587036e-05, "loss": 0.1167, "step": 974 }, { "epoch": 0.43, "grad_norm": 0.7110183585998477, "learning_rate": 2.554892191799867e-05, "loss": 0.1058, "step": 975 }, { "epoch": 0.43, "grad_norm": 0.6657600919436336, "learning_rate": 2.5521564628856076e-05, "loss": 0.1126, "step": 976 }, { "epoch": 0.43, "grad_norm": 0.7157324181441531, "learning_rate": 2.5494196151592192e-05, "loss": 0.1176, "step": 977 }, { "epoch": 0.43, "grad_norm": 0.6129618013188902, "learning_rate": 2.5466816541662657e-05, "loss": 0.0937, "step": 978 }, { "epoch": 0.43, "grad_norm": 0.6004515458758535, "learning_rate": 2.5439425854545653e-05, "loss": 0.0816, "step": 979 }, { "epoch": 0.43, "grad_norm": 0.742248010678322, "learning_rate": 2.5412024145741826e-05, "loss": 0.1092, "step": 980 }, { "epoch": 0.43, "grad_norm": 0.6564620220460424, "learning_rate": 2.5384611470774127e-05, "loss": 0.1147, "step": 981 }, { "epoch": 0.43, "grad_norm": 0.8407546670167664, "learning_rate": 2.5357187885187742e-05, "loss": 0.1181, "step": 982 }, { "epoch": 0.43, "grad_norm": 0.8262847029746303, "learning_rate": 2.5329753444549966e-05, "loss": 0.1388, "step": 983 }, { "epoch": 0.43, "grad_norm": 0.659975918850703, "learning_rate": 2.53023082044501e-05, "loss": 0.1103, "step": 984 }, { "epoch": 0.43, "grad_norm": 0.6388937655092335, "learning_rate": 2.5274852220499298e-05, "loss": 0.1036, "step": 985 }, { "epoch": 0.43, "grad_norm": 0.583635308804365, "learning_rate": 2.524738554833051e-05, "loss": 0.0966, "step": 986 }, { "epoch": 0.43, "grad_norm": 0.6160485103443342, "learning_rate": 2.5219908243598328e-05, "loss": 0.0974, "step": 987 }, { "epoch": 0.43, "grad_norm": 0.6363189940151639, "learning_rate": 2.5192420361978904e-05, "loss": 0.111, "step": 988 }, { "epoch": 0.43, "grad_norm": 0.6691713271527053, "learning_rate": 2.5164921959169797e-05, "loss": 0.1055, "step": 989 }, { "epoch": 0.43, "grad_norm": 0.6660940398202866, "learning_rate": 2.5137413090889915e-05, "loss": 0.1161, "step": 990 }, { "epoch": 0.44, "grad_norm": 0.6894112809318709, "learning_rate": 2.510989381287935e-05, "loss": 0.1197, "step": 991 }, { "epoch": 0.44, "grad_norm": 0.6706529637432145, "learning_rate": 2.508236418089929e-05, "loss": 0.1115, "step": 992 }, { "epoch": 0.44, "grad_norm": 0.5200542886069934, "learning_rate": 2.505482425073191e-05, "loss": 0.0846, "step": 993 }, { "epoch": 0.44, "grad_norm": 0.7189094946990552, "learning_rate": 2.5027274078180242e-05, "loss": 0.1186, "step": 994 }, { "epoch": 0.44, "grad_norm": 0.6718642211759156, "learning_rate": 2.4999713719068085e-05, "loss": 0.0944, "step": 995 }, { "epoch": 0.44, "grad_norm": 0.6252430204942255, "learning_rate": 2.497214322923987e-05, "loss": 0.1022, "step": 996 }, { "epoch": 0.44, "grad_norm": 0.6478646517130842, "learning_rate": 2.4944562664560544e-05, "loss": 0.0965, "step": 997 }, { "epoch": 0.44, "grad_norm": 0.7486615556373746, "learning_rate": 2.49169720809155e-05, "loss": 0.1167, "step": 998 }, { "epoch": 0.44, "grad_norm": 0.6273070670266299, "learning_rate": 2.4889371534210395e-05, "loss": 0.0907, "step": 999 }, { "epoch": 0.44, "grad_norm": 0.7614040800737304, "learning_rate": 2.486176108037111e-05, "loss": 0.1244, "step": 1000 }, { "epoch": 0.44, "grad_norm": 0.6562587646625694, "learning_rate": 2.483414077534356e-05, "loss": 0.1062, "step": 1001 }, { "epoch": 0.44, "grad_norm": 0.7399617198760126, "learning_rate": 2.4806510675093663e-05, "loss": 0.11, "step": 1002 }, { "epoch": 0.44, "grad_norm": 0.6837462118974879, "learning_rate": 2.4778870835607153e-05, "loss": 0.1116, "step": 1003 }, { "epoch": 0.44, "grad_norm": 0.7652215867972375, "learning_rate": 2.4751221312889514e-05, "loss": 0.1221, "step": 1004 }, { "epoch": 0.44, "grad_norm": 0.6831092207221389, "learning_rate": 2.4723562162965843e-05, "loss": 0.1058, "step": 1005 }, { "epoch": 0.44, "grad_norm": 0.613947148445007, "learning_rate": 2.469589344188075e-05, "loss": 0.097, "step": 1006 }, { "epoch": 0.44, "grad_norm": 0.7162370704562377, "learning_rate": 2.466821520569824e-05, "loss": 0.1349, "step": 1007 }, { "epoch": 0.44, "grad_norm": 0.6241183499792388, "learning_rate": 2.464052751050159e-05, "loss": 0.0924, "step": 1008 }, { "epoch": 0.44, "grad_norm": 0.6149447531425044, "learning_rate": 2.4612830412393245e-05, "loss": 0.0858, "step": 1009 }, { "epoch": 0.44, "grad_norm": 0.6209210329603241, "learning_rate": 2.4585123967494716e-05, "loss": 0.1032, "step": 1010 }, { "epoch": 0.44, "grad_norm": 0.6867868235658252, "learning_rate": 2.455740823194643e-05, "loss": 0.1055, "step": 1011 }, { "epoch": 0.44, "grad_norm": 0.7240496013020941, "learning_rate": 2.4529683261907663e-05, "loss": 0.1079, "step": 1012 }, { "epoch": 0.44, "grad_norm": 0.644029780578936, "learning_rate": 2.4501949113556382e-05, "loss": 0.0914, "step": 1013 }, { "epoch": 0.45, "grad_norm": 0.7155061997998737, "learning_rate": 2.4474205843089166e-05, "loss": 0.0911, "step": 1014 }, { "epoch": 0.45, "grad_norm": 0.6864614879282901, "learning_rate": 2.4446453506721074e-05, "loss": 0.1105, "step": 1015 }, { "epoch": 0.45, "grad_norm": 0.6420881496160517, "learning_rate": 2.4418692160685537e-05, "loss": 0.0947, "step": 1016 }, { "epoch": 0.45, "grad_norm": 0.6459624381896327, "learning_rate": 2.439092186123423e-05, "loss": 0.0942, "step": 1017 }, { "epoch": 0.45, "grad_norm": 0.6501773564754684, "learning_rate": 2.4363142664636978e-05, "loss": 0.1045, "step": 1018 }, { "epoch": 0.45, "grad_norm": 0.6351341938163457, "learning_rate": 2.433535462718164e-05, "loss": 0.0995, "step": 1019 }, { "epoch": 0.45, "grad_norm": 0.5900694813064317, "learning_rate": 2.430755780517399e-05, "loss": 0.0785, "step": 1020 }, { "epoch": 0.45, "grad_norm": 0.6524426276828322, "learning_rate": 2.427975225493757e-05, "loss": 0.0995, "step": 1021 }, { "epoch": 0.45, "grad_norm": 0.6765884033248402, "learning_rate": 2.425193803281367e-05, "loss": 0.1106, "step": 1022 }, { "epoch": 0.45, "grad_norm": 0.864876735647472, "learning_rate": 2.422411519516108e-05, "loss": 0.1245, "step": 1023 }, { "epoch": 0.45, "grad_norm": 0.6875210672098425, "learning_rate": 2.4196283798356103e-05, "loss": 0.1042, "step": 1024 }, { "epoch": 0.45, "grad_norm": 0.6540258728867836, "learning_rate": 2.416844389879235e-05, "loss": 0.1078, "step": 1025 }, { "epoch": 0.45, "grad_norm": 0.6161172496349205, "learning_rate": 2.4140595552880685e-05, "loss": 0.0973, "step": 1026 }, { "epoch": 0.45, "grad_norm": 0.7131860826227667, "learning_rate": 2.411273881704907e-05, "loss": 0.1169, "step": 1027 }, { "epoch": 0.45, "grad_norm": 0.6523937046317387, "learning_rate": 2.4084873747742475e-05, "loss": 0.1014, "step": 1028 }, { "epoch": 0.45, "grad_norm": 0.7149115358325583, "learning_rate": 2.4057000401422754e-05, "loss": 0.1065, "step": 1029 }, { "epoch": 0.45, "grad_norm": 0.6281547321096093, "learning_rate": 2.402911883456853e-05, "loss": 0.0908, "step": 1030 }, { "epoch": 0.45, "grad_norm": 0.7175076570002608, "learning_rate": 2.4001229103675084e-05, "loss": 0.1117, "step": 1031 }, { "epoch": 0.45, "grad_norm": 0.5628928031588045, "learning_rate": 2.397333126525425e-05, "loss": 0.0933, "step": 1032 }, { "epoch": 0.45, "grad_norm": 0.6976503651745609, "learning_rate": 2.394542537583427e-05, "loss": 0.1023, "step": 1033 }, { "epoch": 0.45, "grad_norm": 0.764089601812265, "learning_rate": 2.3917511491959717e-05, "loss": 0.1211, "step": 1034 }, { "epoch": 0.45, "grad_norm": 0.6968594177283031, "learning_rate": 2.3889589670191346e-05, "loss": 0.1055, "step": 1035 }, { "epoch": 0.46, "grad_norm": 0.5766862286803488, "learning_rate": 2.3861659967106014e-05, "loss": 0.0938, "step": 1036 }, { "epoch": 0.46, "grad_norm": 0.6344845428222327, "learning_rate": 2.383372243929654e-05, "loss": 0.0883, "step": 1037 }, { "epoch": 0.46, "grad_norm": 0.7109997481038524, "learning_rate": 2.38057771433716e-05, "loss": 0.098, "step": 1038 }, { "epoch": 0.46, "grad_norm": 0.5994483657025542, "learning_rate": 2.3777824135955595e-05, "loss": 0.089, "step": 1039 }, { "epoch": 0.46, "grad_norm": 0.7745897591666174, "learning_rate": 2.3749863473688583e-05, "loss": 0.1184, "step": 1040 }, { "epoch": 0.46, "grad_norm": 0.7631887092833795, "learning_rate": 2.3721895213226098e-05, "loss": 0.1234, "step": 1041 }, { "epoch": 0.46, "grad_norm": 0.7958532027981371, "learning_rate": 2.369391941123909e-05, "loss": 0.1281, "step": 1042 }, { "epoch": 0.46, "grad_norm": 0.5828455918149873, "learning_rate": 2.3665936124413787e-05, "loss": 0.0847, "step": 1043 }, { "epoch": 0.46, "grad_norm": 0.6349606023791675, "learning_rate": 2.3637945409451586e-05, "loss": 0.0933, "step": 1044 }, { "epoch": 0.46, "grad_norm": 0.6945028842072437, "learning_rate": 2.360994732306892e-05, "loss": 0.1052, "step": 1045 }, { "epoch": 0.46, "grad_norm": 0.6148901566826032, "learning_rate": 2.3581941921997187e-05, "loss": 0.0937, "step": 1046 }, { "epoch": 0.46, "grad_norm": 0.666989551276475, "learning_rate": 2.3553929262982576e-05, "loss": 0.112, "step": 1047 }, { "epoch": 0.46, "grad_norm": 0.8277071852491629, "learning_rate": 2.3525909402786002e-05, "loss": 0.1291, "step": 1048 }, { "epoch": 0.46, "grad_norm": 0.5979370006289251, "learning_rate": 2.3497882398182963e-05, "loss": 0.0935, "step": 1049 }, { "epoch": 0.46, "grad_norm": 0.6491674887276938, "learning_rate": 2.346984830596344e-05, "loss": 0.101, "step": 1050 }, { "epoch": 0.46, "grad_norm": 0.5467396704941234, "learning_rate": 2.344180718293176e-05, "loss": 0.0726, "step": 1051 }, { "epoch": 0.46, "grad_norm": 0.748156626645532, "learning_rate": 2.341375908590652e-05, "loss": 0.108, "step": 1052 }, { "epoch": 0.46, "grad_norm": 0.620437779495393, "learning_rate": 2.3385704071720424e-05, "loss": 0.1046, "step": 1053 }, { "epoch": 0.46, "grad_norm": 0.6887067910364316, "learning_rate": 2.3357642197220217e-05, "loss": 0.0962, "step": 1054 }, { "epoch": 0.46, "grad_norm": 0.700877369526463, "learning_rate": 2.3329573519266515e-05, "loss": 0.0941, "step": 1055 }, { "epoch": 0.46, "grad_norm": 0.6852807291844675, "learning_rate": 2.3301498094733756e-05, "loss": 0.1112, "step": 1056 }, { "epoch": 0.46, "grad_norm": 0.5923875687153869, "learning_rate": 2.3273415980510006e-05, "loss": 0.1019, "step": 1057 }, { "epoch": 0.46, "grad_norm": 0.677128039620295, "learning_rate": 2.324532723349692e-05, "loss": 0.1106, "step": 1058 }, { "epoch": 0.47, "grad_norm": 0.6270958568636267, "learning_rate": 2.321723191060958e-05, "loss": 0.0905, "step": 1059 }, { "epoch": 0.47, "grad_norm": 0.5702564002885752, "learning_rate": 2.3189130068776392e-05, "loss": 0.0877, "step": 1060 }, { "epoch": 0.47, "grad_norm": 0.727838564676688, "learning_rate": 2.3161021764938976e-05, "loss": 0.1235, "step": 1061 }, { "epoch": 0.47, "grad_norm": 0.7241224519147117, "learning_rate": 2.3132907056052035e-05, "loss": 0.1061, "step": 1062 }, { "epoch": 0.47, "grad_norm": 0.6530099311105559, "learning_rate": 2.3104785999083264e-05, "loss": 0.1045, "step": 1063 }, { "epoch": 0.47, "grad_norm": 0.6392078309022874, "learning_rate": 2.307665865101321e-05, "loss": 0.1105, "step": 1064 }, { "epoch": 0.47, "grad_norm": 0.6171790013754271, "learning_rate": 2.304852506883517e-05, "loss": 0.0961, "step": 1065 }, { "epoch": 0.47, "grad_norm": 0.6007970423414826, "learning_rate": 2.3020385309555085e-05, "loss": 0.0914, "step": 1066 }, { "epoch": 0.47, "grad_norm": 0.6110171663679205, "learning_rate": 2.2992239430191385e-05, "loss": 0.0975, "step": 1067 }, { "epoch": 0.47, "grad_norm": 0.662854273557472, "learning_rate": 2.296408748777494e-05, "loss": 0.1038, "step": 1068 }, { "epoch": 0.47, "grad_norm": 0.6131384616505577, "learning_rate": 2.2935929539348854e-05, "loss": 0.094, "step": 1069 }, { "epoch": 0.47, "grad_norm": 0.6859692307695114, "learning_rate": 2.2907765641968455e-05, "loss": 0.0997, "step": 1070 }, { "epoch": 0.47, "grad_norm": 0.6252707371832457, "learning_rate": 2.2879595852701092e-05, "loss": 0.0737, "step": 1071 }, { "epoch": 0.47, "grad_norm": 0.7013198781343973, "learning_rate": 2.285142022862606e-05, "loss": 0.1096, "step": 1072 }, { "epoch": 0.47, "grad_norm": 0.7088225629284103, "learning_rate": 2.2823238826834473e-05, "loss": 0.1075, "step": 1073 }, { "epoch": 0.47, "grad_norm": 0.6222788702159183, "learning_rate": 2.2795051704429164e-05, "loss": 0.0963, "step": 1074 }, { "epoch": 0.47, "grad_norm": 0.584504473209804, "learning_rate": 2.276685891852454e-05, "loss": 0.0931, "step": 1075 }, { "epoch": 0.47, "grad_norm": 0.5947565782110248, "learning_rate": 2.2738660526246507e-05, "loss": 0.0957, "step": 1076 }, { "epoch": 0.47, "grad_norm": 0.5502282200168032, "learning_rate": 2.27104565847323e-05, "loss": 0.0739, "step": 1077 }, { "epoch": 0.47, "grad_norm": 0.6590452608157352, "learning_rate": 2.2682247151130437e-05, "loss": 0.11, "step": 1078 }, { "epoch": 0.47, "grad_norm": 0.7233433524183542, "learning_rate": 2.2654032282600523e-05, "loss": 0.124, "step": 1079 }, { "epoch": 0.47, "grad_norm": 0.552356685287325, "learning_rate": 2.2625812036313213e-05, "loss": 0.081, "step": 1080 }, { "epoch": 0.47, "grad_norm": 0.6083718136370274, "learning_rate": 2.2597586469450034e-05, "loss": 0.0984, "step": 1081 }, { "epoch": 0.48, "grad_norm": 0.6093746821084782, "learning_rate": 2.256935563920329e-05, "loss": 0.0889, "step": 1082 }, { "epoch": 0.48, "grad_norm": 0.6621206502038786, "learning_rate": 2.254111960277598e-05, "loss": 0.1096, "step": 1083 }, { "epoch": 0.48, "grad_norm": 0.6685153219639826, "learning_rate": 2.2512878417381626e-05, "loss": 0.0992, "step": 1084 }, { "epoch": 0.48, "grad_norm": 0.5794745750823632, "learning_rate": 2.248463214024419e-05, "loss": 0.0891, "step": 1085 }, { "epoch": 0.48, "grad_norm": 0.6273161885631118, "learning_rate": 2.245638082859795e-05, "loss": 0.0915, "step": 1086 }, { "epoch": 0.48, "grad_norm": 0.6552817604229957, "learning_rate": 2.2428124539687383e-05, "loss": 0.1044, "step": 1087 }, { "epoch": 0.48, "grad_norm": 0.6156417224762234, "learning_rate": 2.239986333076707e-05, "loss": 0.1027, "step": 1088 }, { "epoch": 0.48, "grad_norm": 0.6234936442705126, "learning_rate": 2.237159725910153e-05, "loss": 0.1065, "step": 1089 }, { "epoch": 0.48, "grad_norm": 0.6131464573160951, "learning_rate": 2.2343326381965165e-05, "loss": 0.0917, "step": 1090 }, { "epoch": 0.48, "grad_norm": 0.5820911299862138, "learning_rate": 2.2315050756642086e-05, "loss": 0.0782, "step": 1091 }, { "epoch": 0.48, "grad_norm": 0.6455375676071013, "learning_rate": 2.2286770440426056e-05, "loss": 0.0803, "step": 1092 }, { "epoch": 0.48, "grad_norm": 0.6011455626799175, "learning_rate": 2.2258485490620317e-05, "loss": 0.0919, "step": 1093 }, { "epoch": 0.48, "grad_norm": 0.6326857781391027, "learning_rate": 2.2230195964537507e-05, "loss": 0.099, "step": 1094 }, { "epoch": 0.48, "grad_norm": 0.6852809248859164, "learning_rate": 2.2201901919499548e-05, "loss": 0.105, "step": 1095 }, { "epoch": 0.48, "grad_norm": 0.6019341942980888, "learning_rate": 2.2173603412837505e-05, "loss": 0.096, "step": 1096 }, { "epoch": 0.48, "grad_norm": 0.6879952770822343, "learning_rate": 2.214530050189149e-05, "loss": 0.0933, "step": 1097 }, { "epoch": 0.48, "grad_norm": 0.6504144062815099, "learning_rate": 2.211699324401053e-05, "loss": 0.0991, "step": 1098 }, { "epoch": 0.48, "grad_norm": 0.5946850942301436, "learning_rate": 2.208868169655247e-05, "loss": 0.0884, "step": 1099 }, { "epoch": 0.48, "grad_norm": 0.5629594304152179, "learning_rate": 2.2060365916883866e-05, "loss": 0.0845, "step": 1100 }, { "epoch": 0.48, "grad_norm": 0.5826506423168853, "learning_rate": 2.203204596237979e-05, "loss": 0.0712, "step": 1101 }, { "epoch": 0.48, "grad_norm": 0.6300462614210238, "learning_rate": 2.2003721890423845e-05, "loss": 0.105, "step": 1102 }, { "epoch": 0.48, "grad_norm": 0.5635002303152621, "learning_rate": 2.1975393758407924e-05, "loss": 0.0774, "step": 1103 }, { "epoch": 0.48, "grad_norm": 0.639393529987026, "learning_rate": 2.1947061623732167e-05, "loss": 0.1042, "step": 1104 }, { "epoch": 0.49, "grad_norm": 0.6535678730911875, "learning_rate": 2.1918725543804835e-05, "loss": 0.1033, "step": 1105 }, { "epoch": 0.49, "grad_norm": 0.6439007999121942, "learning_rate": 2.189038557604216e-05, "loss": 0.0862, "step": 1106 }, { "epoch": 0.49, "grad_norm": 0.6606533278325808, "learning_rate": 2.186204177786827e-05, "loss": 0.099, "step": 1107 }, { "epoch": 0.49, "grad_norm": 0.5852646843095839, "learning_rate": 2.183369420671504e-05, "loss": 0.0865, "step": 1108 }, { "epoch": 0.49, "grad_norm": 0.59001776042166, "learning_rate": 2.1805342920022e-05, "loss": 0.0873, "step": 1109 }, { "epoch": 0.49, "grad_norm": 0.6512396142757491, "learning_rate": 2.1776987975236218e-05, "loss": 0.0998, "step": 1110 }, { "epoch": 0.49, "grad_norm": 0.5940633499083241, "learning_rate": 2.1748629429812144e-05, "loss": 0.0943, "step": 1111 }, { "epoch": 0.49, "grad_norm": 0.5778586573137178, "learning_rate": 2.1720267341211562e-05, "loss": 0.0798, "step": 1112 }, { "epoch": 0.49, "grad_norm": 0.5872833308963166, "learning_rate": 2.1691901766903394e-05, "loss": 0.0865, "step": 1113 }, { "epoch": 0.49, "grad_norm": 0.7051752081735182, "learning_rate": 2.1663532764363664e-05, "loss": 0.096, "step": 1114 }, { "epoch": 0.49, "grad_norm": 0.6056990400865672, "learning_rate": 2.1635160391075315e-05, "loss": 0.0727, "step": 1115 }, { "epoch": 0.49, "grad_norm": 0.6217695194672963, "learning_rate": 2.1606784704528135e-05, "loss": 0.1054, "step": 1116 }, { "epoch": 0.49, "grad_norm": 0.6388726575028912, "learning_rate": 2.1578405762218627e-05, "loss": 0.0911, "step": 1117 }, { "epoch": 0.49, "grad_norm": 0.5941982083377386, "learning_rate": 2.1550023621649872e-05, "loss": 0.0849, "step": 1118 }, { "epoch": 0.49, "grad_norm": 0.5990173697747333, "learning_rate": 2.1521638340331453e-05, "loss": 0.0973, "step": 1119 }, { "epoch": 0.49, "grad_norm": 0.6431083462767465, "learning_rate": 2.1493249975779303e-05, "loss": 0.0962, "step": 1120 }, { "epoch": 0.49, "grad_norm": 0.5703280459858788, "learning_rate": 2.1464858585515604e-05, "loss": 0.0862, "step": 1121 }, { "epoch": 0.49, "grad_norm": 0.607990918290863, "learning_rate": 2.1436464227068688e-05, "loss": 0.0879, "step": 1122 }, { "epoch": 0.49, "grad_norm": 0.5932372288299818, "learning_rate": 2.1408066957972867e-05, "loss": 0.0923, "step": 1123 }, { "epoch": 0.49, "grad_norm": 0.6370735350163664, "learning_rate": 2.137966683576839e-05, "loss": 0.0945, "step": 1124 }, { "epoch": 0.49, "grad_norm": 0.6062059897739475, "learning_rate": 2.1351263918001246e-05, "loss": 0.0889, "step": 1125 }, { "epoch": 0.49, "grad_norm": 0.6278671542009154, "learning_rate": 2.1322858262223122e-05, "loss": 0.0858, "step": 1126 }, { "epoch": 0.5, "grad_norm": 0.6321854015547451, "learning_rate": 2.1294449925991236e-05, "loss": 0.0908, "step": 1127 }, { "epoch": 0.5, "grad_norm": 0.6453481179445552, "learning_rate": 2.1266038966868245e-05, "loss": 0.0877, "step": 1128 }, { "epoch": 0.5, "grad_norm": 0.552968302584012, "learning_rate": 2.1237625442422113e-05, "loss": 0.0816, "step": 1129 }, { "epoch": 0.5, "grad_norm": 0.7355594417169079, "learning_rate": 2.1209209410226003e-05, "loss": 0.0901, "step": 1130 }, { "epoch": 0.5, "grad_norm": 0.5650409950305053, "learning_rate": 2.118079092785817e-05, "loss": 0.0902, "step": 1131 }, { "epoch": 0.5, "grad_norm": 0.5654701653257537, "learning_rate": 2.1152370052901828e-05, "loss": 0.0789, "step": 1132 }, { "epoch": 0.5, "grad_norm": 0.7412544408604078, "learning_rate": 2.1123946842945023e-05, "loss": 0.1081, "step": 1133 }, { "epoch": 0.5, "grad_norm": 0.715037710422466, "learning_rate": 2.1095521355580564e-05, "loss": 0.0992, "step": 1134 }, { "epoch": 0.5, "grad_norm": 0.785373107417302, "learning_rate": 2.1067093648405847e-05, "loss": 0.1082, "step": 1135 }, { "epoch": 0.5, "grad_norm": 0.5764600130292682, "learning_rate": 2.1038663779022772e-05, "loss": 0.0885, "step": 1136 }, { "epoch": 0.5, "grad_norm": 0.5913593953971951, "learning_rate": 2.1010231805037632e-05, "loss": 0.0844, "step": 1137 }, { "epoch": 0.5, "grad_norm": 0.6069089189394817, "learning_rate": 2.098179778406098e-05, "loss": 0.0907, "step": 1138 }, { "epoch": 0.5, "grad_norm": 0.6107322299244344, "learning_rate": 2.0953361773707502e-05, "loss": 0.0992, "step": 1139 }, { "epoch": 0.5, "grad_norm": 0.6525102505664327, "learning_rate": 2.092492383159594e-05, "loss": 0.0985, "step": 1140 }, { "epoch": 0.5, "grad_norm": 0.6109553410103149, "learning_rate": 2.089648401534892e-05, "loss": 0.0854, "step": 1141 }, { "epoch": 0.5, "grad_norm": 0.5901571792623502, "learning_rate": 2.0868042382592896e-05, "loss": 0.0881, "step": 1142 }, { "epoch": 0.5, "grad_norm": 0.569424604313208, "learning_rate": 2.083959899095798e-05, "loss": 0.081, "step": 1143 }, { "epoch": 0.5, "grad_norm": 0.6567535512063647, "learning_rate": 2.0811153898077873e-05, "loss": 0.0956, "step": 1144 }, { "epoch": 0.5, "grad_norm": 0.6363656873019721, "learning_rate": 2.0782707161589677e-05, "loss": 0.0885, "step": 1145 }, { "epoch": 0.5, "grad_norm": 0.6492702238791086, "learning_rate": 2.0754258839133888e-05, "loss": 0.099, "step": 1146 }, { "epoch": 0.5, "grad_norm": 0.649168751210925, "learning_rate": 2.0725808988354156e-05, "loss": 0.0892, "step": 1147 }, { "epoch": 0.5, "grad_norm": 0.6333003577214033, "learning_rate": 2.0697357666897268e-05, "loss": 0.0783, "step": 1148 }, { "epoch": 0.5, "grad_norm": 0.6286365570504873, "learning_rate": 2.0668904932412966e-05, "loss": 0.0841, "step": 1149 }, { "epoch": 0.51, "grad_norm": 0.6092320788114873, "learning_rate": 2.0640450842553878e-05, "loss": 0.0817, "step": 1150 }, { "epoch": 0.51, "grad_norm": 0.7581094769479132, "learning_rate": 2.061199545497536e-05, "loss": 0.116, "step": 1151 }, { "epoch": 0.51, "grad_norm": 0.6698391254355283, "learning_rate": 2.05835388273354e-05, "loss": 0.1062, "step": 1152 }, { "epoch": 0.51, "grad_norm": 0.5222549174886676, "learning_rate": 2.055508101729451e-05, "loss": 0.0777, "step": 1153 }, { "epoch": 0.51, "grad_norm": 0.6312787106094057, "learning_rate": 2.0526622082515586e-05, "loss": 0.0953, "step": 1154 }, { "epoch": 0.51, "grad_norm": 0.5789716552358399, "learning_rate": 2.04981620806638e-05, "loss": 0.092, "step": 1155 }, { "epoch": 0.51, "grad_norm": 0.65169362548601, "learning_rate": 2.0469701069406514e-05, "loss": 0.0954, "step": 1156 }, { "epoch": 0.51, "grad_norm": 0.5311093985801728, "learning_rate": 2.044123910641309e-05, "loss": 0.0856, "step": 1157 }, { "epoch": 0.51, "grad_norm": 0.6942166898028438, "learning_rate": 2.041277624935486e-05, "loss": 0.1036, "step": 1158 }, { "epoch": 0.51, "grad_norm": 0.5856624211881358, "learning_rate": 2.038431255590495e-05, "loss": 0.092, "step": 1159 }, { "epoch": 0.51, "grad_norm": 0.5964762189366207, "learning_rate": 2.035584808373817e-05, "loss": 0.0843, "step": 1160 }, { "epoch": 0.51, "grad_norm": 0.5645446439157702, "learning_rate": 2.032738289053093e-05, "loss": 0.0733, "step": 1161 }, { "epoch": 0.51, "grad_norm": 0.6638667322950158, "learning_rate": 2.0298917033961088e-05, "loss": 0.0975, "step": 1162 }, { "epoch": 0.51, "grad_norm": 0.637890515674925, "learning_rate": 2.0270450571707844e-05, "loss": 0.0841, "step": 1163 }, { "epoch": 0.51, "grad_norm": 0.5978983968231865, "learning_rate": 2.024198356145164e-05, "loss": 0.0853, "step": 1164 }, { "epoch": 0.51, "grad_norm": 0.6069890777195964, "learning_rate": 2.0213516060874e-05, "loss": 0.0935, "step": 1165 }, { "epoch": 0.51, "grad_norm": 0.6551522201097544, "learning_rate": 2.0185048127657483e-05, "loss": 0.1017, "step": 1166 }, { "epoch": 0.51, "grad_norm": 0.6915119111958444, "learning_rate": 2.015657981948548e-05, "loss": 0.1, "step": 1167 }, { "epoch": 0.51, "grad_norm": 0.6562823787149286, "learning_rate": 2.0128111194042185e-05, "loss": 0.0947, "step": 1168 }, { "epoch": 0.51, "grad_norm": 0.6346174096417518, "learning_rate": 2.009964230901239e-05, "loss": 0.086, "step": 1169 }, { "epoch": 0.51, "grad_norm": 0.5676272943967388, "learning_rate": 2.007117322208145e-05, "loss": 0.0735, "step": 1170 }, { "epoch": 0.51, "grad_norm": 0.607633302450991, "learning_rate": 2.0042703990935112e-05, "loss": 0.0957, "step": 1171 }, { "epoch": 0.51, "grad_norm": 0.6582471793172623, "learning_rate": 2.0014234673259417e-05, "loss": 0.0929, "step": 1172 }, { "epoch": 0.52, "grad_norm": 0.6217172240425245, "learning_rate": 1.9985765326740593e-05, "loss": 0.0788, "step": 1173 }, { "epoch": 0.52, "grad_norm": 0.6336844053688565, "learning_rate": 1.9957296009064894e-05, "loss": 0.1047, "step": 1174 }, { "epoch": 0.52, "grad_norm": 0.6038896210467265, "learning_rate": 1.9928826777918558e-05, "loss": 0.0878, "step": 1175 }, { "epoch": 0.52, "grad_norm": 0.5792976291931964, "learning_rate": 1.9900357690987614e-05, "loss": 0.0801, "step": 1176 }, { "epoch": 0.52, "grad_norm": 0.6114620269282579, "learning_rate": 1.9871888805957825e-05, "loss": 0.0839, "step": 1177 }, { "epoch": 0.52, "grad_norm": 0.555507939669079, "learning_rate": 1.9843420180514524e-05, "loss": 0.086, "step": 1178 }, { "epoch": 0.52, "grad_norm": 0.6170950470258431, "learning_rate": 1.9814951872342524e-05, "loss": 0.0814, "step": 1179 }, { "epoch": 0.52, "grad_norm": 0.5399593135729566, "learning_rate": 1.9786483939126003e-05, "loss": 0.0788, "step": 1180 }, { "epoch": 0.52, "grad_norm": 0.5059068281974035, "learning_rate": 1.9758016438548374e-05, "loss": 0.0556, "step": 1181 }, { "epoch": 0.52, "grad_norm": 0.6185574116238833, "learning_rate": 1.972954942829216e-05, "loss": 0.0889, "step": 1182 }, { "epoch": 0.52, "grad_norm": 0.6541786884207524, "learning_rate": 1.970108296603892e-05, "loss": 0.093, "step": 1183 }, { "epoch": 0.52, "grad_norm": 0.6608141042173399, "learning_rate": 1.967261710946907e-05, "loss": 0.0897, "step": 1184 }, { "epoch": 0.52, "grad_norm": 0.6875397063840126, "learning_rate": 1.9644151916261834e-05, "loss": 0.0931, "step": 1185 }, { "epoch": 0.52, "grad_norm": 0.5983059910430442, "learning_rate": 1.9615687444095058e-05, "loss": 0.0878, "step": 1186 }, { "epoch": 0.52, "grad_norm": 0.5698170273301696, "learning_rate": 1.9587223750645142e-05, "loss": 0.085, "step": 1187 }, { "epoch": 0.52, "grad_norm": 0.5574099127081376, "learning_rate": 1.955876089358691e-05, "loss": 0.0833, "step": 1188 }, { "epoch": 0.52, "grad_norm": 0.5642805528663376, "learning_rate": 1.9530298930593496e-05, "loss": 0.0788, "step": 1189 }, { "epoch": 0.52, "grad_norm": 0.5549815298564003, "learning_rate": 1.9501837919336203e-05, "loss": 0.0723, "step": 1190 }, { "epoch": 0.52, "grad_norm": 0.6498168843268112, "learning_rate": 1.947337791748442e-05, "loss": 0.0915, "step": 1191 }, { "epoch": 0.52, "grad_norm": 0.5858495264768172, "learning_rate": 1.9444918982705498e-05, "loss": 0.08, "step": 1192 }, { "epoch": 0.52, "grad_norm": 0.5297490120788982, "learning_rate": 1.941646117266461e-05, "loss": 0.0731, "step": 1193 }, { "epoch": 0.52, "grad_norm": 0.5383616604699047, "learning_rate": 1.9388004545024647e-05, "loss": 0.0722, "step": 1194 }, { "epoch": 0.52, "grad_norm": 0.5518178181384963, "learning_rate": 1.935954915744613e-05, "loss": 0.0661, "step": 1195 }, { "epoch": 0.53, "grad_norm": 0.6008571353603273, "learning_rate": 1.9331095067587034e-05, "loss": 0.0794, "step": 1196 }, { "epoch": 0.53, "grad_norm": 0.5836491467841795, "learning_rate": 1.9302642333102742e-05, "loss": 0.0843, "step": 1197 }, { "epoch": 0.53, "grad_norm": 0.6026247974404115, "learning_rate": 1.9274191011645854e-05, "loss": 0.0825, "step": 1198 }, { "epoch": 0.53, "grad_norm": 0.6281679451908263, "learning_rate": 1.924574116086612e-05, "loss": 0.0932, "step": 1199 }, { "epoch": 0.53, "grad_norm": 0.5872218488114412, "learning_rate": 1.9217292838410323e-05, "loss": 0.0982, "step": 1200 }, { "epoch": 0.53, "grad_norm": 0.604208588515946, "learning_rate": 1.918884610192214e-05, "loss": 0.0865, "step": 1201 }, { "epoch": 0.53, "grad_norm": 0.5959960467460272, "learning_rate": 1.9160401009042026e-05, "loss": 0.0846, "step": 1202 }, { "epoch": 0.53, "grad_norm": 0.5787832276134567, "learning_rate": 1.9131957617407108e-05, "loss": 0.0779, "step": 1203 }, { "epoch": 0.53, "grad_norm": 0.5806539997213049, "learning_rate": 1.9103515984651083e-05, "loss": 0.0891, "step": 1204 }, { "epoch": 0.53, "grad_norm": 0.6008104354456258, "learning_rate": 1.9075076168404072e-05, "loss": 0.0717, "step": 1205 }, { "epoch": 0.53, "grad_norm": 0.5691428797774726, "learning_rate": 1.90466382262925e-05, "loss": 0.0753, "step": 1206 }, { "epoch": 0.53, "grad_norm": 0.7163433334362023, "learning_rate": 1.9018202215939025e-05, "loss": 0.0783, "step": 1207 }, { "epoch": 0.53, "grad_norm": 0.5180223681745939, "learning_rate": 1.8989768194962364e-05, "loss": 0.0671, "step": 1208 }, { "epoch": 0.53, "grad_norm": 0.5867089851064587, "learning_rate": 1.8961336220977234e-05, "loss": 0.0761, "step": 1209 }, { "epoch": 0.53, "grad_norm": 0.5389628110841415, "learning_rate": 1.8932906351594163e-05, "loss": 0.0681, "step": 1210 }, { "epoch": 0.53, "grad_norm": 0.6117381466729318, "learning_rate": 1.890447864441944e-05, "loss": 0.0809, "step": 1211 }, { "epoch": 0.53, "grad_norm": 0.5261922060657153, "learning_rate": 1.8876053157054977e-05, "loss": 0.067, "step": 1212 }, { "epoch": 0.53, "grad_norm": 0.5583801394623797, "learning_rate": 1.8847629947098182e-05, "loss": 0.0669, "step": 1213 }, { "epoch": 0.53, "grad_norm": 0.5498307867275458, "learning_rate": 1.8819209072141832e-05, "loss": 0.0712, "step": 1214 }, { "epoch": 0.53, "grad_norm": 0.5492813031318701, "learning_rate": 1.8790790589774e-05, "loss": 0.0733, "step": 1215 }, { "epoch": 0.53, "grad_norm": 0.70302199562901, "learning_rate": 1.8762374557577893e-05, "loss": 0.0976, "step": 1216 }, { "epoch": 0.53, "grad_norm": 0.5757386728846309, "learning_rate": 1.8733961033131765e-05, "loss": 0.0789, "step": 1217 }, { "epoch": 0.54, "grad_norm": 0.5603921656824724, "learning_rate": 1.8705550074008768e-05, "loss": 0.0777, "step": 1218 }, { "epoch": 0.54, "grad_norm": 0.4686210295955937, "learning_rate": 1.8677141737776885e-05, "loss": 0.0598, "step": 1219 }, { "epoch": 0.54, "grad_norm": 0.5726128784242391, "learning_rate": 1.8648736081998757e-05, "loss": 0.0797, "step": 1220 }, { "epoch": 0.54, "grad_norm": 0.6197974875254266, "learning_rate": 1.862033316423162e-05, "loss": 0.0964, "step": 1221 }, { "epoch": 0.54, "grad_norm": 0.5551221783073881, "learning_rate": 1.8591933042027136e-05, "loss": 0.0759, "step": 1222 }, { "epoch": 0.54, "grad_norm": 0.6281513635597044, "learning_rate": 1.856353577293132e-05, "loss": 0.0866, "step": 1223 }, { "epoch": 0.54, "grad_norm": 0.4604056409519657, "learning_rate": 1.85351414144844e-05, "loss": 0.0679, "step": 1224 }, { "epoch": 0.54, "grad_norm": 0.5592260983014418, "learning_rate": 1.850675002422071e-05, "loss": 0.0751, "step": 1225 }, { "epoch": 0.54, "grad_norm": 0.5688762870426995, "learning_rate": 1.8478361659668557e-05, "loss": 0.0784, "step": 1226 }, { "epoch": 0.54, "grad_norm": 0.5265628543026238, "learning_rate": 1.8449976378350135e-05, "loss": 0.0713, "step": 1227 }, { "epoch": 0.54, "grad_norm": 0.5807377824586111, "learning_rate": 1.8421594237781376e-05, "loss": 0.0787, "step": 1228 }, { "epoch": 0.54, "grad_norm": 0.5332728469593431, "learning_rate": 1.839321529547187e-05, "loss": 0.0748, "step": 1229 }, { "epoch": 0.54, "grad_norm": 0.572171148111313, "learning_rate": 1.836483960892469e-05, "loss": 0.0797, "step": 1230 }, { "epoch": 0.54, "grad_norm": 0.6327044076790445, "learning_rate": 1.8336467235636342e-05, "loss": 0.0775, "step": 1231 }, { "epoch": 0.54, "grad_norm": 0.7370608299130998, "learning_rate": 1.8308098233096606e-05, "loss": 0.1043, "step": 1232 }, { "epoch": 0.54, "grad_norm": 0.5857159259196653, "learning_rate": 1.8279732658788448e-05, "loss": 0.0815, "step": 1233 }, { "epoch": 0.54, "grad_norm": 0.6453838142725543, "learning_rate": 1.825137057018786e-05, "loss": 0.0757, "step": 1234 }, { "epoch": 0.54, "grad_norm": 0.6313227082860273, "learning_rate": 1.8223012024763785e-05, "loss": 0.0925, "step": 1235 }, { "epoch": 0.54, "grad_norm": 0.5802651306194613, "learning_rate": 1.8194657079978e-05, "loss": 0.0805, "step": 1236 }, { "epoch": 0.54, "grad_norm": 0.6053606982876246, "learning_rate": 1.816630579328497e-05, "loss": 0.092, "step": 1237 }, { "epoch": 0.54, "grad_norm": 0.6995796465667706, "learning_rate": 1.8137958222131737e-05, "loss": 0.0954, "step": 1238 }, { "epoch": 0.54, "grad_norm": 0.6153962421923618, "learning_rate": 1.8109614423957844e-05, "loss": 0.0904, "step": 1239 }, { "epoch": 0.54, "grad_norm": 0.5684209573948513, "learning_rate": 1.8081274456195165e-05, "loss": 0.0975, "step": 1240 }, { "epoch": 0.55, "grad_norm": 0.5585006689667004, "learning_rate": 1.8052938376267836e-05, "loss": 0.0889, "step": 1241 }, { "epoch": 0.55, "grad_norm": 0.5443768527309278, "learning_rate": 1.8024606241592086e-05, "loss": 0.0717, "step": 1242 }, { "epoch": 0.55, "grad_norm": 0.5721251539913984, "learning_rate": 1.7996278109576162e-05, "loss": 0.0681, "step": 1243 }, { "epoch": 0.55, "grad_norm": 0.5407967520638141, "learning_rate": 1.796795403762021e-05, "loss": 0.0671, "step": 1244 }, { "epoch": 0.55, "grad_norm": 0.6383969476399195, "learning_rate": 1.7939634083116148e-05, "loss": 0.0846, "step": 1245 }, { "epoch": 0.55, "grad_norm": 0.5887117196453053, "learning_rate": 1.7911318303447532e-05, "loss": 0.0864, "step": 1246 }, { "epoch": 0.55, "grad_norm": 0.6696058144996024, "learning_rate": 1.7883006755989477e-05, "loss": 0.0916, "step": 1247 }, { "epoch": 0.55, "grad_norm": 0.5987481508644554, "learning_rate": 1.785469949810852e-05, "loss": 0.0786, "step": 1248 }, { "epoch": 0.55, "grad_norm": 0.5785947643809913, "learning_rate": 1.7826396587162506e-05, "loss": 0.0757, "step": 1249 }, { "epoch": 0.55, "grad_norm": 0.581992640910864, "learning_rate": 1.779809808050046e-05, "loss": 0.0654, "step": 1250 }, { "epoch": 0.55, "grad_norm": 0.6043560262974035, "learning_rate": 1.7769804035462496e-05, "loss": 0.0807, "step": 1251 }, { "epoch": 0.55, "grad_norm": 0.6427042179398389, "learning_rate": 1.7741514509379686e-05, "loss": 0.0746, "step": 1252 }, { "epoch": 0.55, "grad_norm": 0.5697516727766092, "learning_rate": 1.771322955957395e-05, "loss": 0.0759, "step": 1253 }, { "epoch": 0.55, "grad_norm": 0.5471756789729559, "learning_rate": 1.7684949243357917e-05, "loss": 0.0694, "step": 1254 }, { "epoch": 0.55, "grad_norm": 0.5251365529446473, "learning_rate": 1.7656673618034838e-05, "loss": 0.0711, "step": 1255 }, { "epoch": 0.55, "grad_norm": 0.5503967512235156, "learning_rate": 1.7628402740898473e-05, "loss": 0.0654, "step": 1256 }, { "epoch": 0.55, "grad_norm": 0.550073182049, "learning_rate": 1.760013666923294e-05, "loss": 0.0752, "step": 1257 }, { "epoch": 0.55, "grad_norm": 0.5875679473949055, "learning_rate": 1.757187546031262e-05, "loss": 0.0815, "step": 1258 }, { "epoch": 0.55, "grad_norm": 0.674427374148578, "learning_rate": 1.7543619171402055e-05, "loss": 0.0961, "step": 1259 }, { "epoch": 0.55, "grad_norm": 0.5522310371268908, "learning_rate": 1.7515367859755817e-05, "loss": 0.072, "step": 1260 }, { "epoch": 0.55, "grad_norm": 0.521163641776389, "learning_rate": 1.7487121582618384e-05, "loss": 0.0689, "step": 1261 }, { "epoch": 0.55, "grad_norm": 0.5071607956169868, "learning_rate": 1.7458880397224024e-05, "loss": 0.0753, "step": 1262 }, { "epoch": 0.55, "grad_norm": 0.501395364644977, "learning_rate": 1.7430644360796712e-05, "loss": 0.0684, "step": 1263 }, { "epoch": 0.56, "grad_norm": 0.6518480393347105, "learning_rate": 1.740241353054997e-05, "loss": 0.0779, "step": 1264 }, { "epoch": 0.56, "grad_norm": 0.5604717190964015, "learning_rate": 1.7374187963686794e-05, "loss": 0.0766, "step": 1265 }, { "epoch": 0.56, "grad_norm": 0.5778116459528139, "learning_rate": 1.734596771739948e-05, "loss": 0.0879, "step": 1266 }, { "epoch": 0.56, "grad_norm": 0.5879159712792963, "learning_rate": 1.7317752848869566e-05, "loss": 0.0714, "step": 1267 }, { "epoch": 0.56, "grad_norm": 0.6357135443966311, "learning_rate": 1.72895434152677e-05, "loss": 0.1009, "step": 1268 }, { "epoch": 0.56, "grad_norm": 0.5092331601047306, "learning_rate": 1.7261339473753503e-05, "loss": 0.0752, "step": 1269 }, { "epoch": 0.56, "grad_norm": 0.5096042129042284, "learning_rate": 1.7233141081475467e-05, "loss": 0.0705, "step": 1270 }, { "epoch": 0.56, "grad_norm": 0.516345791877376, "learning_rate": 1.7204948295570846e-05, "loss": 0.07, "step": 1271 }, { "epoch": 0.56, "grad_norm": 0.5678407590444183, "learning_rate": 1.7176761173165533e-05, "loss": 0.0752, "step": 1272 }, { "epoch": 0.56, "grad_norm": 0.47189207804008926, "learning_rate": 1.714857977137395e-05, "loss": 0.0636, "step": 1273 }, { "epoch": 0.56, "grad_norm": 0.6868549051373867, "learning_rate": 1.7120404147298915e-05, "loss": 0.0927, "step": 1274 }, { "epoch": 0.56, "grad_norm": 0.6366339082886703, "learning_rate": 1.709223435803155e-05, "loss": 0.0996, "step": 1275 }, { "epoch": 0.56, "grad_norm": 0.4972865415670463, "learning_rate": 1.7064070460651146e-05, "loss": 0.0692, "step": 1276 }, { "epoch": 0.56, "grad_norm": 0.6448352443613837, "learning_rate": 1.7035912512225072e-05, "loss": 0.0829, "step": 1277 }, { "epoch": 0.56, "grad_norm": 0.5754316844909417, "learning_rate": 1.700776056980862e-05, "loss": 0.0731, "step": 1278 }, { "epoch": 0.56, "grad_norm": 0.5501939875783463, "learning_rate": 1.697961469044492e-05, "loss": 0.0739, "step": 1279 }, { "epoch": 0.56, "grad_norm": 0.5463909459227538, "learning_rate": 1.695147493116483e-05, "loss": 0.071, "step": 1280 }, { "epoch": 0.56, "grad_norm": 0.5260412449095392, "learning_rate": 1.69233413489868e-05, "loss": 0.0766, "step": 1281 }, { "epoch": 0.56, "grad_norm": 0.6219455228527094, "learning_rate": 1.689521400091674e-05, "loss": 0.0985, "step": 1282 }, { "epoch": 0.56, "grad_norm": 0.5681794396263968, "learning_rate": 1.6867092943947968e-05, "loss": 0.0676, "step": 1283 }, { "epoch": 0.56, "grad_norm": 0.5516490727852738, "learning_rate": 1.6838978235061027e-05, "loss": 0.0874, "step": 1284 }, { "epoch": 0.56, "grad_norm": 0.5326910669211165, "learning_rate": 1.6810869931223618e-05, "loss": 0.0706, "step": 1285 }, { "epoch": 0.56, "grad_norm": 0.5037451789757785, "learning_rate": 1.6782768089390423e-05, "loss": 0.0658, "step": 1286 }, { "epoch": 0.57, "grad_norm": 0.46793926381751505, "learning_rate": 1.6754672766503086e-05, "loss": 0.061, "step": 1287 }, { "epoch": 0.57, "grad_norm": 0.5626231429623033, "learning_rate": 1.6726584019489997e-05, "loss": 0.0774, "step": 1288 }, { "epoch": 0.57, "grad_norm": 0.7848036244649923, "learning_rate": 1.6698501905266255e-05, "loss": 0.1137, "step": 1289 }, { "epoch": 0.57, "grad_norm": 0.5633639218087783, "learning_rate": 1.6670426480733488e-05, "loss": 0.0768, "step": 1290 }, { "epoch": 0.57, "grad_norm": 0.5310700336131601, "learning_rate": 1.6642357802779786e-05, "loss": 0.0652, "step": 1291 }, { "epoch": 0.57, "grad_norm": 0.5377203223650573, "learning_rate": 1.661429592827958e-05, "loss": 0.0712, "step": 1292 }, { "epoch": 0.57, "grad_norm": 0.5725437048508926, "learning_rate": 1.658624091409349e-05, "loss": 0.0672, "step": 1293 }, { "epoch": 0.57, "grad_norm": 0.5654712984523599, "learning_rate": 1.6558192817068244e-05, "loss": 0.0762, "step": 1294 }, { "epoch": 0.57, "grad_norm": 0.5927319331462616, "learning_rate": 1.6530151694036566e-05, "loss": 0.0739, "step": 1295 }, { "epoch": 0.57, "grad_norm": 0.5799634929381519, "learning_rate": 1.6502117601817037e-05, "loss": 0.0778, "step": 1296 }, { "epoch": 0.57, "grad_norm": 0.49820842740696425, "learning_rate": 1.6474090597214005e-05, "loss": 0.0678, "step": 1297 }, { "epoch": 0.57, "grad_norm": 0.5244770829109788, "learning_rate": 1.6446070737017428e-05, "loss": 0.0622, "step": 1298 }, { "epoch": 0.57, "grad_norm": 0.6460961043143132, "learning_rate": 1.641805807800282e-05, "loss": 0.0971, "step": 1299 }, { "epoch": 0.57, "grad_norm": 0.5639828897866962, "learning_rate": 1.639005267693108e-05, "loss": 0.0749, "step": 1300 }, { "epoch": 0.57, "grad_norm": 0.4868886543925225, "learning_rate": 1.6362054590548424e-05, "loss": 0.0675, "step": 1301 }, { "epoch": 0.57, "grad_norm": 0.6240612609575911, "learning_rate": 1.633406387558622e-05, "loss": 0.0861, "step": 1302 }, { "epoch": 0.57, "grad_norm": 0.5323831870225858, "learning_rate": 1.6306080588760918e-05, "loss": 0.0707, "step": 1303 }, { "epoch": 0.57, "grad_norm": 0.5318854122285757, "learning_rate": 1.627810478677391e-05, "loss": 0.0706, "step": 1304 }, { "epoch": 0.57, "grad_norm": 0.4965845818768228, "learning_rate": 1.6250136526311427e-05, "loss": 0.0618, "step": 1305 }, { "epoch": 0.57, "grad_norm": 0.591963565125671, "learning_rate": 1.6222175864044408e-05, "loss": 0.0733, "step": 1306 }, { "epoch": 0.57, "grad_norm": 0.5704172377888858, "learning_rate": 1.619422285662841e-05, "loss": 0.0666, "step": 1307 }, { "epoch": 0.57, "grad_norm": 0.5425053547010417, "learning_rate": 1.616627756070346e-05, "loss": 0.0684, "step": 1308 }, { "epoch": 0.58, "grad_norm": 0.5677037603444928, "learning_rate": 1.613834003289399e-05, "loss": 0.0681, "step": 1309 }, { "epoch": 0.58, "grad_norm": 0.576695737459713, "learning_rate": 1.611041032980866e-05, "loss": 0.0752, "step": 1310 }, { "epoch": 0.58, "grad_norm": 0.5717844323322341, "learning_rate": 1.6082488508040287e-05, "loss": 0.069, "step": 1311 }, { "epoch": 0.58, "grad_norm": 0.562201500209936, "learning_rate": 1.6054574624165734e-05, "loss": 0.065, "step": 1312 }, { "epoch": 0.58, "grad_norm": 0.639612700650674, "learning_rate": 1.602666873474576e-05, "loss": 0.0783, "step": 1313 }, { "epoch": 0.58, "grad_norm": 0.4687652108585518, "learning_rate": 1.5998770896324923e-05, "loss": 0.0665, "step": 1314 }, { "epoch": 0.58, "grad_norm": 0.5092110668444348, "learning_rate": 1.5970881165431475e-05, "loss": 0.0618, "step": 1315 }, { "epoch": 0.58, "grad_norm": 0.5730615750477063, "learning_rate": 1.5942999598577252e-05, "loss": 0.0791, "step": 1316 }, { "epoch": 0.58, "grad_norm": 0.5582806169712888, "learning_rate": 1.5915126252257532e-05, "loss": 0.0829, "step": 1317 }, { "epoch": 0.58, "grad_norm": 0.5097952063151407, "learning_rate": 1.5887261182950934e-05, "loss": 0.0656, "step": 1318 }, { "epoch": 0.58, "grad_norm": 0.548208491013843, "learning_rate": 1.585940444711932e-05, "loss": 0.0736, "step": 1319 }, { "epoch": 0.58, "grad_norm": 0.5961196693794795, "learning_rate": 1.583155610120765e-05, "loss": 0.0897, "step": 1320 }, { "epoch": 0.58, "grad_norm": 0.5563829059626195, "learning_rate": 1.5803716201643907e-05, "loss": 0.0623, "step": 1321 }, { "epoch": 0.58, "grad_norm": 0.6595978717282083, "learning_rate": 1.5775884804838925e-05, "loss": 0.0933, "step": 1322 }, { "epoch": 0.58, "grad_norm": 0.6156654422082103, "learning_rate": 1.5748061967186335e-05, "loss": 0.0898, "step": 1323 }, { "epoch": 0.58, "grad_norm": 0.5203592972430782, "learning_rate": 1.5720247745062426e-05, "loss": 0.0627, "step": 1324 }, { "epoch": 0.58, "grad_norm": 0.5529190671514603, "learning_rate": 1.5692442194826022e-05, "loss": 0.0744, "step": 1325 }, { "epoch": 0.58, "grad_norm": 0.6017471872176015, "learning_rate": 1.566464537281837e-05, "loss": 0.0825, "step": 1326 }, { "epoch": 0.58, "grad_norm": 0.5788941293289271, "learning_rate": 1.5636857335363026e-05, "loss": 0.0775, "step": 1327 }, { "epoch": 0.58, "grad_norm": 0.48818132522461893, "learning_rate": 1.5609078138765778e-05, "loss": 0.0615, "step": 1328 }, { "epoch": 0.58, "grad_norm": 0.6327490598173477, "learning_rate": 1.5581307839314473e-05, "loss": 0.0764, "step": 1329 }, { "epoch": 0.58, "grad_norm": 0.6345714095914262, "learning_rate": 1.555354649327893e-05, "loss": 0.0897, "step": 1330 }, { "epoch": 0.58, "grad_norm": 0.5330941504601002, "learning_rate": 1.5525794156910837e-05, "loss": 0.0624, "step": 1331 }, { "epoch": 0.59, "grad_norm": 0.5501241348804137, "learning_rate": 1.5498050886443618e-05, "loss": 0.0754, "step": 1332 }, { "epoch": 0.59, "grad_norm": 0.5651742625088397, "learning_rate": 1.5470316738092347e-05, "loss": 0.0704, "step": 1333 }, { "epoch": 0.59, "grad_norm": 0.5558169779689902, "learning_rate": 1.5442591768053575e-05, "loss": 0.0769, "step": 1334 }, { "epoch": 0.59, "grad_norm": 0.5510807981339998, "learning_rate": 1.541487603250529e-05, "loss": 0.0737, "step": 1335 }, { "epoch": 0.59, "grad_norm": 0.6284987272524333, "learning_rate": 1.5387169587606758e-05, "loss": 0.0841, "step": 1336 }, { "epoch": 0.59, "grad_norm": 0.6024778758533725, "learning_rate": 1.535947248949842e-05, "loss": 0.0789, "step": 1337 }, { "epoch": 0.59, "grad_norm": 0.4991837335523023, "learning_rate": 1.5331784794301766e-05, "loss": 0.0639, "step": 1338 }, { "epoch": 0.59, "grad_norm": 0.6082549928710319, "learning_rate": 1.5304106558119253e-05, "loss": 0.0817, "step": 1339 }, { "epoch": 0.59, "grad_norm": 0.5496936638569455, "learning_rate": 1.5276437837034157e-05, "loss": 0.0764, "step": 1340 }, { "epoch": 0.59, "grad_norm": 0.5974821946751186, "learning_rate": 1.5248778687110496e-05, "loss": 0.0846, "step": 1341 }, { "epoch": 0.59, "grad_norm": 0.49677522665224716, "learning_rate": 1.5221129164392854e-05, "loss": 0.0613, "step": 1342 }, { "epoch": 0.59, "grad_norm": 0.529580409834521, "learning_rate": 1.5193489324906342e-05, "loss": 0.0648, "step": 1343 }, { "epoch": 0.59, "grad_norm": 0.5280356777430179, "learning_rate": 1.5165859224656439e-05, "loss": 0.075, "step": 1344 }, { "epoch": 0.59, "grad_norm": 0.46947142399182096, "learning_rate": 1.5138238919628898e-05, "loss": 0.0626, "step": 1345 }, { "epoch": 0.59, "grad_norm": 0.4762087900749942, "learning_rate": 1.511062846578961e-05, "loss": 0.0618, "step": 1346 }, { "epoch": 0.59, "grad_norm": 0.5298525437773021, "learning_rate": 1.5083027919084506e-05, "loss": 0.0736, "step": 1347 }, { "epoch": 0.59, "grad_norm": 0.5736667808733569, "learning_rate": 1.5055437335439458e-05, "loss": 0.0783, "step": 1348 }, { "epoch": 0.59, "grad_norm": 0.4937924300444779, "learning_rate": 1.5027856770760143e-05, "loss": 0.0594, "step": 1349 }, { "epoch": 0.59, "grad_norm": 0.4476221654142289, "learning_rate": 1.5000286280931919e-05, "loss": 0.0535, "step": 1350 }, { "epoch": 0.59, "grad_norm": 0.5688790374089545, "learning_rate": 1.4972725921819761e-05, "loss": 0.0829, "step": 1351 }, { "epoch": 0.59, "grad_norm": 0.4970457449427074, "learning_rate": 1.4945175749268093e-05, "loss": 0.0695, "step": 1352 }, { "epoch": 0.59, "grad_norm": 0.4795983812632139, "learning_rate": 1.4917635819100716e-05, "loss": 0.0544, "step": 1353 }, { "epoch": 0.59, "grad_norm": 0.5964437414692382, "learning_rate": 1.4890106187120657e-05, "loss": 0.0637, "step": 1354 }, { "epoch": 0.6, "grad_norm": 0.5104470503450661, "learning_rate": 1.486258690911009e-05, "loss": 0.0602, "step": 1355 }, { "epoch": 0.6, "grad_norm": 0.513457423942091, "learning_rate": 1.4835078040830201e-05, "loss": 0.0644, "step": 1356 }, { "epoch": 0.6, "grad_norm": 0.5119207725972943, "learning_rate": 1.4807579638021108e-05, "loss": 0.0726, "step": 1357 }, { "epoch": 0.6, "grad_norm": 0.5196004548982651, "learning_rate": 1.478009175640168e-05, "loss": 0.058, "step": 1358 }, { "epoch": 0.6, "grad_norm": 0.5695646425355706, "learning_rate": 1.4752614451669497e-05, "loss": 0.0681, "step": 1359 }, { "epoch": 0.6, "grad_norm": 0.5670585559315535, "learning_rate": 1.4725147779500709e-05, "loss": 0.0659, "step": 1360 }, { "epoch": 0.6, "grad_norm": 0.5072878548271995, "learning_rate": 1.4697691795549914e-05, "loss": 0.0639, "step": 1361 }, { "epoch": 0.6, "grad_norm": 0.6146300481294531, "learning_rate": 1.4670246555450036e-05, "loss": 0.0762, "step": 1362 }, { "epoch": 0.6, "grad_norm": 0.5080206811104377, "learning_rate": 1.4642812114812266e-05, "loss": 0.0655, "step": 1363 }, { "epoch": 0.6, "grad_norm": 0.5704523463778471, "learning_rate": 1.4615388529225878e-05, "loss": 0.0736, "step": 1364 }, { "epoch": 0.6, "grad_norm": 0.586778317207418, "learning_rate": 1.4587975854258182e-05, "loss": 0.0788, "step": 1365 }, { "epoch": 0.6, "grad_norm": 0.5494047509254282, "learning_rate": 1.4560574145454349e-05, "loss": 0.0656, "step": 1366 }, { "epoch": 0.6, "grad_norm": 0.5628054107609263, "learning_rate": 1.4533183458337346e-05, "loss": 0.0714, "step": 1367 }, { "epoch": 0.6, "grad_norm": 0.5240579031445247, "learning_rate": 1.4505803848407811e-05, "loss": 0.0694, "step": 1368 }, { "epoch": 0.6, "grad_norm": 0.4935470755578461, "learning_rate": 1.4478435371143933e-05, "loss": 0.059, "step": 1369 }, { "epoch": 0.6, "grad_norm": 0.5074143242512802, "learning_rate": 1.4451078082001334e-05, "loss": 0.066, "step": 1370 }, { "epoch": 0.6, "grad_norm": 0.4615146127600527, "learning_rate": 1.4423732036412972e-05, "loss": 0.0616, "step": 1371 }, { "epoch": 0.6, "grad_norm": 0.5758660490823785, "learning_rate": 1.4396397289789033e-05, "loss": 0.0746, "step": 1372 }, { "epoch": 0.6, "grad_norm": 0.482689169529526, "learning_rate": 1.4369073897516801e-05, "loss": 0.0664, "step": 1373 }, { "epoch": 0.6, "grad_norm": 0.6377582606607186, "learning_rate": 1.4341761914960536e-05, "loss": 0.0814, "step": 1374 }, { "epoch": 0.6, "grad_norm": 0.6221684927841612, "learning_rate": 1.4314461397461411e-05, "loss": 0.0705, "step": 1375 }, { "epoch": 0.6, "grad_norm": 0.5273820933777924, "learning_rate": 1.4287172400337344e-05, "loss": 0.0716, "step": 1376 }, { "epoch": 0.6, "grad_norm": 0.46155676218959085, "learning_rate": 1.4259894978882925e-05, "loss": 0.0555, "step": 1377 }, { "epoch": 0.61, "grad_norm": 0.5166930206048198, "learning_rate": 1.423262918836927e-05, "loss": 0.0609, "step": 1378 }, { "epoch": 0.61, "grad_norm": 0.6252336303828528, "learning_rate": 1.4205375084043937e-05, "loss": 0.0763, "step": 1379 }, { "epoch": 0.61, "grad_norm": 0.6196759433794099, "learning_rate": 1.4178132721130822e-05, "loss": 0.0763, "step": 1380 }, { "epoch": 0.61, "grad_norm": 0.5743326087226805, "learning_rate": 1.4150902154830007e-05, "loss": 0.0664, "step": 1381 }, { "epoch": 0.61, "grad_norm": 0.758656454588549, "learning_rate": 1.4123683440317672e-05, "loss": 0.0922, "step": 1382 }, { "epoch": 0.61, "grad_norm": 0.6240512083557352, "learning_rate": 1.409647663274599e-05, "loss": 0.0839, "step": 1383 }, { "epoch": 0.61, "grad_norm": 0.49589782510473984, "learning_rate": 1.406928178724301e-05, "loss": 0.0646, "step": 1384 }, { "epoch": 0.61, "grad_norm": 0.5114308486463077, "learning_rate": 1.4042098958912544e-05, "loss": 0.0663, "step": 1385 }, { "epoch": 0.61, "grad_norm": 0.49962220223955534, "learning_rate": 1.4014928202834033e-05, "loss": 0.0696, "step": 1386 }, { "epoch": 0.61, "grad_norm": 0.48821323684033957, "learning_rate": 1.3987769574062489e-05, "loss": 0.0633, "step": 1387 }, { "epoch": 0.61, "grad_norm": 0.5471686528487715, "learning_rate": 1.3960623127628318e-05, "loss": 0.0708, "step": 1388 }, { "epoch": 0.61, "grad_norm": 0.4816597698827779, "learning_rate": 1.3933488918537275e-05, "loss": 0.0594, "step": 1389 }, { "epoch": 0.61, "grad_norm": 0.601283937832739, "learning_rate": 1.390636700177029e-05, "loss": 0.09, "step": 1390 }, { "epoch": 0.61, "grad_norm": 0.5214832320182933, "learning_rate": 1.3879257432283395e-05, "loss": 0.0823, "step": 1391 }, { "epoch": 0.61, "grad_norm": 0.6114555201771721, "learning_rate": 1.3852160265007615e-05, "loss": 0.08, "step": 1392 }, { "epoch": 0.61, "grad_norm": 0.5601629610603993, "learning_rate": 1.3825075554848833e-05, "loss": 0.0752, "step": 1393 }, { "epoch": 0.61, "grad_norm": 0.5286074156182915, "learning_rate": 1.3798003356687684e-05, "loss": 0.0632, "step": 1394 }, { "epoch": 0.61, "grad_norm": 0.482691268761955, "learning_rate": 1.3770943725379468e-05, "loss": 0.0608, "step": 1395 }, { "epoch": 0.61, "grad_norm": 0.5486653313890913, "learning_rate": 1.3743896715754006e-05, "loss": 0.0625, "step": 1396 }, { "epoch": 0.61, "grad_norm": 0.46245522282328194, "learning_rate": 1.3716862382615569e-05, "loss": 0.0529, "step": 1397 }, { "epoch": 0.61, "grad_norm": 0.5432219429547707, "learning_rate": 1.3689840780742695e-05, "loss": 0.0732, "step": 1398 }, { "epoch": 0.61, "grad_norm": 0.5055385735940986, "learning_rate": 1.3662831964888179e-05, "loss": 0.0606, "step": 1399 }, { "epoch": 0.61, "grad_norm": 0.45320404941705894, "learning_rate": 1.3635835989778865e-05, "loss": 0.0559, "step": 1400 }, { "epoch": 0.62, "grad_norm": 0.5726152985672619, "learning_rate": 1.3608852910115616e-05, "loss": 0.0796, "step": 1401 }, { "epoch": 0.62, "grad_norm": 0.5531328739532549, "learning_rate": 1.358188278057313e-05, "loss": 0.0682, "step": 1402 }, { "epoch": 0.62, "grad_norm": 0.5216821588352727, "learning_rate": 1.3554925655799882e-05, "loss": 0.0642, "step": 1403 }, { "epoch": 0.62, "grad_norm": 0.5795184431309143, "learning_rate": 1.3527981590418005e-05, "loss": 0.0754, "step": 1404 }, { "epoch": 0.62, "grad_norm": 0.6461872674314547, "learning_rate": 1.3501050639023157e-05, "loss": 0.0933, "step": 1405 }, { "epoch": 0.62, "grad_norm": 0.47091737991517063, "learning_rate": 1.3474132856184416e-05, "loss": 0.0519, "step": 1406 }, { "epoch": 0.62, "grad_norm": 0.48914517078684877, "learning_rate": 1.3447228296444202e-05, "loss": 0.0603, "step": 1407 }, { "epoch": 0.62, "grad_norm": 0.5590237649598507, "learning_rate": 1.3420337014318115e-05, "loss": 0.0688, "step": 1408 }, { "epoch": 0.62, "grad_norm": 0.5315021589635457, "learning_rate": 1.3393459064294879e-05, "loss": 0.0585, "step": 1409 }, { "epoch": 0.62, "grad_norm": 0.5095019773177242, "learning_rate": 1.3366594500836176e-05, "loss": 0.0649, "step": 1410 }, { "epoch": 0.62, "grad_norm": 0.4254905688364637, "learning_rate": 1.3339743378376578e-05, "loss": 0.0503, "step": 1411 }, { "epoch": 0.62, "grad_norm": 0.4762876531028997, "learning_rate": 1.3312905751323414e-05, "loss": 0.0576, "step": 1412 }, { "epoch": 0.62, "grad_norm": 0.48098996432099456, "learning_rate": 1.3286081674056693e-05, "loss": 0.056, "step": 1413 }, { "epoch": 0.62, "grad_norm": 0.6310350628147636, "learning_rate": 1.325927120092893e-05, "loss": 0.0753, "step": 1414 }, { "epoch": 0.62, "grad_norm": 0.5010802104133005, "learning_rate": 1.3232474386265096e-05, "loss": 0.0723, "step": 1415 }, { "epoch": 0.62, "grad_norm": 0.5652791878757911, "learning_rate": 1.3205691284362493e-05, "loss": 0.075, "step": 1416 }, { "epoch": 0.62, "grad_norm": 0.6125878222427363, "learning_rate": 1.3178921949490624e-05, "loss": 0.0778, "step": 1417 }, { "epoch": 0.62, "grad_norm": 0.5188315706579499, "learning_rate": 1.3152166435891091e-05, "loss": 0.0639, "step": 1418 }, { "epoch": 0.62, "grad_norm": 0.574281364712006, "learning_rate": 1.3125424797777516e-05, "loss": 0.0655, "step": 1419 }, { "epoch": 0.62, "grad_norm": 0.5556574386979573, "learning_rate": 1.309869708933538e-05, "loss": 0.0971, "step": 1420 }, { "epoch": 0.62, "grad_norm": 0.5848790081137124, "learning_rate": 1.3071983364721958e-05, "loss": 0.0678, "step": 1421 }, { "epoch": 0.62, "grad_norm": 0.5105215438477713, "learning_rate": 1.3045283678066172e-05, "loss": 0.0643, "step": 1422 }, { "epoch": 0.63, "grad_norm": 0.42989618262401624, "learning_rate": 1.3018598083468508e-05, "loss": 0.0469, "step": 1423 }, { "epoch": 0.63, "grad_norm": 0.529191038241305, "learning_rate": 1.2991926635000907e-05, "loss": 0.0686, "step": 1424 }, { "epoch": 0.63, "grad_norm": 0.5548114369511021, "learning_rate": 1.2965269386706634e-05, "loss": 0.0734, "step": 1425 }, { "epoch": 0.63, "grad_norm": 0.5569808990706505, "learning_rate": 1.293862639260018e-05, "loss": 0.0685, "step": 1426 }, { "epoch": 0.63, "grad_norm": 0.5498081295251244, "learning_rate": 1.2911997706667156e-05, "loss": 0.0642, "step": 1427 }, { "epoch": 0.63, "grad_norm": 0.5143805137607594, "learning_rate": 1.2885383382864192e-05, "loss": 0.0627, "step": 1428 }, { "epoch": 0.63, "grad_norm": 0.451187381230212, "learning_rate": 1.2858783475118806e-05, "loss": 0.0494, "step": 1429 }, { "epoch": 0.63, "grad_norm": 0.627382054471859, "learning_rate": 1.2832198037329296e-05, "loss": 0.0757, "step": 1430 }, { "epoch": 0.63, "grad_norm": 0.5565262526414638, "learning_rate": 1.2805627123364659e-05, "loss": 0.0698, "step": 1431 }, { "epoch": 0.63, "grad_norm": 0.5346361682091912, "learning_rate": 1.277907078706445e-05, "loss": 0.0648, "step": 1432 }, { "epoch": 0.63, "grad_norm": 0.5316471410572047, "learning_rate": 1.2752529082238701e-05, "loss": 0.069, "step": 1433 }, { "epoch": 0.63, "grad_norm": 0.4592231940813413, "learning_rate": 1.2726002062667777e-05, "loss": 0.063, "step": 1434 }, { "epoch": 0.63, "grad_norm": 0.4807488935093835, "learning_rate": 1.2699489782102292e-05, "loss": 0.0549, "step": 1435 }, { "epoch": 0.63, "grad_norm": 0.474771636728978, "learning_rate": 1.2672992294263012e-05, "loss": 0.0588, "step": 1436 }, { "epoch": 0.63, "grad_norm": 0.5513352928436234, "learning_rate": 1.2646509652840711e-05, "loss": 0.0667, "step": 1437 }, { "epoch": 0.63, "grad_norm": 0.48497923725459274, "learning_rate": 1.2620041911496081e-05, "loss": 0.0698, "step": 1438 }, { "epoch": 0.63, "grad_norm": 0.5538703648574059, "learning_rate": 1.2593589123859626e-05, "loss": 0.0654, "step": 1439 }, { "epoch": 0.63, "grad_norm": 0.5225901506306042, "learning_rate": 1.256715134353156e-05, "loss": 0.0765, "step": 1440 }, { "epoch": 0.63, "grad_norm": 0.47132417853910114, "learning_rate": 1.254072862408168e-05, "loss": 0.0632, "step": 1441 }, { "epoch": 0.63, "grad_norm": 0.4239780846321205, "learning_rate": 1.251432101904925e-05, "loss": 0.052, "step": 1442 }, { "epoch": 0.63, "grad_norm": 0.5239886328832903, "learning_rate": 1.2487928581942934e-05, "loss": 0.0684, "step": 1443 }, { "epoch": 0.63, "grad_norm": 0.4149127184475764, "learning_rate": 1.246155136624065e-05, "loss": 0.0551, "step": 1444 }, { "epoch": 0.63, "grad_norm": 0.5312312347239407, "learning_rate": 1.2435189425389483e-05, "loss": 0.0703, "step": 1445 }, { "epoch": 0.64, "grad_norm": 0.569598574119361, "learning_rate": 1.2408842812805549e-05, "loss": 0.0821, "step": 1446 }, { "epoch": 0.64, "grad_norm": 0.5900306910678916, "learning_rate": 1.2382511581873908e-05, "loss": 0.0776, "step": 1447 }, { "epoch": 0.64, "grad_norm": 0.5752746579117197, "learning_rate": 1.2356195785948475e-05, "loss": 0.0726, "step": 1448 }, { "epoch": 0.64, "grad_norm": 0.520116719937947, "learning_rate": 1.2329895478351874e-05, "loss": 0.0638, "step": 1449 }, { "epoch": 0.64, "grad_norm": 0.4759876348548337, "learning_rate": 1.230361071237533e-05, "loss": 0.0571, "step": 1450 }, { "epoch": 0.64, "grad_norm": 0.541569257805204, "learning_rate": 1.2277341541278606e-05, "loss": 0.0607, "step": 1451 }, { "epoch": 0.64, "grad_norm": 0.45982854602188716, "learning_rate": 1.2251088018289844e-05, "loss": 0.0487, "step": 1452 }, { "epoch": 0.64, "grad_norm": 0.5126296228378501, "learning_rate": 1.2224850196605509e-05, "loss": 0.0541, "step": 1453 }, { "epoch": 0.64, "grad_norm": 0.4613824087275177, "learning_rate": 1.2198628129390195e-05, "loss": 0.0543, "step": 1454 }, { "epoch": 0.64, "grad_norm": 0.5434933056511206, "learning_rate": 1.2172421869776632e-05, "loss": 0.0705, "step": 1455 }, { "epoch": 0.64, "grad_norm": 0.5160765260368292, "learning_rate": 1.2146231470865484e-05, "loss": 0.0544, "step": 1456 }, { "epoch": 0.64, "grad_norm": 0.540096231645682, "learning_rate": 1.21200569857253e-05, "loss": 0.0613, "step": 1457 }, { "epoch": 0.64, "grad_norm": 0.6011789758968067, "learning_rate": 1.209389846739236e-05, "loss": 0.0635, "step": 1458 }, { "epoch": 0.64, "grad_norm": 0.5224367225240505, "learning_rate": 1.2067755968870607e-05, "loss": 0.0581, "step": 1459 }, { "epoch": 0.64, "grad_norm": 0.5248040821453843, "learning_rate": 1.2041629543131523e-05, "loss": 0.0606, "step": 1460 }, { "epoch": 0.64, "grad_norm": 0.4144209983231614, "learning_rate": 1.201551924311402e-05, "loss": 0.0404, "step": 1461 }, { "epoch": 0.64, "grad_norm": 0.5389137062713839, "learning_rate": 1.1989425121724327e-05, "loss": 0.0628, "step": 1462 }, { "epoch": 0.64, "grad_norm": 0.575233885632625, "learning_rate": 1.1963347231835904e-05, "loss": 0.0695, "step": 1463 }, { "epoch": 0.64, "grad_norm": 0.5674204460965359, "learning_rate": 1.1937285626289313e-05, "loss": 0.0688, "step": 1464 }, { "epoch": 0.64, "grad_norm": 0.526437843124276, "learning_rate": 1.1911240357892135e-05, "loss": 0.0637, "step": 1465 }, { "epoch": 0.64, "grad_norm": 0.5433008838492608, "learning_rate": 1.1885211479418815e-05, "loss": 0.0585, "step": 1466 }, { "epoch": 0.64, "grad_norm": 0.6014782487403925, "learning_rate": 1.185919904361062e-05, "loss": 0.0701, "step": 1467 }, { "epoch": 0.64, "grad_norm": 0.5279424024399223, "learning_rate": 1.1833203103175483e-05, "loss": 0.0678, "step": 1468 }, { "epoch": 0.65, "grad_norm": 0.5219251505802605, "learning_rate": 1.1807223710787932e-05, "loss": 0.065, "step": 1469 }, { "epoch": 0.65, "grad_norm": 0.5075146902241396, "learning_rate": 1.1781260919088933e-05, "loss": 0.0779, "step": 1470 }, { "epoch": 0.65, "grad_norm": 0.4924361040086128, "learning_rate": 1.1755314780685838e-05, "loss": 0.0578, "step": 1471 }, { "epoch": 0.65, "grad_norm": 0.5317535258087933, "learning_rate": 1.1729385348152253e-05, "loss": 0.0606, "step": 1472 }, { "epoch": 0.65, "grad_norm": 0.5035625734777777, "learning_rate": 1.1703472674027932e-05, "loss": 0.066, "step": 1473 }, { "epoch": 0.65, "grad_norm": 0.5363263057225011, "learning_rate": 1.1677576810818658e-05, "loss": 0.0684, "step": 1474 }, { "epoch": 0.65, "grad_norm": 0.7245314728434411, "learning_rate": 1.1651697810996174e-05, "loss": 0.0874, "step": 1475 }, { "epoch": 0.65, "grad_norm": 0.4969710005381988, "learning_rate": 1.1625835726998039e-05, "loss": 0.0746, "step": 1476 }, { "epoch": 0.65, "grad_norm": 0.4682484327545659, "learning_rate": 1.159999061122754e-05, "loss": 0.0531, "step": 1477 }, { "epoch": 0.65, "grad_norm": 0.48823324348967334, "learning_rate": 1.1574162516053579e-05, "loss": 0.053, "step": 1478 }, { "epoch": 0.65, "grad_norm": 0.45931973481719013, "learning_rate": 1.1548351493810576e-05, "loss": 0.0587, "step": 1479 }, { "epoch": 0.65, "grad_norm": 0.5474528121209167, "learning_rate": 1.152255759679834e-05, "loss": 0.0663, "step": 1480 }, { "epoch": 0.65, "grad_norm": 0.49917477815540684, "learning_rate": 1.1496780877282024e-05, "loss": 0.0555, "step": 1481 }, { "epoch": 0.65, "grad_norm": 0.477078143206799, "learning_rate": 1.1471021387491912e-05, "loss": 0.0575, "step": 1482 }, { "epoch": 0.65, "grad_norm": 0.4880355521974438, "learning_rate": 1.1445279179623419e-05, "loss": 0.06, "step": 1483 }, { "epoch": 0.65, "grad_norm": 0.431461607805343, "learning_rate": 1.1419554305836922e-05, "loss": 0.0542, "step": 1484 }, { "epoch": 0.65, "grad_norm": 0.49096051425034504, "learning_rate": 1.1393846818257712e-05, "loss": 0.066, "step": 1485 }, { "epoch": 0.65, "grad_norm": 0.5017577210304115, "learning_rate": 1.1368156768975795e-05, "loss": 0.0624, "step": 1486 }, { "epoch": 0.65, "grad_norm": 0.5707201088970253, "learning_rate": 1.1342484210045871e-05, "loss": 0.0635, "step": 1487 }, { "epoch": 0.65, "grad_norm": 0.47744516022818195, "learning_rate": 1.1316829193487217e-05, "loss": 0.0489, "step": 1488 }, { "epoch": 0.65, "grad_norm": 0.44547812410176807, "learning_rate": 1.129119177128355e-05, "loss": 0.0469, "step": 1489 }, { "epoch": 0.65, "grad_norm": 0.4865327887017357, "learning_rate": 1.12655719953829e-05, "loss": 0.0543, "step": 1490 }, { "epoch": 0.65, "grad_norm": 0.5849447104825531, "learning_rate": 1.1239969917697607e-05, "loss": 0.081, "step": 1491 }, { "epoch": 0.66, "grad_norm": 0.46030429408055107, "learning_rate": 1.1214385590104105e-05, "loss": 0.0594, "step": 1492 }, { "epoch": 0.66, "grad_norm": 0.45178191322330224, "learning_rate": 1.1188819064442882e-05, "loss": 0.0495, "step": 1493 }, { "epoch": 0.66, "grad_norm": 0.5644934616723865, "learning_rate": 1.1163270392518325e-05, "loss": 0.0658, "step": 1494 }, { "epoch": 0.66, "grad_norm": 0.530986664523761, "learning_rate": 1.1137739626098689e-05, "loss": 0.0507, "step": 1495 }, { "epoch": 0.66, "grad_norm": 0.5175622037921966, "learning_rate": 1.1112226816915914e-05, "loss": 0.0565, "step": 1496 }, { "epoch": 0.66, "grad_norm": 0.5620447541842413, "learning_rate": 1.1086732016665569e-05, "loss": 0.0683, "step": 1497 }, { "epoch": 0.66, "grad_norm": 0.4601152327282511, "learning_rate": 1.1061255277006729e-05, "loss": 0.055, "step": 1498 }, { "epoch": 0.66, "grad_norm": 0.4159817032572941, "learning_rate": 1.103579664956187e-05, "loss": 0.0515, "step": 1499 }, { "epoch": 0.66, "grad_norm": 0.5264429947259384, "learning_rate": 1.1010356185916771e-05, "loss": 0.0644, "step": 1500 }, { "epoch": 0.66, "grad_norm": 0.5442533870498757, "learning_rate": 1.0984933937620408e-05, "loss": 0.0679, "step": 1501 }, { "epoch": 0.66, "grad_norm": 0.505962876737277, "learning_rate": 1.0959529956184842e-05, "loss": 0.0596, "step": 1502 }, { "epoch": 0.66, "grad_norm": 0.4199144229671258, "learning_rate": 1.0934144293085132e-05, "loss": 0.0484, "step": 1503 }, { "epoch": 0.66, "grad_norm": 0.4912776712093496, "learning_rate": 1.0908776999759199e-05, "loss": 0.0619, "step": 1504 }, { "epoch": 0.66, "grad_norm": 0.47930809884581965, "learning_rate": 1.0883428127607778e-05, "loss": 0.0611, "step": 1505 }, { "epoch": 0.66, "grad_norm": 0.46824244995568093, "learning_rate": 1.0858097727994234e-05, "loss": 0.0522, "step": 1506 }, { "epoch": 0.66, "grad_norm": 0.5169644442364393, "learning_rate": 1.0832785852244525e-05, "loss": 0.0645, "step": 1507 }, { "epoch": 0.66, "grad_norm": 0.4375088043689375, "learning_rate": 1.0807492551647086e-05, "loss": 0.0531, "step": 1508 }, { "epoch": 0.66, "grad_norm": 0.47115437258327186, "learning_rate": 1.0782217877452697e-05, "loss": 0.0533, "step": 1509 }, { "epoch": 0.66, "grad_norm": 0.5611245239367965, "learning_rate": 1.075696188087439e-05, "loss": 0.079, "step": 1510 }, { "epoch": 0.66, "grad_norm": 0.5348015047998022, "learning_rate": 1.073172461308736e-05, "loss": 0.0679, "step": 1511 }, { "epoch": 0.66, "grad_norm": 0.6581620562953735, "learning_rate": 1.0706506125228867e-05, "loss": 0.1032, "step": 1512 }, { "epoch": 0.66, "grad_norm": 0.4977498384204336, "learning_rate": 1.0681306468398107e-05, "loss": 0.0654, "step": 1513 }, { "epoch": 0.67, "grad_norm": 0.5331613153966384, "learning_rate": 1.0656125693656092e-05, "loss": 0.0575, "step": 1514 }, { "epoch": 0.67, "grad_norm": 0.5123727419492388, "learning_rate": 1.0630963852025625e-05, "loss": 0.0602, "step": 1515 }, { "epoch": 0.67, "grad_norm": 0.42119580803208856, "learning_rate": 1.060582099449111e-05, "loss": 0.0541, "step": 1516 }, { "epoch": 0.67, "grad_norm": 0.4970325687970171, "learning_rate": 1.0580697171998494e-05, "loss": 0.0557, "step": 1517 }, { "epoch": 0.67, "grad_norm": 0.5220547817558902, "learning_rate": 1.0555592435455155e-05, "loss": 0.051, "step": 1518 }, { "epoch": 0.67, "grad_norm": 0.4777865870243367, "learning_rate": 1.0530506835729794e-05, "loss": 0.0616, "step": 1519 }, { "epoch": 0.67, "grad_norm": 0.5181911940216953, "learning_rate": 1.0505440423652347e-05, "loss": 0.0661, "step": 1520 }, { "epoch": 0.67, "grad_norm": 0.47975701440371654, "learning_rate": 1.0480393250013856e-05, "loss": 0.0624, "step": 1521 }, { "epoch": 0.67, "grad_norm": 0.48496671650915757, "learning_rate": 1.0455365365566388e-05, "loss": 0.0571, "step": 1522 }, { "epoch": 0.67, "grad_norm": 0.5365806335246618, "learning_rate": 1.0430356821022926e-05, "loss": 0.055, "step": 1523 }, { "epoch": 0.67, "grad_norm": 0.5399091678184397, "learning_rate": 1.0405367667057264e-05, "loss": 0.0575, "step": 1524 }, { "epoch": 0.67, "grad_norm": 0.5527952989626782, "learning_rate": 1.0380397954303908e-05, "loss": 0.0531, "step": 1525 }, { "epoch": 0.67, "grad_norm": 0.5662272992752149, "learning_rate": 1.0355447733357966e-05, "loss": 0.0652, "step": 1526 }, { "epoch": 0.67, "grad_norm": 0.5199884396423434, "learning_rate": 1.0330517054775056e-05, "loss": 0.0653, "step": 1527 }, { "epoch": 0.67, "grad_norm": 0.421816680550613, "learning_rate": 1.0305605969071185e-05, "loss": 0.0481, "step": 1528 }, { "epoch": 0.67, "grad_norm": 0.5373142198599659, "learning_rate": 1.0280714526722693e-05, "loss": 0.0629, "step": 1529 }, { "epoch": 0.67, "grad_norm": 0.43129434979485926, "learning_rate": 1.0255842778166074e-05, "loss": 0.0429, "step": 1530 }, { "epoch": 0.67, "grad_norm": 0.47506216608614654, "learning_rate": 1.023099077379794e-05, "loss": 0.0562, "step": 1531 }, { "epoch": 0.67, "grad_norm": 0.6103303707258544, "learning_rate": 1.0206158563974906e-05, "loss": 0.0718, "step": 1532 }, { "epoch": 0.67, "grad_norm": 0.44324424694090814, "learning_rate": 1.0181346199013467e-05, "loss": 0.0514, "step": 1533 }, { "epoch": 0.67, "grad_norm": 0.42927366615711565, "learning_rate": 1.015655372918988e-05, "loss": 0.0632, "step": 1534 }, { "epoch": 0.67, "grad_norm": 0.5998290215701453, "learning_rate": 1.0131781204740147e-05, "loss": 0.0773, "step": 1535 }, { "epoch": 0.67, "grad_norm": 0.5303218250432638, "learning_rate": 1.0107028675859811e-05, "loss": 0.0626, "step": 1536 }, { "epoch": 0.68, "grad_norm": 0.44062670511226937, "learning_rate": 1.008229619270392e-05, "loss": 0.0492, "step": 1537 }, { "epoch": 0.68, "grad_norm": 0.5006748651822909, "learning_rate": 1.0057583805386875e-05, "loss": 0.0658, "step": 1538 }, { "epoch": 0.68, "grad_norm": 0.4881628884863566, "learning_rate": 1.0032891563982399e-05, "loss": 0.0548, "step": 1539 }, { "epoch": 0.68, "grad_norm": 0.48765635367272914, "learning_rate": 1.0008219518523367e-05, "loss": 0.061, "step": 1540 }, { "epoch": 0.68, "grad_norm": 0.44086995924838684, "learning_rate": 9.983567719001744e-06, "loss": 0.0577, "step": 1541 }, { "epoch": 0.68, "grad_norm": 0.4695288386735233, "learning_rate": 9.95893621536846e-06, "loss": 0.0619, "step": 1542 }, { "epoch": 0.68, "grad_norm": 0.4368920871083534, "learning_rate": 9.93432505753333e-06, "loss": 0.0509, "step": 1543 }, { "epoch": 0.68, "grad_norm": 0.49211235078189586, "learning_rate": 9.90973429536494e-06, "loss": 0.0662, "step": 1544 }, { "epoch": 0.68, "grad_norm": 0.4679386269326675, "learning_rate": 9.885163978690535e-06, "loss": 0.0504, "step": 1545 }, { "epoch": 0.68, "grad_norm": 0.441308576191756, "learning_rate": 9.860614157295975e-06, "loss": 0.0532, "step": 1546 }, { "epoch": 0.68, "grad_norm": 0.45494966781386914, "learning_rate": 9.836084880925538e-06, "loss": 0.0605, "step": 1547 }, { "epoch": 0.68, "grad_norm": 0.39461227566973295, "learning_rate": 9.811576199281898e-06, "loss": 0.0507, "step": 1548 }, { "epoch": 0.68, "grad_norm": 0.5239706029144249, "learning_rate": 9.787088162026009e-06, "loss": 0.0603, "step": 1549 }, { "epoch": 0.68, "grad_norm": 0.510123934459812, "learning_rate": 9.762620818776984e-06, "loss": 0.0753, "step": 1550 }, { "epoch": 0.68, "grad_norm": 0.47553871750669857, "learning_rate": 9.73817421911199e-06, "loss": 0.0592, "step": 1551 }, { "epoch": 0.68, "grad_norm": 0.4883734349015062, "learning_rate": 9.713748412566173e-06, "loss": 0.051, "step": 1552 }, { "epoch": 0.68, "grad_norm": 0.4953845052998364, "learning_rate": 9.689343448632562e-06, "loss": 0.0602, "step": 1553 }, { "epoch": 0.68, "grad_norm": 0.5045162028327952, "learning_rate": 9.664959376761945e-06, "loss": 0.059, "step": 1554 }, { "epoch": 0.68, "grad_norm": 0.5337321413344583, "learning_rate": 9.640596246362748e-06, "loss": 0.0567, "step": 1555 }, { "epoch": 0.68, "grad_norm": 0.4896048801496403, "learning_rate": 9.616254106801008e-06, "loss": 0.0546, "step": 1556 }, { "epoch": 0.68, "grad_norm": 0.4696533082558593, "learning_rate": 9.591933007400209e-06, "loss": 0.0522, "step": 1557 }, { "epoch": 0.68, "grad_norm": 0.47739000861104675, "learning_rate": 9.567632997441198e-06, "loss": 0.0528, "step": 1558 }, { "epoch": 0.68, "grad_norm": 0.47802036112750035, "learning_rate": 9.543354126162096e-06, "loss": 0.0485, "step": 1559 }, { "epoch": 0.69, "grad_norm": 0.4286016442228634, "learning_rate": 9.519096442758192e-06, "loss": 0.0575, "step": 1560 }, { "epoch": 0.69, "grad_norm": 0.45656723529153814, "learning_rate": 9.494859996381838e-06, "loss": 0.0522, "step": 1561 }, { "epoch": 0.69, "grad_norm": 0.49994349160355406, "learning_rate": 9.470644836142356e-06, "loss": 0.0536, "step": 1562 }, { "epoch": 0.69, "grad_norm": 0.4997292023720749, "learning_rate": 9.446451011105941e-06, "loss": 0.0608, "step": 1563 }, { "epoch": 0.69, "grad_norm": 0.47173583839702254, "learning_rate": 9.422278570295553e-06, "loss": 0.0504, "step": 1564 }, { "epoch": 0.69, "grad_norm": 0.5939161921358718, "learning_rate": 9.398127562690824e-06, "loss": 0.0605, "step": 1565 }, { "epoch": 0.69, "grad_norm": 0.5533998614651224, "learning_rate": 9.373998037227957e-06, "loss": 0.0635, "step": 1566 }, { "epoch": 0.69, "grad_norm": 0.46367949854714563, "learning_rate": 9.34989004279962e-06, "loss": 0.0558, "step": 1567 }, { "epoch": 0.69, "grad_norm": 0.4698531680087694, "learning_rate": 9.325803628254865e-06, "loss": 0.0619, "step": 1568 }, { "epoch": 0.69, "grad_norm": 0.4789135115603457, "learning_rate": 9.301738842399005e-06, "loss": 0.056, "step": 1569 }, { "epoch": 0.69, "grad_norm": 0.4848335157095517, "learning_rate": 9.277695733993553e-06, "loss": 0.0558, "step": 1570 }, { "epoch": 0.69, "grad_norm": 0.48548719940144813, "learning_rate": 9.253674351756057e-06, "loss": 0.0649, "step": 1571 }, { "epoch": 0.69, "grad_norm": 0.5663798358940626, "learning_rate": 9.229674744360067e-06, "loss": 0.0652, "step": 1572 }, { "epoch": 0.69, "grad_norm": 0.42806689083828886, "learning_rate": 9.20569696043502e-06, "loss": 0.0475, "step": 1573 }, { "epoch": 0.69, "grad_norm": 0.41384739956754873, "learning_rate": 9.181741048566126e-06, "loss": 0.0594, "step": 1574 }, { "epoch": 0.69, "grad_norm": 0.4642917497080165, "learning_rate": 9.15780705729425e-06, "loss": 0.0513, "step": 1575 }, { "epoch": 0.69, "grad_norm": 0.48629564536841646, "learning_rate": 9.133895035115882e-06, "loss": 0.057, "step": 1576 }, { "epoch": 0.69, "grad_norm": 0.4662474906792499, "learning_rate": 9.11000503048297e-06, "loss": 0.0546, "step": 1577 }, { "epoch": 0.69, "grad_norm": 0.40552152830401267, "learning_rate": 9.08613709180287e-06, "loss": 0.05, "step": 1578 }, { "epoch": 0.69, "grad_norm": 0.63067057237901, "learning_rate": 9.062291267438183e-06, "loss": 0.0671, "step": 1579 }, { "epoch": 0.69, "grad_norm": 0.43178160768632, "learning_rate": 9.038467605706758e-06, "loss": 0.0472, "step": 1580 }, { "epoch": 0.69, "grad_norm": 0.4325399049915896, "learning_rate": 9.0146661548815e-06, "loss": 0.0475, "step": 1581 }, { "epoch": 0.69, "grad_norm": 0.46080000491568607, "learning_rate": 8.990886963190316e-06, "loss": 0.0477, "step": 1582 }, { "epoch": 0.7, "grad_norm": 0.3894528923684423, "learning_rate": 8.967130078816017e-06, "loss": 0.0451, "step": 1583 }, { "epoch": 0.7, "grad_norm": 0.4243231664191512, "learning_rate": 8.943395549896208e-06, "loss": 0.0444, "step": 1584 }, { "epoch": 0.7, "grad_norm": 0.46515401535343126, "learning_rate": 8.919683424523198e-06, "loss": 0.0529, "step": 1585 }, { "epoch": 0.7, "grad_norm": 0.5531862925390345, "learning_rate": 8.895993750743897e-06, "loss": 0.0654, "step": 1586 }, { "epoch": 0.7, "grad_norm": 0.47002838969725147, "learning_rate": 8.872326576559724e-06, "loss": 0.0534, "step": 1587 }, { "epoch": 0.7, "grad_norm": 0.5051668058284231, "learning_rate": 8.848681949926514e-06, "loss": 0.0597, "step": 1588 }, { "epoch": 0.7, "grad_norm": 0.5269592013337469, "learning_rate": 8.825059918754402e-06, "loss": 0.0695, "step": 1589 }, { "epoch": 0.7, "grad_norm": 0.4655046813198839, "learning_rate": 8.801460530907764e-06, "loss": 0.0471, "step": 1590 }, { "epoch": 0.7, "grad_norm": 0.4608331820876901, "learning_rate": 8.777883834205058e-06, "loss": 0.0491, "step": 1591 }, { "epoch": 0.7, "grad_norm": 0.4891205250500983, "learning_rate": 8.754329876418786e-06, "loss": 0.0595, "step": 1592 }, { "epoch": 0.7, "grad_norm": 0.4352259800925935, "learning_rate": 8.730798705275382e-06, "loss": 0.0518, "step": 1593 }, { "epoch": 0.7, "grad_norm": 0.5245438171917561, "learning_rate": 8.707290368455103e-06, "loss": 0.0566, "step": 1594 }, { "epoch": 0.7, "grad_norm": 0.5195381049370807, "learning_rate": 8.683804913591918e-06, "loss": 0.0685, "step": 1595 }, { "epoch": 0.7, "grad_norm": 0.45042332566085547, "learning_rate": 8.660342388273444e-06, "loss": 0.0475, "step": 1596 }, { "epoch": 0.7, "grad_norm": 0.4542930773431173, "learning_rate": 8.636902840040855e-06, "loss": 0.0506, "step": 1597 }, { "epoch": 0.7, "grad_norm": 0.4465052542222463, "learning_rate": 8.613486316388756e-06, "loss": 0.0524, "step": 1598 }, { "epoch": 0.7, "grad_norm": 0.5335127382232462, "learning_rate": 8.590092864765064e-06, "loss": 0.0625, "step": 1599 }, { "epoch": 0.7, "grad_norm": 0.4870087483714966, "learning_rate": 8.566722532571006e-06, "loss": 0.053, "step": 1600 }, { "epoch": 0.7, "grad_norm": 0.45018015275367157, "learning_rate": 8.543375367160922e-06, "loss": 0.043, "step": 1601 }, { "epoch": 0.7, "grad_norm": 0.42910949671914805, "learning_rate": 8.520051415842224e-06, "loss": 0.0535, "step": 1602 }, { "epoch": 0.7, "grad_norm": 0.6105327367823554, "learning_rate": 8.49675072587528e-06, "loss": 0.0624, "step": 1603 }, { "epoch": 0.7, "grad_norm": 0.5211480588507713, "learning_rate": 8.473473344473328e-06, "loss": 0.066, "step": 1604 }, { "epoch": 0.71, "grad_norm": 0.5559000811278793, "learning_rate": 8.450219318802382e-06, "loss": 0.0649, "step": 1605 }, { "epoch": 0.71, "grad_norm": 0.45695071040569013, "learning_rate": 8.426988695981123e-06, "loss": 0.0513, "step": 1606 }, { "epoch": 0.71, "grad_norm": 0.48403324868740644, "learning_rate": 8.403781523080814e-06, "loss": 0.0509, "step": 1607 }, { "epoch": 0.71, "grad_norm": 0.4915050437606361, "learning_rate": 8.380597847125202e-06, "loss": 0.0517, "step": 1608 }, { "epoch": 0.71, "grad_norm": 0.5287741300248835, "learning_rate": 8.35743771509043e-06, "loss": 0.0525, "step": 1609 }, { "epoch": 0.71, "grad_norm": 0.546742423199978, "learning_rate": 8.334301173904924e-06, "loss": 0.0622, "step": 1610 }, { "epoch": 0.71, "grad_norm": 0.4296160898691455, "learning_rate": 8.311188270449316e-06, "loss": 0.0449, "step": 1611 }, { "epoch": 0.71, "grad_norm": 0.4870001198515852, "learning_rate": 8.288099051556338e-06, "loss": 0.0556, "step": 1612 }, { "epoch": 0.71, "grad_norm": 0.662722121195912, "learning_rate": 8.26503356401073e-06, "loss": 0.0741, "step": 1613 }, { "epoch": 0.71, "grad_norm": 0.4430679842555276, "learning_rate": 8.241991854549166e-06, "loss": 0.0493, "step": 1614 }, { "epoch": 0.71, "grad_norm": 0.4582177923250132, "learning_rate": 8.218973969860102e-06, "loss": 0.0524, "step": 1615 }, { "epoch": 0.71, "grad_norm": 0.4669803753503285, "learning_rate": 8.195979956583742e-06, "loss": 0.0487, "step": 1616 }, { "epoch": 0.71, "grad_norm": 0.49080076592380945, "learning_rate": 8.17300986131193e-06, "loss": 0.0701, "step": 1617 }, { "epoch": 0.71, "grad_norm": 0.45108364943535484, "learning_rate": 8.150063730588032e-06, "loss": 0.0515, "step": 1618 }, { "epoch": 0.71, "grad_norm": 0.44474828681272754, "learning_rate": 8.127141610906837e-06, "loss": 0.0493, "step": 1619 }, { "epoch": 0.71, "grad_norm": 0.5058633811526936, "learning_rate": 8.104243548714523e-06, "loss": 0.0504, "step": 1620 }, { "epoch": 0.71, "grad_norm": 0.4996399775386735, "learning_rate": 8.081369590408494e-06, "loss": 0.0627, "step": 1621 }, { "epoch": 0.71, "grad_norm": 0.3971066031921181, "learning_rate": 8.058519782337324e-06, "loss": 0.0497, "step": 1622 }, { "epoch": 0.71, "grad_norm": 0.39585038407728595, "learning_rate": 8.035694170800628e-06, "loss": 0.0462, "step": 1623 }, { "epoch": 0.71, "grad_norm": 0.4674293989021966, "learning_rate": 8.01289280204903e-06, "loss": 0.0598, "step": 1624 }, { "epoch": 0.71, "grad_norm": 0.4264318762236069, "learning_rate": 7.990115722284011e-06, "loss": 0.0507, "step": 1625 }, { "epoch": 0.71, "grad_norm": 0.47141490636254935, "learning_rate": 7.967362977657837e-06, "loss": 0.061, "step": 1626 }, { "epoch": 0.71, "grad_norm": 0.43637037900796194, "learning_rate": 7.944634614273467e-06, "loss": 0.0523, "step": 1627 }, { "epoch": 0.72, "grad_norm": 0.4894778885479325, "learning_rate": 7.921930678184458e-06, "loss": 0.0552, "step": 1628 }, { "epoch": 0.72, "grad_norm": 0.44067335472187175, "learning_rate": 7.899251215394868e-06, "loss": 0.0464, "step": 1629 }, { "epoch": 0.72, "grad_norm": 0.40108145685679664, "learning_rate": 7.876596271859169e-06, "loss": 0.0392, "step": 1630 }, { "epoch": 0.72, "grad_norm": 0.5245397832617841, "learning_rate": 7.85396589348215e-06, "loss": 0.0569, "step": 1631 }, { "epoch": 0.72, "grad_norm": 0.41780279554853406, "learning_rate": 7.831360126118821e-06, "loss": 0.0451, "step": 1632 }, { "epoch": 0.72, "grad_norm": 0.6257966686163765, "learning_rate": 7.808779015574323e-06, "loss": 0.0736, "step": 1633 }, { "epoch": 0.72, "grad_norm": 0.45913278331229757, "learning_rate": 7.786222607603859e-06, "loss": 0.0488, "step": 1634 }, { "epoch": 0.72, "grad_norm": 0.4687555789615512, "learning_rate": 7.763690947912541e-06, "loss": 0.0498, "step": 1635 }, { "epoch": 0.72, "grad_norm": 0.47069437169210493, "learning_rate": 7.741184082155354e-06, "loss": 0.0531, "step": 1636 }, { "epoch": 0.72, "grad_norm": 0.473346049223043, "learning_rate": 7.71870205593704e-06, "loss": 0.0574, "step": 1637 }, { "epoch": 0.72, "grad_norm": 0.5186053465040562, "learning_rate": 7.69624491481203e-06, "loss": 0.0634, "step": 1638 }, { "epoch": 0.72, "grad_norm": 0.43804186234889664, "learning_rate": 7.673812704284294e-06, "loss": 0.049, "step": 1639 }, { "epoch": 0.72, "grad_norm": 0.467223271942862, "learning_rate": 7.651405469807305e-06, "loss": 0.0558, "step": 1640 }, { "epoch": 0.72, "grad_norm": 0.40068030694203377, "learning_rate": 7.629023256783943e-06, "loss": 0.0456, "step": 1641 }, { "epoch": 0.72, "grad_norm": 0.40616603130253837, "learning_rate": 7.606666110566374e-06, "loss": 0.0526, "step": 1642 }, { "epoch": 0.72, "grad_norm": 0.4582811690758661, "learning_rate": 7.58433407645595e-06, "loss": 0.0544, "step": 1643 }, { "epoch": 0.72, "grad_norm": 0.4180552847580791, "learning_rate": 7.562027199703184e-06, "loss": 0.051, "step": 1644 }, { "epoch": 0.72, "grad_norm": 0.46366990562219984, "learning_rate": 7.53974552550758e-06, "loss": 0.0625, "step": 1645 }, { "epoch": 0.72, "grad_norm": 0.528927072089804, "learning_rate": 7.517489099017592e-06, "loss": 0.0612, "step": 1646 }, { "epoch": 0.72, "grad_norm": 0.42768424101751135, "learning_rate": 7.495257965330502e-06, "loss": 0.0421, "step": 1647 }, { "epoch": 0.72, "grad_norm": 0.46276593037764036, "learning_rate": 7.473052169492357e-06, "loss": 0.0476, "step": 1648 }, { "epoch": 0.72, "grad_norm": 0.4211284778832942, "learning_rate": 7.450871756497851e-06, "loss": 0.0455, "step": 1649 }, { "epoch": 0.72, "grad_norm": 0.4705942273306305, "learning_rate": 7.428716771290252e-06, "loss": 0.044, "step": 1650 }, { "epoch": 0.73, "grad_norm": 0.5060428301453777, "learning_rate": 7.406587258761304e-06, "loss": 0.0599, "step": 1651 }, { "epoch": 0.73, "grad_norm": 0.45485988180788733, "learning_rate": 7.3844832637511346e-06, "loss": 0.0523, "step": 1652 }, { "epoch": 0.73, "grad_norm": 0.40423594582354844, "learning_rate": 7.3624048310481685e-06, "loss": 0.05, "step": 1653 }, { "epoch": 0.73, "grad_norm": 0.4182956522363397, "learning_rate": 7.340352005389033e-06, "loss": 0.0503, "step": 1654 }, { "epoch": 0.73, "grad_norm": 0.4928100523951386, "learning_rate": 7.318324831458472e-06, "loss": 0.0611, "step": 1655 }, { "epoch": 0.73, "grad_norm": 0.41619817104657236, "learning_rate": 7.296323353889245e-06, "loss": 0.0439, "step": 1656 }, { "epoch": 0.73, "grad_norm": 0.4705816088325612, "learning_rate": 7.274347617262048e-06, "loss": 0.0556, "step": 1657 }, { "epoch": 0.73, "grad_norm": 0.47278093632280377, "learning_rate": 7.252397666105435e-06, "loss": 0.0448, "step": 1658 }, { "epoch": 0.73, "grad_norm": 0.3760861720045564, "learning_rate": 7.230473544895682e-06, "loss": 0.0461, "step": 1659 }, { "epoch": 0.73, "grad_norm": 0.40577956618296407, "learning_rate": 7.208575298056739e-06, "loss": 0.0399, "step": 1660 }, { "epoch": 0.73, "grad_norm": 0.3974929346800227, "learning_rate": 7.186702969960144e-06, "loss": 0.0421, "step": 1661 }, { "epoch": 0.73, "grad_norm": 0.390682063703073, "learning_rate": 7.1648566049249036e-06, "loss": 0.0412, "step": 1662 }, { "epoch": 0.73, "grad_norm": 0.4461909881275912, "learning_rate": 7.143036247217405e-06, "loss": 0.0527, "step": 1663 }, { "epoch": 0.73, "grad_norm": 0.3977204771204441, "learning_rate": 7.121241941051346e-06, "loss": 0.0465, "step": 1664 }, { "epoch": 0.73, "grad_norm": 0.47347710006327093, "learning_rate": 7.0994737305876585e-06, "loss": 0.0465, "step": 1665 }, { "epoch": 0.73, "grad_norm": 0.515439202328162, "learning_rate": 7.077731659934377e-06, "loss": 0.0526, "step": 1666 }, { "epoch": 0.73, "grad_norm": 0.4560921957859219, "learning_rate": 7.056015773146554e-06, "loss": 0.0425, "step": 1667 }, { "epoch": 0.73, "grad_norm": 0.45374379164281586, "learning_rate": 7.034326114226222e-06, "loss": 0.0497, "step": 1668 }, { "epoch": 0.73, "grad_norm": 0.4932508939840314, "learning_rate": 7.01266272712225e-06, "loss": 0.0578, "step": 1669 }, { "epoch": 0.73, "grad_norm": 0.42387370554445963, "learning_rate": 6.991025655730275e-06, "loss": 0.0489, "step": 1670 }, { "epoch": 0.73, "grad_norm": 0.46285326521237774, "learning_rate": 6.969414943892612e-06, "loss": 0.0571, "step": 1671 }, { "epoch": 0.73, "grad_norm": 0.4313986604956256, "learning_rate": 6.947830635398165e-06, "loss": 0.0473, "step": 1672 }, { "epoch": 0.73, "grad_norm": 0.427285014968251, "learning_rate": 6.926272773982341e-06, "loss": 0.0423, "step": 1673 }, { "epoch": 0.74, "grad_norm": 0.44368987011844024, "learning_rate": 6.904741403326951e-06, "loss": 0.0547, "step": 1674 }, { "epoch": 0.74, "grad_norm": 0.5028359631362254, "learning_rate": 6.883236567060137e-06, "loss": 0.0624, "step": 1675 }, { "epoch": 0.74, "grad_norm": 0.5894833151920814, "learning_rate": 6.86175830875627e-06, "loss": 0.0619, "step": 1676 }, { "epoch": 0.74, "grad_norm": 0.4180327214992712, "learning_rate": 6.84030667193587e-06, "loss": 0.0536, "step": 1677 }, { "epoch": 0.74, "grad_norm": 0.437367828022519, "learning_rate": 6.818881700065514e-06, "loss": 0.0592, "step": 1678 }, { "epoch": 0.74, "grad_norm": 0.4574295981493844, "learning_rate": 6.797483436557748e-06, "loss": 0.0583, "step": 1679 }, { "epoch": 0.74, "grad_norm": 0.48600157533005706, "learning_rate": 6.776111924771003e-06, "loss": 0.0566, "step": 1680 }, { "epoch": 0.74, "grad_norm": 0.4297500218074631, "learning_rate": 6.754767208009494e-06, "loss": 0.0492, "step": 1681 }, { "epoch": 0.74, "grad_norm": 0.39603279969056604, "learning_rate": 6.733449329523169e-06, "loss": 0.0512, "step": 1682 }, { "epoch": 0.74, "grad_norm": 0.43482641324660365, "learning_rate": 6.712158332507559e-06, "loss": 0.044, "step": 1683 }, { "epoch": 0.74, "grad_norm": 0.485077164449799, "learning_rate": 6.690894260103742e-06, "loss": 0.0497, "step": 1684 }, { "epoch": 0.74, "grad_norm": 0.4503980002415611, "learning_rate": 6.669657155398252e-06, "loss": 0.0486, "step": 1685 }, { "epoch": 0.74, "grad_norm": 0.48296767028273235, "learning_rate": 6.648447061422974e-06, "loss": 0.0505, "step": 1686 }, { "epoch": 0.74, "grad_norm": 0.39748554950215037, "learning_rate": 6.627264021155029e-06, "loss": 0.0421, "step": 1687 }, { "epoch": 0.74, "grad_norm": 0.4048444200669945, "learning_rate": 6.606108077516773e-06, "loss": 0.0453, "step": 1688 }, { "epoch": 0.74, "grad_norm": 0.4939877118925986, "learning_rate": 6.5849792733756245e-06, "loss": 0.0567, "step": 1689 }, { "epoch": 0.74, "grad_norm": 0.48614198117405244, "learning_rate": 6.563877651544024e-06, "loss": 0.0599, "step": 1690 }, { "epoch": 0.74, "grad_norm": 0.4819644040788172, "learning_rate": 6.542803254779306e-06, "loss": 0.0583, "step": 1691 }, { "epoch": 0.74, "grad_norm": 0.44566511949990106, "learning_rate": 6.521756125783685e-06, "loss": 0.0478, "step": 1692 }, { "epoch": 0.74, "grad_norm": 0.49154637981447935, "learning_rate": 6.500736307204092e-06, "loss": 0.0517, "step": 1693 }, { "epoch": 0.74, "grad_norm": 0.47881608210634014, "learning_rate": 6.479743841632126e-06, "loss": 0.0459, "step": 1694 }, { "epoch": 0.74, "grad_norm": 0.3949284206794991, "learning_rate": 6.4587787716039684e-06, "loss": 0.0426, "step": 1695 }, { "epoch": 0.75, "grad_norm": 0.4345383707373188, "learning_rate": 6.43784113960028e-06, "loss": 0.0505, "step": 1696 }, { "epoch": 0.75, "grad_norm": 0.42741552862421317, "learning_rate": 6.4169309880461376e-06, "loss": 0.0415, "step": 1697 }, { "epoch": 0.75, "grad_norm": 0.41849603994152584, "learning_rate": 6.396048359310925e-06, "loss": 0.049, "step": 1698 }, { "epoch": 0.75, "grad_norm": 0.45310454486848273, "learning_rate": 6.375193295708262e-06, "loss": 0.0595, "step": 1699 }, { "epoch": 0.75, "grad_norm": 0.44705150093041685, "learning_rate": 6.354365839495913e-06, "loss": 0.0501, "step": 1700 }, { "epoch": 0.75, "grad_norm": 0.37501627171499874, "learning_rate": 6.333566032875698e-06, "loss": 0.04, "step": 1701 }, { "epoch": 0.75, "grad_norm": 0.48313025539545174, "learning_rate": 6.3127939179934364e-06, "loss": 0.0525, "step": 1702 }, { "epoch": 0.75, "grad_norm": 0.46096898634100714, "learning_rate": 6.292049536938796e-06, "loss": 0.0482, "step": 1703 }, { "epoch": 0.75, "grad_norm": 0.43249844198001425, "learning_rate": 6.2713329317452794e-06, "loss": 0.0477, "step": 1704 }, { "epoch": 0.75, "grad_norm": 0.45709563885830085, "learning_rate": 6.2506441443900925e-06, "loss": 0.0531, "step": 1705 }, { "epoch": 0.75, "grad_norm": 0.5059801642352223, "learning_rate": 6.2299832167941e-06, "loss": 0.0592, "step": 1706 }, { "epoch": 0.75, "grad_norm": 0.4444604720099998, "learning_rate": 6.209350190821679e-06, "loss": 0.05, "step": 1707 }, { "epoch": 0.75, "grad_norm": 0.47457774178535284, "learning_rate": 6.188745108280687e-06, "loss": 0.0684, "step": 1708 }, { "epoch": 0.75, "grad_norm": 0.4611115854265973, "learning_rate": 6.168168010922375e-06, "loss": 0.0481, "step": 1709 }, { "epoch": 0.75, "grad_norm": 0.4434477932187651, "learning_rate": 6.14761894044128e-06, "loss": 0.0418, "step": 1710 }, { "epoch": 0.75, "grad_norm": 0.6730528199679972, "learning_rate": 6.127097938475122e-06, "loss": 0.0797, "step": 1711 }, { "epoch": 0.75, "grad_norm": 0.3981579566621382, "learning_rate": 6.106605046604794e-06, "loss": 0.0497, "step": 1712 }, { "epoch": 0.75, "grad_norm": 0.4894106201810772, "learning_rate": 6.086140306354198e-06, "loss": 0.0489, "step": 1713 }, { "epoch": 0.75, "grad_norm": 0.5452924445981286, "learning_rate": 6.065703759190205e-06, "loss": 0.0679, "step": 1714 }, { "epoch": 0.75, "grad_norm": 0.5266488759993558, "learning_rate": 6.045295446522554e-06, "loss": 0.0544, "step": 1715 }, { "epoch": 0.75, "grad_norm": 0.39090524157681383, "learning_rate": 6.02491540970378e-06, "loss": 0.0411, "step": 1716 }, { "epoch": 0.75, "grad_norm": 0.4211250987088186, "learning_rate": 6.00456369002912e-06, "loss": 0.048, "step": 1717 }, { "epoch": 0.75, "grad_norm": 0.4335144073474727, "learning_rate": 5.984240328736431e-06, "loss": 0.0495, "step": 1718 }, { "epoch": 0.76, "grad_norm": 0.5039463175139767, "learning_rate": 5.9639453670061144e-06, "loss": 0.0596, "step": 1719 }, { "epoch": 0.76, "grad_norm": 0.4176016469955395, "learning_rate": 5.943678845961025e-06, "loss": 0.0529, "step": 1720 }, { "epoch": 0.76, "grad_norm": 0.4377877957755127, "learning_rate": 5.923440806666383e-06, "loss": 0.0566, "step": 1721 }, { "epoch": 0.76, "grad_norm": 0.4318849208868167, "learning_rate": 5.903231290129707e-06, "loss": 0.0612, "step": 1722 }, { "epoch": 0.76, "grad_norm": 0.5443981210416928, "learning_rate": 5.8830503373007155e-06, "loss": 0.0678, "step": 1723 }, { "epoch": 0.76, "grad_norm": 0.42752017575890316, "learning_rate": 5.862897989071248e-06, "loss": 0.0405, "step": 1724 }, { "epoch": 0.76, "grad_norm": 0.4429279194201175, "learning_rate": 5.8427742862751835e-06, "loss": 0.0466, "step": 1725 }, { "epoch": 0.76, "grad_norm": 0.40226111443043505, "learning_rate": 5.822679269688374e-06, "loss": 0.0451, "step": 1726 }, { "epoch": 0.76, "grad_norm": 0.3815847092334517, "learning_rate": 5.802612980028519e-06, "loss": 0.0504, "step": 1727 }, { "epoch": 0.76, "grad_norm": 0.47208379332454636, "learning_rate": 5.78257545795512e-06, "loss": 0.05, "step": 1728 }, { "epoch": 0.76, "grad_norm": 0.41207032222026363, "learning_rate": 5.762566744069404e-06, "loss": 0.0525, "step": 1729 }, { "epoch": 0.76, "grad_norm": 0.40864958372525456, "learning_rate": 5.742586878914214e-06, "loss": 0.0546, "step": 1730 }, { "epoch": 0.76, "grad_norm": 0.4431180759475435, "learning_rate": 5.722635902973914e-06, "loss": 0.0531, "step": 1731 }, { "epoch": 0.76, "grad_norm": 0.3564158179216772, "learning_rate": 5.702713856674375e-06, "loss": 0.0317, "step": 1732 }, { "epoch": 0.76, "grad_norm": 0.38983174842600904, "learning_rate": 5.682820780382819e-06, "loss": 0.0435, "step": 1733 }, { "epoch": 0.76, "grad_norm": 0.4510195751668071, "learning_rate": 5.662956714407784e-06, "loss": 0.0427, "step": 1734 }, { "epoch": 0.76, "grad_norm": 0.45036049747859863, "learning_rate": 5.643121698998995e-06, "loss": 0.0586, "step": 1735 }, { "epoch": 0.76, "grad_norm": 0.5709638142485423, "learning_rate": 5.623315774347353e-06, "loss": 0.0595, "step": 1736 }, { "epoch": 0.76, "grad_norm": 0.5170579131356504, "learning_rate": 5.603538980584795e-06, "loss": 0.0586, "step": 1737 }, { "epoch": 0.76, "grad_norm": 0.4133155491230597, "learning_rate": 5.583791357784228e-06, "loss": 0.0502, "step": 1738 }, { "epoch": 0.76, "grad_norm": 0.41317632601609383, "learning_rate": 5.564072945959453e-06, "loss": 0.039, "step": 1739 }, { "epoch": 0.76, "grad_norm": 0.4351548453399067, "learning_rate": 5.544383785065088e-06, "loss": 0.0479, "step": 1740 }, { "epoch": 0.76, "grad_norm": 0.379817005723947, "learning_rate": 5.524723914996475e-06, "loss": 0.0434, "step": 1741 }, { "epoch": 0.77, "grad_norm": 0.4203978037154777, "learning_rate": 5.505093375589607e-06, "loss": 0.0398, "step": 1742 }, { "epoch": 0.77, "grad_norm": 0.3870427479458668, "learning_rate": 5.485492206621048e-06, "loss": 0.0422, "step": 1743 }, { "epoch": 0.77, "grad_norm": 0.5360731637149132, "learning_rate": 5.465920447807844e-06, "loss": 0.0484, "step": 1744 }, { "epoch": 0.77, "grad_norm": 0.42149180210417836, "learning_rate": 5.446378138807446e-06, "loss": 0.0402, "step": 1745 }, { "epoch": 0.77, "grad_norm": 0.39468958481867, "learning_rate": 5.42686531921766e-06, "loss": 0.0378, "step": 1746 }, { "epoch": 0.77, "grad_norm": 0.42942271310384217, "learning_rate": 5.407382028576495e-06, "loss": 0.044, "step": 1747 }, { "epoch": 0.77, "grad_norm": 0.43665726421634, "learning_rate": 5.387928306362155e-06, "loss": 0.0429, "step": 1748 }, { "epoch": 0.77, "grad_norm": 0.4384595702007456, "learning_rate": 5.36850419199292e-06, "loss": 0.0505, "step": 1749 }, { "epoch": 0.77, "grad_norm": 0.37105313128980244, "learning_rate": 5.3491097248270975e-06, "loss": 0.0431, "step": 1750 }, { "epoch": 0.77, "grad_norm": 0.5878842375652276, "learning_rate": 5.329744944162889e-06, "loss": 0.0594, "step": 1751 }, { "epoch": 0.77, "grad_norm": 0.426861381620718, "learning_rate": 5.310409889238357e-06, "loss": 0.0433, "step": 1752 }, { "epoch": 0.77, "grad_norm": 0.43798564455053096, "learning_rate": 5.29110459923135e-06, "loss": 0.0462, "step": 1753 }, { "epoch": 0.77, "grad_norm": 0.44885979284214667, "learning_rate": 5.271829113259388e-06, "loss": 0.0503, "step": 1754 }, { "epoch": 0.77, "grad_norm": 0.45667732424145346, "learning_rate": 5.252583470379584e-06, "loss": 0.0486, "step": 1755 }, { "epoch": 0.77, "grad_norm": 0.4851964476267834, "learning_rate": 5.233367709588621e-06, "loss": 0.0526, "step": 1756 }, { "epoch": 0.77, "grad_norm": 0.5339440867051118, "learning_rate": 5.214181869822601e-06, "loss": 0.0578, "step": 1757 }, { "epoch": 0.77, "grad_norm": 0.4565089248775024, "learning_rate": 5.195025989957012e-06, "loss": 0.0566, "step": 1758 }, { "epoch": 0.77, "grad_norm": 0.4712741443155777, "learning_rate": 5.175900108806633e-06, "loss": 0.0455, "step": 1759 }, { "epoch": 0.77, "grad_norm": 0.4711577882781462, "learning_rate": 5.156804265125455e-06, "loss": 0.0657, "step": 1760 }, { "epoch": 0.77, "grad_norm": 0.39668862479738454, "learning_rate": 5.137738497606608e-06, "loss": 0.0383, "step": 1761 }, { "epoch": 0.77, "grad_norm": 0.36104067116431543, "learning_rate": 5.118702844882279e-06, "loss": 0.0367, "step": 1762 }, { "epoch": 0.77, "grad_norm": 0.4320184598826475, "learning_rate": 5.099697345523635e-06, "loss": 0.0406, "step": 1763 }, { "epoch": 0.77, "grad_norm": 0.41308786738961684, "learning_rate": 5.0807220380407465e-06, "loss": 0.0441, "step": 1764 }, { "epoch": 0.78, "grad_norm": 0.4639352527673443, "learning_rate": 5.061776960882503e-06, "loss": 0.0443, "step": 1765 }, { "epoch": 0.78, "grad_norm": 0.4066137372745888, "learning_rate": 5.0428621524365425e-06, "loss": 0.051, "step": 1766 }, { "epoch": 0.78, "grad_norm": 0.4432549878887046, "learning_rate": 5.02397765102917e-06, "loss": 0.0454, "step": 1767 }, { "epoch": 0.78, "grad_norm": 0.42421729156074833, "learning_rate": 5.005123494925284e-06, "loss": 0.0386, "step": 1768 }, { "epoch": 0.78, "grad_norm": 0.4509788034019124, "learning_rate": 4.986299722328281e-06, "loss": 0.04, "step": 1769 }, { "epoch": 0.78, "grad_norm": 0.4205611220399729, "learning_rate": 4.9675063713800245e-06, "loss": 0.0455, "step": 1770 }, { "epoch": 0.78, "grad_norm": 0.35470969512999484, "learning_rate": 4.948743480160696e-06, "loss": 0.0422, "step": 1771 }, { "epoch": 0.78, "grad_norm": 0.4310595368223596, "learning_rate": 4.930011086688782e-06, "loss": 0.0535, "step": 1772 }, { "epoch": 0.78, "grad_norm": 0.44225395296399245, "learning_rate": 4.911309228920973e-06, "loss": 0.0517, "step": 1773 }, { "epoch": 0.78, "grad_norm": 0.4420540537502564, "learning_rate": 4.892637944752083e-06, "loss": 0.0447, "step": 1774 }, { "epoch": 0.78, "grad_norm": 0.4153479290755492, "learning_rate": 4.873997272014963e-06, "loss": 0.0461, "step": 1775 }, { "epoch": 0.78, "grad_norm": 0.3715219336294669, "learning_rate": 4.85538724848045e-06, "loss": 0.0404, "step": 1776 }, { "epoch": 0.78, "grad_norm": 0.3931953550374839, "learning_rate": 4.836807911857284e-06, "loss": 0.0481, "step": 1777 }, { "epoch": 0.78, "grad_norm": 0.4933014329002104, "learning_rate": 4.818259299792019e-06, "loss": 0.053, "step": 1778 }, { "epoch": 0.78, "grad_norm": 0.4578886222697928, "learning_rate": 4.799741449868933e-06, "loss": 0.0487, "step": 1779 }, { "epoch": 0.78, "grad_norm": 0.38085736005201165, "learning_rate": 4.781254399610009e-06, "loss": 0.05, "step": 1780 }, { "epoch": 0.78, "grad_norm": 0.36443803026150623, "learning_rate": 4.762798186474795e-06, "loss": 0.0358, "step": 1781 }, { "epoch": 0.78, "grad_norm": 0.4260949195591172, "learning_rate": 4.744372847860368e-06, "loss": 0.0516, "step": 1782 }, { "epoch": 0.78, "grad_norm": 0.4430770983718562, "learning_rate": 4.725978421101234e-06, "loss": 0.0479, "step": 1783 }, { "epoch": 0.78, "grad_norm": 0.43819608829946977, "learning_rate": 4.707614943469277e-06, "loss": 0.0543, "step": 1784 }, { "epoch": 0.78, "grad_norm": 0.476672581950522, "learning_rate": 4.689282452173656e-06, "loss": 0.052, "step": 1785 }, { "epoch": 0.78, "grad_norm": 0.4087840240794991, "learning_rate": 4.670980984360756e-06, "loss": 0.0403, "step": 1786 }, { "epoch": 0.78, "grad_norm": 0.43155802285912614, "learning_rate": 4.65271057711409e-06, "loss": 0.0452, "step": 1787 }, { "epoch": 0.79, "grad_norm": 0.47661151008614205, "learning_rate": 4.6344712674542455e-06, "loss": 0.0604, "step": 1788 }, { "epoch": 0.79, "grad_norm": 0.390751131674799, "learning_rate": 4.616263092338788e-06, "loss": 0.0438, "step": 1789 }, { "epoch": 0.79, "grad_norm": 0.3888180711767424, "learning_rate": 4.5980860886622015e-06, "loss": 0.0511, "step": 1790 }, { "epoch": 0.79, "grad_norm": 0.41829160894791306, "learning_rate": 4.579940293255804e-06, "loss": 0.0395, "step": 1791 }, { "epoch": 0.79, "grad_norm": 0.43019486730281137, "learning_rate": 4.561825742887687e-06, "loss": 0.0527, "step": 1792 }, { "epoch": 0.79, "grad_norm": 0.40047770227851365, "learning_rate": 4.543742474262616e-06, "loss": 0.0421, "step": 1793 }, { "epoch": 0.79, "grad_norm": 0.3862897824969175, "learning_rate": 4.525690524022e-06, "loss": 0.0436, "step": 1794 }, { "epoch": 0.79, "grad_norm": 0.43580320538908407, "learning_rate": 4.507669928743749e-06, "loss": 0.0462, "step": 1795 }, { "epoch": 0.79, "grad_norm": 0.4351491951498982, "learning_rate": 4.489680724942263e-06, "loss": 0.0456, "step": 1796 }, { "epoch": 0.79, "grad_norm": 0.423127645881528, "learning_rate": 4.4717229490683446e-06, "loss": 0.0405, "step": 1797 }, { "epoch": 0.79, "grad_norm": 0.3820379166908764, "learning_rate": 4.4537966375091e-06, "loss": 0.047, "step": 1798 }, { "epoch": 0.79, "grad_norm": 0.4163715827639046, "learning_rate": 4.4359018265878695e-06, "loss": 0.0425, "step": 1799 }, { "epoch": 0.79, "grad_norm": 0.47860268559895963, "learning_rate": 4.418038552564194e-06, "loss": 0.0607, "step": 1800 }, { "epoch": 0.79, "grad_norm": 0.41743975881441453, "learning_rate": 4.400206851633693e-06, "loss": 0.0539, "step": 1801 }, { "epoch": 0.79, "grad_norm": 0.460681876730326, "learning_rate": 4.382406759928017e-06, "loss": 0.0638, "step": 1802 }, { "epoch": 0.79, "grad_norm": 0.47164553564048134, "learning_rate": 4.3646383135147555e-06, "loss": 0.0536, "step": 1803 }, { "epoch": 0.79, "grad_norm": 0.4247030728581755, "learning_rate": 4.346901548397397e-06, "loss": 0.0402, "step": 1804 }, { "epoch": 0.79, "grad_norm": 0.4213650942101772, "learning_rate": 4.329196500515225e-06, "loss": 0.0584, "step": 1805 }, { "epoch": 0.79, "grad_norm": 0.43021746811239714, "learning_rate": 4.311523205743256e-06, "loss": 0.0444, "step": 1806 }, { "epoch": 0.79, "grad_norm": 0.4290412204074876, "learning_rate": 4.2938816998921615e-06, "loss": 0.0471, "step": 1807 }, { "epoch": 0.79, "grad_norm": 0.4473942234452947, "learning_rate": 4.276272018708212e-06, "loss": 0.0459, "step": 1808 }, { "epoch": 0.79, "grad_norm": 0.3803356776074693, "learning_rate": 4.258694197873181e-06, "loss": 0.0389, "step": 1809 }, { "epoch": 0.8, "grad_norm": 0.43089577113905153, "learning_rate": 4.2411482730042945e-06, "loss": 0.0448, "step": 1810 }, { "epoch": 0.8, "grad_norm": 0.4031437078467266, "learning_rate": 4.223634279654141e-06, "loss": 0.0395, "step": 1811 }, { "epoch": 0.8, "grad_norm": 0.3564262907622504, "learning_rate": 4.2061522533106154e-06, "loss": 0.0445, "step": 1812 }, { "epoch": 0.8, "grad_norm": 0.39664213678035437, "learning_rate": 4.188702229396826e-06, "loss": 0.0432, "step": 1813 }, { "epoch": 0.8, "grad_norm": 0.37075876877537595, "learning_rate": 4.171284243271061e-06, "loss": 0.0461, "step": 1814 }, { "epoch": 0.8, "grad_norm": 0.411092108299489, "learning_rate": 4.1538983302266624e-06, "loss": 0.0465, "step": 1815 }, { "epoch": 0.8, "grad_norm": 0.45678241182946655, "learning_rate": 4.136544525491999e-06, "loss": 0.0548, "step": 1816 }, { "epoch": 0.8, "grad_norm": 0.46000131806412814, "learning_rate": 4.1192228642303746e-06, "loss": 0.0488, "step": 1817 }, { "epoch": 0.8, "grad_norm": 0.4848140910571134, "learning_rate": 4.10193338153998e-06, "loss": 0.0418, "step": 1818 }, { "epoch": 0.8, "grad_norm": 0.36961380840796126, "learning_rate": 4.084676112453774e-06, "loss": 0.0437, "step": 1819 }, { "epoch": 0.8, "grad_norm": 0.4168260587362277, "learning_rate": 4.067451091939456e-06, "loss": 0.0423, "step": 1820 }, { "epoch": 0.8, "grad_norm": 0.40461011168741917, "learning_rate": 4.050258354899394e-06, "loss": 0.0413, "step": 1821 }, { "epoch": 0.8, "grad_norm": 0.5812786023475613, "learning_rate": 4.033097936170531e-06, "loss": 0.062, "step": 1822 }, { "epoch": 0.8, "grad_norm": 0.36473878089692685, "learning_rate": 4.015969870524308e-06, "loss": 0.0324, "step": 1823 }, { "epoch": 0.8, "grad_norm": 0.3586336242513691, "learning_rate": 3.99887419266664e-06, "loss": 0.0373, "step": 1824 }, { "epoch": 0.8, "grad_norm": 0.5695828512543668, "learning_rate": 3.981810937237798e-06, "loss": 0.0557, "step": 1825 }, { "epoch": 0.8, "grad_norm": 0.4575902503531785, "learning_rate": 3.964780138812361e-06, "loss": 0.0473, "step": 1826 }, { "epoch": 0.8, "grad_norm": 0.39475155335544093, "learning_rate": 3.94778183189914e-06, "loss": 0.0376, "step": 1827 }, { "epoch": 0.8, "grad_norm": 0.4350113077721904, "learning_rate": 3.930816050941113e-06, "loss": 0.0462, "step": 1828 }, { "epoch": 0.8, "grad_norm": 0.37376075863105146, "learning_rate": 3.91388283031535e-06, "loss": 0.0409, "step": 1829 }, { "epoch": 0.8, "grad_norm": 0.5122818796692326, "learning_rate": 3.8969822043329445e-06, "loss": 0.0533, "step": 1830 }, { "epoch": 0.8, "grad_norm": 0.4374923535087487, "learning_rate": 3.880114207238948e-06, "loss": 0.043, "step": 1831 }, { "epoch": 0.8, "grad_norm": 0.4654118248332772, "learning_rate": 3.8632788732122926e-06, "loss": 0.0522, "step": 1832 }, { "epoch": 0.81, "grad_norm": 0.378301038827158, "learning_rate": 3.84647623636573e-06, "loss": 0.0429, "step": 1833 }, { "epoch": 0.81, "grad_norm": 0.40064393381139224, "learning_rate": 3.829706330745759e-06, "loss": 0.0366, "step": 1834 }, { "epoch": 0.81, "grad_norm": 0.4235197078956278, "learning_rate": 3.812969190332554e-06, "loss": 0.0521, "step": 1835 }, { "epoch": 0.81, "grad_norm": 0.5089575256151787, "learning_rate": 3.7962648490399036e-06, "loss": 0.0505, "step": 1836 }, { "epoch": 0.81, "grad_norm": 0.3857973276637535, "learning_rate": 3.7795933407151243e-06, "loss": 0.0485, "step": 1837 }, { "epoch": 0.81, "grad_norm": 0.4422169389124916, "learning_rate": 3.762954699139032e-06, "loss": 0.052, "step": 1838 }, { "epoch": 0.81, "grad_norm": 0.4352888874241404, "learning_rate": 3.7463489580258116e-06, "loss": 0.0493, "step": 1839 }, { "epoch": 0.81, "grad_norm": 0.4516257773022333, "learning_rate": 3.7297761510229968e-06, "loss": 0.0643, "step": 1840 }, { "epoch": 0.81, "grad_norm": 0.3953510491795897, "learning_rate": 3.7132363117114056e-06, "loss": 0.0423, "step": 1841 }, { "epoch": 0.81, "grad_norm": 0.4131502863406893, "learning_rate": 3.6967294736050363e-06, "loss": 0.0454, "step": 1842 }, { "epoch": 0.81, "grad_norm": 0.3638200967962495, "learning_rate": 3.6802556701510138e-06, "loss": 0.0417, "step": 1843 }, { "epoch": 0.81, "grad_norm": 0.4007087188288603, "learning_rate": 3.6638149347295327e-06, "loss": 0.0386, "step": 1844 }, { "epoch": 0.81, "grad_norm": 0.39474045522818096, "learning_rate": 3.6474073006537957e-06, "loss": 0.0457, "step": 1845 }, { "epoch": 0.81, "grad_norm": 0.4120086800010433, "learning_rate": 3.631032801169925e-06, "loss": 0.0509, "step": 1846 }, { "epoch": 0.81, "grad_norm": 0.43260199733269517, "learning_rate": 3.6146914694568813e-06, "loss": 0.0477, "step": 1847 }, { "epoch": 0.81, "grad_norm": 0.4490116803130414, "learning_rate": 3.5983833386264567e-06, "loss": 0.0502, "step": 1848 }, { "epoch": 0.81, "grad_norm": 0.37310504959035773, "learning_rate": 3.582108441723149e-06, "loss": 0.0423, "step": 1849 }, { "epoch": 0.81, "grad_norm": 0.44756025912902875, "learning_rate": 3.5658668117241145e-06, "loss": 0.0451, "step": 1850 }, { "epoch": 0.81, "grad_norm": 0.38204392332185144, "learning_rate": 3.5496584815391065e-06, "loss": 0.0369, "step": 1851 }, { "epoch": 0.81, "grad_norm": 0.41255236495591563, "learning_rate": 3.533483484010407e-06, "loss": 0.0463, "step": 1852 }, { "epoch": 0.81, "grad_norm": 0.4819075124189869, "learning_rate": 3.51734185191275e-06, "loss": 0.0636, "step": 1853 }, { "epoch": 0.81, "grad_norm": 0.4339873615096895, "learning_rate": 3.501233617953268e-06, "loss": 0.0469, "step": 1854 }, { "epoch": 0.81, "grad_norm": 0.3773324471117594, "learning_rate": 3.4851588147714166e-06, "loss": 0.0476, "step": 1855 }, { "epoch": 0.82, "grad_norm": 0.42440982751537165, "learning_rate": 3.469117474938919e-06, "loss": 0.0475, "step": 1856 }, { "epoch": 0.82, "grad_norm": 0.39510492597138736, "learning_rate": 3.453109630959679e-06, "loss": 0.0451, "step": 1857 }, { "epoch": 0.82, "grad_norm": 0.3819187006318075, "learning_rate": 3.437135315269755e-06, "loss": 0.0356, "step": 1858 }, { "epoch": 0.82, "grad_norm": 0.3724491463581637, "learning_rate": 3.4211945602372354e-06, "loss": 0.0476, "step": 1859 }, { "epoch": 0.82, "grad_norm": 0.45238042363217174, "learning_rate": 3.4052873981622268e-06, "loss": 0.0469, "step": 1860 }, { "epoch": 0.82, "grad_norm": 0.4505154346590953, "learning_rate": 3.3894138612767603e-06, "loss": 0.0537, "step": 1861 }, { "epoch": 0.82, "grad_norm": 0.46610572756071966, "learning_rate": 3.3735739817447488e-06, "loss": 0.0528, "step": 1862 }, { "epoch": 0.82, "grad_norm": 0.38041896298492933, "learning_rate": 3.3577677916618834e-06, "loss": 0.0367, "step": 1863 }, { "epoch": 0.82, "grad_norm": 0.3834466741594489, "learning_rate": 3.341995323055598e-06, "loss": 0.0434, "step": 1864 }, { "epoch": 0.82, "grad_norm": 0.3713129299786089, "learning_rate": 3.3262566078850144e-06, "loss": 0.037, "step": 1865 }, { "epoch": 0.82, "grad_norm": 0.41897967004277087, "learning_rate": 3.310551678040852e-06, "loss": 0.0517, "step": 1866 }, { "epoch": 0.82, "grad_norm": 0.43272434741590343, "learning_rate": 3.2948805653453507e-06, "loss": 0.0406, "step": 1867 }, { "epoch": 0.82, "grad_norm": 0.36492168076154163, "learning_rate": 3.279243301552264e-06, "loss": 0.0373, "step": 1868 }, { "epoch": 0.82, "grad_norm": 0.39378696525476153, "learning_rate": 3.2636399183467394e-06, "loss": 0.0474, "step": 1869 }, { "epoch": 0.82, "grad_norm": 0.40235802708451457, "learning_rate": 3.2480704473452705e-06, "loss": 0.0511, "step": 1870 }, { "epoch": 0.82, "grad_norm": 0.38975781504593515, "learning_rate": 3.232534920095647e-06, "loss": 0.0516, "step": 1871 }, { "epoch": 0.82, "grad_norm": 0.44368889616303125, "learning_rate": 3.217033368076872e-06, "loss": 0.0499, "step": 1872 }, { "epoch": 0.82, "grad_norm": 0.48086669383301245, "learning_rate": 3.2015658226991087e-06, "loss": 0.0443, "step": 1873 }, { "epoch": 0.82, "grad_norm": 0.41360307793209833, "learning_rate": 3.1861323153036116e-06, "loss": 0.0356, "step": 1874 }, { "epoch": 0.82, "grad_norm": 0.4000363154615457, "learning_rate": 3.1707328771626724e-06, "loss": 0.0464, "step": 1875 }, { "epoch": 0.82, "grad_norm": 0.43072933126290375, "learning_rate": 3.1553675394795392e-06, "loss": 0.0505, "step": 1876 }, { "epoch": 0.82, "grad_norm": 0.3887032730992256, "learning_rate": 3.1400363333883722e-06, "loss": 0.0442, "step": 1877 }, { "epoch": 0.82, "grad_norm": 0.3967537364278996, "learning_rate": 3.1247392899541708e-06, "loss": 0.0555, "step": 1878 }, { "epoch": 0.83, "grad_norm": 0.44843123694110154, "learning_rate": 3.109476440172707e-06, "loss": 0.0416, "step": 1879 }, { "epoch": 0.83, "grad_norm": 0.37287153747030255, "learning_rate": 3.0942478149704723e-06, "loss": 0.0386, "step": 1880 }, { "epoch": 0.83, "grad_norm": 0.434570501091734, "learning_rate": 3.079053445204603e-06, "loss": 0.042, "step": 1881 }, { "epoch": 0.83, "grad_norm": 0.42976990689833594, "learning_rate": 3.0638933616628486e-06, "loss": 0.0511, "step": 1882 }, { "epoch": 0.83, "grad_norm": 0.44081827686331637, "learning_rate": 3.048767595063453e-06, "loss": 0.0478, "step": 1883 }, { "epoch": 0.83, "grad_norm": 0.4134780429674835, "learning_rate": 3.0336761760551383e-06, "loss": 0.0418, "step": 1884 }, { "epoch": 0.83, "grad_norm": 0.40063376138899504, "learning_rate": 3.0186191352170423e-06, "loss": 0.0436, "step": 1885 }, { "epoch": 0.83, "grad_norm": 0.4289337830313686, "learning_rate": 3.003596503058632e-06, "loss": 0.0549, "step": 1886 }, { "epoch": 0.83, "grad_norm": 0.38453283868404914, "learning_rate": 2.9886083100196473e-06, "loss": 0.0432, "step": 1887 }, { "epoch": 0.83, "grad_norm": 0.3574559346631503, "learning_rate": 2.9736545864700493e-06, "loss": 0.0353, "step": 1888 }, { "epoch": 0.83, "grad_norm": 0.35556550857443203, "learning_rate": 2.9587353627099723e-06, "loss": 0.0347, "step": 1889 }, { "epoch": 0.83, "grad_norm": 0.39845191705089145, "learning_rate": 2.943850668969628e-06, "loss": 0.0449, "step": 1890 }, { "epoch": 0.83, "grad_norm": 0.4326631508687059, "learning_rate": 2.92900053540925e-06, "loss": 0.049, "step": 1891 }, { "epoch": 0.83, "grad_norm": 0.4719354792862215, "learning_rate": 2.9141849921190735e-06, "loss": 0.0542, "step": 1892 }, { "epoch": 0.83, "grad_norm": 0.41885382091195195, "learning_rate": 2.899404069119225e-06, "loss": 0.0394, "step": 1893 }, { "epoch": 0.83, "grad_norm": 0.3937131879643606, "learning_rate": 2.8846577963596844e-06, "loss": 0.0434, "step": 1894 }, { "epoch": 0.83, "grad_norm": 0.5392894819853435, "learning_rate": 2.8699462037202264e-06, "loss": 0.0509, "step": 1895 }, { "epoch": 0.83, "grad_norm": 0.485942979150038, "learning_rate": 2.85526932101035e-06, "loss": 0.0543, "step": 1896 }, { "epoch": 0.83, "grad_norm": 0.3746383631909582, "learning_rate": 2.840627177969224e-06, "loss": 0.0356, "step": 1897 }, { "epoch": 0.83, "grad_norm": 0.47913263206961837, "learning_rate": 2.826019804265625e-06, "loss": 0.0464, "step": 1898 }, { "epoch": 0.83, "grad_norm": 0.37487236473452124, "learning_rate": 2.8114472294978813e-06, "loss": 0.04, "step": 1899 }, { "epoch": 0.83, "grad_norm": 0.42981939892309745, "learning_rate": 2.7969094831938016e-06, "loss": 0.0406, "step": 1900 }, { "epoch": 0.84, "grad_norm": 0.3877113396625231, "learning_rate": 2.782406594810636e-06, "loss": 0.0411, "step": 1901 }, { "epoch": 0.84, "grad_norm": 0.4327979130128758, "learning_rate": 2.767938593734989e-06, "loss": 0.0488, "step": 1902 }, { "epoch": 0.84, "grad_norm": 0.3981822449087839, "learning_rate": 2.7535055092827857e-06, "loss": 0.0463, "step": 1903 }, { "epoch": 0.84, "grad_norm": 0.43099819032616626, "learning_rate": 2.7391073706991944e-06, "loss": 0.0406, "step": 1904 }, { "epoch": 0.84, "grad_norm": 0.4909250477814849, "learning_rate": 2.7247442071585738e-06, "loss": 0.044, "step": 1905 }, { "epoch": 0.84, "grad_norm": 0.4240927155987829, "learning_rate": 2.710416047764428e-06, "loss": 0.0426, "step": 1906 }, { "epoch": 0.84, "grad_norm": 0.4282657510262557, "learning_rate": 2.6961229215493114e-06, "loss": 0.0367, "step": 1907 }, { "epoch": 0.84, "grad_norm": 0.38682934114151213, "learning_rate": 2.6818648574747984e-06, "loss": 0.0441, "step": 1908 }, { "epoch": 0.84, "grad_norm": 0.42961109522454133, "learning_rate": 2.6676418844314354e-06, "loss": 0.0425, "step": 1909 }, { "epoch": 0.84, "grad_norm": 0.4672532344744944, "learning_rate": 2.6534540312386535e-06, "loss": 0.0576, "step": 1910 }, { "epoch": 0.84, "grad_norm": 0.4083524184567694, "learning_rate": 2.6393013266447054e-06, "loss": 0.0499, "step": 1911 }, { "epoch": 0.84, "grad_norm": 0.4314255674928875, "learning_rate": 2.6251837993266537e-06, "loss": 0.0507, "step": 1912 }, { "epoch": 0.84, "grad_norm": 0.484419205401965, "learning_rate": 2.6111014778902634e-06, "loss": 0.0499, "step": 1913 }, { "epoch": 0.84, "grad_norm": 0.4133331424894713, "learning_rate": 2.597054390869973e-06, "loss": 0.0388, "step": 1914 }, { "epoch": 0.84, "grad_norm": 0.4749220457859551, "learning_rate": 2.5830425667288084e-06, "loss": 0.0609, "step": 1915 }, { "epoch": 0.84, "grad_norm": 0.4322488047327887, "learning_rate": 2.5690660338583718e-06, "loss": 0.0534, "step": 1916 }, { "epoch": 0.84, "grad_norm": 0.4590791644972713, "learning_rate": 2.5551248205787384e-06, "loss": 0.0474, "step": 1917 }, { "epoch": 0.84, "grad_norm": 0.37317321794863456, "learning_rate": 2.541218955138418e-06, "loss": 0.0313, "step": 1918 }, { "epoch": 0.84, "grad_norm": 0.3896185497781714, "learning_rate": 2.527348465714301e-06, "loss": 0.0384, "step": 1919 }, { "epoch": 0.84, "grad_norm": 0.39416112629072775, "learning_rate": 2.513513380411592e-06, "loss": 0.0397, "step": 1920 }, { "epoch": 0.84, "grad_norm": 0.45304532830503375, "learning_rate": 2.4997137272637617e-06, "loss": 0.0549, "step": 1921 }, { "epoch": 0.84, "grad_norm": 0.39789411479229225, "learning_rate": 2.4859495342324857e-06, "loss": 0.0459, "step": 1922 }, { "epoch": 0.84, "grad_norm": 0.3919834358367392, "learning_rate": 2.4722208292075835e-06, "loss": 0.0423, "step": 1923 }, { "epoch": 0.85, "grad_norm": 0.47226366179936147, "learning_rate": 2.458527640006976e-06, "loss": 0.0565, "step": 1924 }, { "epoch": 0.85, "grad_norm": 0.39612252772297946, "learning_rate": 2.4448699943766043e-06, "loss": 0.0454, "step": 1925 }, { "epoch": 0.85, "grad_norm": 0.4369294328846566, "learning_rate": 2.431247919990416e-06, "loss": 0.0443, "step": 1926 }, { "epoch": 0.85, "grad_norm": 0.4260544206844539, "learning_rate": 2.41766144445025e-06, "loss": 0.0501, "step": 1927 }, { "epoch": 0.85, "grad_norm": 0.3813590464435637, "learning_rate": 2.4041105952858313e-06, "loss": 0.0401, "step": 1928 }, { "epoch": 0.85, "grad_norm": 0.41509889059251653, "learning_rate": 2.3905953999546916e-06, "loss": 0.0414, "step": 1929 }, { "epoch": 0.85, "grad_norm": 0.35060314040627233, "learning_rate": 2.3771158858421295e-06, "loss": 0.0511, "step": 1930 }, { "epoch": 0.85, "grad_norm": 0.4759529467935806, "learning_rate": 2.3636720802611257e-06, "loss": 0.0535, "step": 1931 }, { "epoch": 0.85, "grad_norm": 0.4054976245396623, "learning_rate": 2.350264010452312e-06, "loss": 0.054, "step": 1932 }, { "epoch": 0.85, "grad_norm": 0.49260159889574257, "learning_rate": 2.3368917035839234e-06, "loss": 0.05, "step": 1933 }, { "epoch": 0.85, "grad_norm": 0.40584448602588646, "learning_rate": 2.3235551867517205e-06, "loss": 0.0433, "step": 1934 }, { "epoch": 0.85, "grad_norm": 0.43676600874421095, "learning_rate": 2.3102544869789312e-06, "loss": 0.0465, "step": 1935 }, { "epoch": 0.85, "grad_norm": 0.38760349906502345, "learning_rate": 2.296989631216233e-06, "loss": 0.0394, "step": 1936 }, { "epoch": 0.85, "grad_norm": 0.4124214111949858, "learning_rate": 2.2837606463416594e-06, "loss": 0.0468, "step": 1937 }, { "epoch": 0.85, "grad_norm": 0.368747340208903, "learning_rate": 2.2705675591605615e-06, "loss": 0.0449, "step": 1938 }, { "epoch": 0.85, "grad_norm": 0.38096638667772537, "learning_rate": 2.257410396405553e-06, "loss": 0.0431, "step": 1939 }, { "epoch": 0.85, "grad_norm": 0.38841678165168647, "learning_rate": 2.244289184736459e-06, "loss": 0.0413, "step": 1940 }, { "epoch": 0.85, "grad_norm": 0.38674749402428416, "learning_rate": 2.231203950740255e-06, "loss": 0.0422, "step": 1941 }, { "epoch": 0.85, "grad_norm": 0.4596199180720461, "learning_rate": 2.218154720931016e-06, "loss": 0.0521, "step": 1942 }, { "epoch": 0.85, "grad_norm": 0.4725039528373598, "learning_rate": 2.205141521749865e-06, "loss": 0.0442, "step": 1943 }, { "epoch": 0.85, "grad_norm": 0.3416667254475024, "learning_rate": 2.192164379564916e-06, "loss": 0.0365, "step": 1944 }, { "epoch": 0.85, "grad_norm": 0.3880703550991528, "learning_rate": 2.179223320671224e-06, "loss": 0.047, "step": 1945 }, { "epoch": 0.85, "grad_norm": 0.40707965643532495, "learning_rate": 2.16631837129073e-06, "loss": 0.0469, "step": 1946 }, { "epoch": 0.86, "grad_norm": 0.3900276193864703, "learning_rate": 2.153449557572207e-06, "loss": 0.0467, "step": 1947 }, { "epoch": 0.86, "grad_norm": 0.3845385544636131, "learning_rate": 2.140616905591204e-06, "loss": 0.0377, "step": 1948 }, { "epoch": 0.86, "grad_norm": 0.39696601214602495, "learning_rate": 2.1278204413500013e-06, "loss": 0.0371, "step": 1949 }, { "epoch": 0.86, "grad_norm": 0.3892186981164384, "learning_rate": 2.1150601907775602e-06, "loss": 0.0362, "step": 1950 }, { "epoch": 0.86, "grad_norm": 0.37813735894426936, "learning_rate": 2.1023361797294472e-06, "loss": 0.0348, "step": 1951 }, { "epoch": 0.86, "grad_norm": 0.375898377847533, "learning_rate": 2.0896484339878053e-06, "loss": 0.043, "step": 1952 }, { "epoch": 0.86, "grad_norm": 0.4173038399029017, "learning_rate": 2.0769969792613054e-06, "loss": 0.0438, "step": 1953 }, { "epoch": 0.86, "grad_norm": 0.36885328058787953, "learning_rate": 2.0643818411850748e-06, "loss": 0.034, "step": 1954 }, { "epoch": 0.86, "grad_norm": 0.45861290042020336, "learning_rate": 2.051803045320644e-06, "loss": 0.0421, "step": 1955 }, { "epoch": 0.86, "grad_norm": 0.40248992264613026, "learning_rate": 2.0392606171559158e-06, "loss": 0.0386, "step": 1956 }, { "epoch": 0.86, "grad_norm": 0.49794861067237095, "learning_rate": 2.0267545821051083e-06, "loss": 0.0504, "step": 1957 }, { "epoch": 0.86, "grad_norm": 0.3596694196370898, "learning_rate": 2.014284965508688e-06, "loss": 0.0414, "step": 1958 }, { "epoch": 0.86, "grad_norm": 0.3955495553299747, "learning_rate": 2.0018517926333247e-06, "loss": 0.0463, "step": 1959 }, { "epoch": 0.86, "grad_norm": 0.4051759533664561, "learning_rate": 1.9894550886718546e-06, "loss": 0.0369, "step": 1960 }, { "epoch": 0.86, "grad_norm": 0.35946262369454546, "learning_rate": 1.9770948787432154e-06, "loss": 0.0368, "step": 1961 }, { "epoch": 0.86, "grad_norm": 0.4229119061764286, "learning_rate": 1.9647711878923916e-06, "loss": 0.043, "step": 1962 }, { "epoch": 0.86, "grad_norm": 0.4058526186339382, "learning_rate": 1.952484041090379e-06, "loss": 0.0387, "step": 1963 }, { "epoch": 0.86, "grad_norm": 0.3870596490943172, "learning_rate": 1.9402334632341203e-06, "loss": 0.0389, "step": 1964 }, { "epoch": 0.86, "grad_norm": 0.3911309208505601, "learning_rate": 1.9280194791464635e-06, "loss": 0.0366, "step": 1965 }, { "epoch": 0.86, "grad_norm": 0.46592415345979793, "learning_rate": 1.9158421135761054e-06, "loss": 0.0479, "step": 1966 }, { "epoch": 0.86, "grad_norm": 0.41147725951048764, "learning_rate": 1.9037013911975454e-06, "loss": 0.0447, "step": 1967 }, { "epoch": 0.86, "grad_norm": 0.38349893878282637, "learning_rate": 1.8915973366110353e-06, "loss": 0.0373, "step": 1968 }, { "epoch": 0.86, "grad_norm": 0.35406563056309254, "learning_rate": 1.8795299743425222e-06, "loss": 0.0439, "step": 1969 }, { "epoch": 0.87, "grad_norm": 0.43400674345121615, "learning_rate": 1.8674993288436228e-06, "loss": 0.0387, "step": 1970 }, { "epoch": 0.87, "grad_norm": 0.32817572247061033, "learning_rate": 1.8555054244915326e-06, "loss": 0.0328, "step": 1971 }, { "epoch": 0.87, "grad_norm": 0.4403696970884285, "learning_rate": 1.8435482855890118e-06, "loss": 0.0506, "step": 1972 }, { "epoch": 0.87, "grad_norm": 0.3744772406551968, "learning_rate": 1.8316279363643242e-06, "loss": 0.0436, "step": 1973 }, { "epoch": 0.87, "grad_norm": 0.3382783330214544, "learning_rate": 1.8197444009711928e-06, "loss": 0.0418, "step": 1974 }, { "epoch": 0.87, "grad_norm": 0.35833775061095163, "learning_rate": 1.8078977034887369e-06, "loss": 0.0446, "step": 1975 }, { "epoch": 0.87, "grad_norm": 0.36770972187691414, "learning_rate": 1.79608786792143e-06, "loss": 0.0428, "step": 1976 }, { "epoch": 0.87, "grad_norm": 0.3945224685447045, "learning_rate": 1.7843149181990704e-06, "loss": 0.0488, "step": 1977 }, { "epoch": 0.87, "grad_norm": 0.3971400734883753, "learning_rate": 1.7725788781767072e-06, "loss": 0.0517, "step": 1978 }, { "epoch": 0.87, "grad_norm": 0.3497264602066175, "learning_rate": 1.7608797716345894e-06, "loss": 0.0292, "step": 1979 }, { "epoch": 0.87, "grad_norm": 0.45996699562051774, "learning_rate": 1.7492176222781454e-06, "loss": 0.0412, "step": 1980 }, { "epoch": 0.87, "grad_norm": 0.3589193523906705, "learning_rate": 1.7375924537379151e-06, "loss": 0.0318, "step": 1981 }, { "epoch": 0.87, "grad_norm": 0.3465052528499246, "learning_rate": 1.726004289569505e-06, "loss": 0.0461, "step": 1982 }, { "epoch": 0.87, "grad_norm": 0.3832136504645803, "learning_rate": 1.714453153253528e-06, "loss": 0.0439, "step": 1983 }, { "epoch": 0.87, "grad_norm": 0.4196057926494528, "learning_rate": 1.7029390681955927e-06, "loss": 0.0522, "step": 1984 }, { "epoch": 0.87, "grad_norm": 0.40289185975577, "learning_rate": 1.6914620577262163e-06, "loss": 0.0401, "step": 1985 }, { "epoch": 0.87, "grad_norm": 0.3729828185363462, "learning_rate": 1.680022145100797e-06, "loss": 0.0334, "step": 1986 }, { "epoch": 0.87, "grad_norm": 0.4623297822730408, "learning_rate": 1.6686193534995654e-06, "loss": 0.0488, "step": 1987 }, { "epoch": 0.87, "grad_norm": 0.4290332147671506, "learning_rate": 1.6572537060275283e-06, "loss": 0.053, "step": 1988 }, { "epoch": 0.87, "grad_norm": 0.37745747402480895, "learning_rate": 1.6459252257144377e-06, "loss": 0.0413, "step": 1989 }, { "epoch": 0.87, "grad_norm": 0.37067767458860473, "learning_rate": 1.634633935514729e-06, "loss": 0.0335, "step": 1990 }, { "epoch": 0.87, "grad_norm": 0.3708117768990921, "learning_rate": 1.6233798583074856e-06, "loss": 0.0386, "step": 1991 }, { "epoch": 0.88, "grad_norm": 0.4246397553272987, "learning_rate": 1.6121630168963841e-06, "loss": 0.0385, "step": 1992 }, { "epoch": 0.88, "grad_norm": 0.6440608070569691, "learning_rate": 1.600983434009651e-06, "loss": 0.0528, "step": 1993 }, { "epoch": 0.88, "grad_norm": 0.4580607271840697, "learning_rate": 1.5898411323000273e-06, "loss": 0.0501, "step": 1994 }, { "epoch": 0.88, "grad_norm": 0.4574517980572538, "learning_rate": 1.578736134344694e-06, "loss": 0.0403, "step": 1995 }, { "epoch": 0.88, "grad_norm": 0.3972066492286215, "learning_rate": 1.5676684626452576e-06, "loss": 0.0486, "step": 1996 }, { "epoch": 0.88, "grad_norm": 0.44271509688951555, "learning_rate": 1.5566381396276953e-06, "loss": 0.0464, "step": 1997 }, { "epoch": 0.88, "grad_norm": 0.39253058074464087, "learning_rate": 1.545645187642304e-06, "loss": 0.0442, "step": 1998 }, { "epoch": 0.88, "grad_norm": 0.5137598810846755, "learning_rate": 1.5346896289636437e-06, "loss": 0.0545, "step": 1999 }, { "epoch": 0.88, "grad_norm": 0.3538399152082177, "learning_rate": 1.5237714857905174e-06, "loss": 0.0428, "step": 2000 }, { "epoch": 0.88, "grad_norm": 0.37059065194849067, "learning_rate": 1.5128907802459214e-06, "loss": 0.0461, "step": 2001 }, { "epoch": 0.88, "grad_norm": 0.3974122713554775, "learning_rate": 1.5020475343769824e-06, "loss": 0.0483, "step": 2002 }, { "epoch": 0.88, "grad_norm": 0.37535182025895414, "learning_rate": 1.491241770154921e-06, "loss": 0.04, "step": 2003 }, { "epoch": 0.88, "grad_norm": 0.3891218829509565, "learning_rate": 1.4804735094750267e-06, "loss": 0.0471, "step": 2004 }, { "epoch": 0.88, "grad_norm": 0.36069943144197975, "learning_rate": 1.4697427741565817e-06, "loss": 0.0382, "step": 2005 }, { "epoch": 0.88, "grad_norm": 0.33214268777772654, "learning_rate": 1.4590495859428378e-06, "loss": 0.0345, "step": 2006 }, { "epoch": 0.88, "grad_norm": 0.35834402975855073, "learning_rate": 1.4483939665009672e-06, "loss": 0.0531, "step": 2007 }, { "epoch": 0.88, "grad_norm": 0.3803817492891329, "learning_rate": 1.4377759374220169e-06, "loss": 0.0476, "step": 2008 }, { "epoch": 0.88, "grad_norm": 0.4129415850557809, "learning_rate": 1.4271955202208655e-06, "loss": 0.0502, "step": 2009 }, { "epoch": 0.88, "grad_norm": 0.4028800612050954, "learning_rate": 1.4166527363361814e-06, "loss": 0.0446, "step": 2010 }, { "epoch": 0.88, "grad_norm": 0.3579484989294109, "learning_rate": 1.406147607130377e-06, "loss": 0.0344, "step": 2011 }, { "epoch": 0.88, "grad_norm": 0.45241439939635664, "learning_rate": 1.3956801538895692e-06, "loss": 0.0366, "step": 2012 }, { "epoch": 0.88, "grad_norm": 0.3642799432081005, "learning_rate": 1.385250397823532e-06, "loss": 0.0446, "step": 2013 }, { "epoch": 0.88, "grad_norm": 0.43645615027100976, "learning_rate": 1.3748583600656517e-06, "loss": 0.0507, "step": 2014 }, { "epoch": 0.89, "grad_norm": 0.42004834421964643, "learning_rate": 1.3645040616728934e-06, "loss": 0.0444, "step": 2015 }, { "epoch": 0.89, "grad_norm": 0.38645359543657704, "learning_rate": 1.3541875236257473e-06, "loss": 0.0451, "step": 2016 }, { "epoch": 0.89, "grad_norm": 0.4906061290343345, "learning_rate": 1.3439087668281947e-06, "loss": 0.0447, "step": 2017 }, { "epoch": 0.89, "grad_norm": 0.35606602385486186, "learning_rate": 1.3336678121076663e-06, "loss": 0.0386, "step": 2018 }, { "epoch": 0.89, "grad_norm": 0.3321245399015353, "learning_rate": 1.3234646802149853e-06, "loss": 0.0391, "step": 2019 }, { "epoch": 0.89, "grad_norm": 0.35822697417592236, "learning_rate": 1.313299391824341e-06, "loss": 0.034, "step": 2020 }, { "epoch": 0.89, "grad_norm": 0.4295086054766953, "learning_rate": 1.3031719675332455e-06, "loss": 0.0432, "step": 2021 }, { "epoch": 0.89, "grad_norm": 0.41171488027183667, "learning_rate": 1.293082427862491e-06, "loss": 0.0412, "step": 2022 }, { "epoch": 0.89, "grad_norm": 0.34597923545670567, "learning_rate": 1.2830307932560882e-06, "loss": 0.0377, "step": 2023 }, { "epoch": 0.89, "grad_norm": 0.40930084765407304, "learning_rate": 1.273017084081265e-06, "loss": 0.0473, "step": 2024 }, { "epoch": 0.89, "grad_norm": 0.4301712607450389, "learning_rate": 1.2630413206283864e-06, "loss": 0.0452, "step": 2025 }, { "epoch": 0.89, "grad_norm": 0.4273948541243094, "learning_rate": 1.2531035231109413e-06, "loss": 0.043, "step": 2026 }, { "epoch": 0.89, "grad_norm": 0.374819573071526, "learning_rate": 1.2432037116654727e-06, "loss": 0.0483, "step": 2027 }, { "epoch": 0.89, "grad_norm": 0.35139130556752324, "learning_rate": 1.233341906351575e-06, "loss": 0.0328, "step": 2028 }, { "epoch": 0.89, "grad_norm": 0.41124028049450706, "learning_rate": 1.2235181271518192e-06, "loss": 0.0349, "step": 2029 }, { "epoch": 0.89, "grad_norm": 0.3531005593509344, "learning_rate": 1.2137323939717272e-06, "loss": 0.0448, "step": 2030 }, { "epoch": 0.89, "grad_norm": 0.36039543401692087, "learning_rate": 1.2039847266397332e-06, "loss": 0.0449, "step": 2031 }, { "epoch": 0.89, "grad_norm": 0.34897631864554796, "learning_rate": 1.1942751449071354e-06, "loss": 0.0358, "step": 2032 }, { "epoch": 0.89, "grad_norm": 0.37259462914734676, "learning_rate": 1.1846036684480654e-06, "loss": 0.0377, "step": 2033 }, { "epoch": 0.89, "grad_norm": 0.3700949830815237, "learning_rate": 1.1749703168594406e-06, "loss": 0.0329, "step": 2034 }, { "epoch": 0.89, "grad_norm": 0.41362375730602796, "learning_rate": 1.1653751096609312e-06, "loss": 0.038, "step": 2035 }, { "epoch": 0.89, "grad_norm": 0.38910026520205565, "learning_rate": 1.1558180662949092e-06, "loss": 0.0309, "step": 2036 }, { "epoch": 0.89, "grad_norm": 0.3475703229690883, "learning_rate": 1.1462992061264243e-06, "loss": 0.0376, "step": 2037 }, { "epoch": 0.9, "grad_norm": 0.39511137508806776, "learning_rate": 1.1368185484431615e-06, "loss": 0.0471, "step": 2038 }, { "epoch": 0.9, "grad_norm": 0.4225325757791538, "learning_rate": 1.1273761124553805e-06, "loss": 0.0512, "step": 2039 }, { "epoch": 0.9, "grad_norm": 0.4350542267345797, "learning_rate": 1.117971917295908e-06, "loss": 0.0519, "step": 2040 }, { "epoch": 0.9, "grad_norm": 0.42430896115987093, "learning_rate": 1.1086059820200812e-06, "loss": 0.0389, "step": 2041 }, { "epoch": 0.9, "grad_norm": 0.40798068031682555, "learning_rate": 1.0992783256057171e-06, "loss": 0.0446, "step": 2042 }, { "epoch": 0.9, "grad_norm": 0.3452693567035925, "learning_rate": 1.0899889669530594e-06, "loss": 0.039, "step": 2043 }, { "epoch": 0.9, "grad_norm": 0.3496094282778907, "learning_rate": 1.0807379248847583e-06, "loss": 0.0395, "step": 2044 }, { "epoch": 0.9, "grad_norm": 0.385984282032745, "learning_rate": 1.0715252181458259e-06, "loss": 0.0561, "step": 2045 }, { "epoch": 0.9, "grad_norm": 0.43211780253654497, "learning_rate": 1.0623508654035963e-06, "loss": 0.0424, "step": 2046 }, { "epoch": 0.9, "grad_norm": 0.42331669974063685, "learning_rate": 1.0532148852476754e-06, "loss": 0.0412, "step": 2047 }, { "epoch": 0.9, "grad_norm": 0.4862566363021539, "learning_rate": 1.0441172961899392e-06, "loss": 0.0556, "step": 2048 }, { "epoch": 0.9, "grad_norm": 0.36993631256191356, "learning_rate": 1.0350581166644558e-06, "loss": 0.0413, "step": 2049 }, { "epoch": 0.9, "grad_norm": 0.48071110823394325, "learning_rate": 1.0260373650274724e-06, "loss": 0.0458, "step": 2050 }, { "epoch": 0.9, "grad_norm": 0.39874937903496394, "learning_rate": 1.0170550595573726e-06, "loss": 0.0375, "step": 2051 }, { "epoch": 0.9, "grad_norm": 0.44098021615993627, "learning_rate": 1.0081112184546305e-06, "loss": 0.0526, "step": 2052 }, { "epoch": 0.9, "grad_norm": 0.3674076718848533, "learning_rate": 9.99205859841792e-07, "loss": 0.0377, "step": 2053 }, { "epoch": 0.9, "grad_norm": 0.430598904963712, "learning_rate": 9.903390017634185e-07, "loss": 0.0488, "step": 2054 }, { "epoch": 0.9, "grad_norm": 0.3336040495236211, "learning_rate": 9.815106621860626e-07, "loss": 0.0449, "step": 2055 }, { "epoch": 0.9, "grad_norm": 0.394328603539326, "learning_rate": 9.727208589982329e-07, "loss": 0.0493, "step": 2056 }, { "epoch": 0.9, "grad_norm": 0.3514134938980543, "learning_rate": 9.639696100103468e-07, "loss": 0.0336, "step": 2057 }, { "epoch": 0.9, "grad_norm": 0.42400017335838697, "learning_rate": 9.552569329547023e-07, "loss": 0.0416, "step": 2058 }, { "epoch": 0.9, "grad_norm": 0.39751289646652754, "learning_rate": 9.46582845485442e-07, "loss": 0.0346, "step": 2059 }, { "epoch": 0.9, "grad_norm": 0.43341385486027845, "learning_rate": 9.379473651785176e-07, "loss": 0.0468, "step": 2060 }, { "epoch": 0.91, "grad_norm": 0.3713273567947798, "learning_rate": 9.293505095316479e-07, "loss": 0.0314, "step": 2061 }, { "epoch": 0.91, "grad_norm": 0.36315335309138874, "learning_rate": 9.207922959642968e-07, "loss": 0.0398, "step": 2062 }, { "epoch": 0.91, "grad_norm": 0.38912337642690015, "learning_rate": 9.122727418176169e-07, "loss": 0.0356, "step": 2063 }, { "epoch": 0.91, "grad_norm": 0.37668786079284394, "learning_rate": 9.03791864354433e-07, "loss": 0.0389, "step": 2064 }, { "epoch": 0.91, "grad_norm": 0.3732829320717202, "learning_rate": 8.953496807592055e-07, "loss": 0.0367, "step": 2065 }, { "epoch": 0.91, "grad_norm": 0.3539261936521951, "learning_rate": 8.869462081379909e-07, "loss": 0.0385, "step": 2066 }, { "epoch": 0.91, "grad_norm": 0.37350914559153053, "learning_rate": 8.785814635183931e-07, "loss": 0.0386, "step": 2067 }, { "epoch": 0.91, "grad_norm": 0.3746369909582848, "learning_rate": 8.702554638495564e-07, "loss": 0.0407, "step": 2068 }, { "epoch": 0.91, "grad_norm": 0.5182295244940923, "learning_rate": 8.619682260021167e-07, "loss": 0.0515, "step": 2069 }, { "epoch": 0.91, "grad_norm": 0.3682858293561788, "learning_rate": 8.537197667681707e-07, "loss": 0.0435, "step": 2070 }, { "epoch": 0.91, "grad_norm": 0.45512024844129645, "learning_rate": 8.455101028612222e-07, "loss": 0.0478, "step": 2071 }, { "epoch": 0.91, "grad_norm": 0.3760662417778014, "learning_rate": 8.373392509161893e-07, "loss": 0.0352, "step": 2072 }, { "epoch": 0.91, "grad_norm": 0.4184073882774479, "learning_rate": 8.292072274893326e-07, "loss": 0.0532, "step": 2073 }, { "epoch": 0.91, "grad_norm": 0.3690143147085228, "learning_rate": 8.211140490582381e-07, "loss": 0.039, "step": 2074 }, { "epoch": 0.91, "grad_norm": 0.43741235536333045, "learning_rate": 8.130597320217859e-07, "loss": 0.0441, "step": 2075 }, { "epoch": 0.91, "grad_norm": 0.3906754461139806, "learning_rate": 8.050442927001101e-07, "loss": 0.0415, "step": 2076 }, { "epoch": 0.91, "grad_norm": 0.38192552847211725, "learning_rate": 7.970677473345678e-07, "loss": 0.0534, "step": 2077 }, { "epoch": 0.91, "grad_norm": 0.3479313351438078, "learning_rate": 7.891301120877082e-07, "loss": 0.0333, "step": 2078 }, { "epoch": 0.91, "grad_norm": 0.37684849290008654, "learning_rate": 7.812314030432344e-07, "loss": 0.0312, "step": 2079 }, { "epoch": 0.91, "grad_norm": 0.3623058185405505, "learning_rate": 7.73371636205984e-07, "loss": 0.043, "step": 2080 }, { "epoch": 0.91, "grad_norm": 0.3726777852231792, "learning_rate": 7.655508275018752e-07, "loss": 0.0392, "step": 2081 }, { "epoch": 0.91, "grad_norm": 0.3726066668331198, "learning_rate": 7.577689927779052e-07, "loss": 0.0383, "step": 2082 }, { "epoch": 0.92, "grad_norm": 0.3765079656660046, "learning_rate": 7.500261478020787e-07, "loss": 0.0379, "step": 2083 }, { "epoch": 0.92, "grad_norm": 0.4127494397393531, "learning_rate": 7.423223082634079e-07, "loss": 0.0439, "step": 2084 }, { "epoch": 0.92, "grad_norm": 0.39195091284952044, "learning_rate": 7.346574897718684e-07, "loss": 0.0444, "step": 2085 }, { "epoch": 0.92, "grad_norm": 0.40759690457718517, "learning_rate": 7.270317078583766e-07, "loss": 0.0492, "step": 2086 }, { "epoch": 0.92, "grad_norm": 0.3992614641126156, "learning_rate": 7.194449779747347e-07, "loss": 0.0402, "step": 2087 }, { "epoch": 0.92, "grad_norm": 0.3399828921950648, "learning_rate": 7.118973154936259e-07, "loss": 0.035, "step": 2088 }, { "epoch": 0.92, "grad_norm": 0.4082495010534112, "learning_rate": 7.043887357085744e-07, "loss": 0.0388, "step": 2089 }, { "epoch": 0.92, "grad_norm": 0.4035051305685771, "learning_rate": 6.969192538339098e-07, "loss": 0.0448, "step": 2090 }, { "epoch": 0.92, "grad_norm": 0.4235368070247004, "learning_rate": 6.894888850047321e-07, "loss": 0.0483, "step": 2091 }, { "epoch": 0.92, "grad_norm": 0.42783286645977986, "learning_rate": 6.820976442769023e-07, "loss": 0.0478, "step": 2092 }, { "epoch": 0.92, "grad_norm": 0.39381646094958495, "learning_rate": 6.74745546626987e-07, "loss": 0.0452, "step": 2093 }, { "epoch": 0.92, "grad_norm": 0.4039698582041848, "learning_rate": 6.67432606952243e-07, "loss": 0.0435, "step": 2094 }, { "epoch": 0.92, "grad_norm": 0.39894443874549185, "learning_rate": 6.601588400705749e-07, "loss": 0.0398, "step": 2095 }, { "epoch": 0.92, "grad_norm": 0.3652072019209512, "learning_rate": 6.529242607205288e-07, "loss": 0.04, "step": 2096 }, { "epoch": 0.92, "grad_norm": 0.402734067160103, "learning_rate": 6.457288835612363e-07, "loss": 0.0357, "step": 2097 }, { "epoch": 0.92, "grad_norm": 0.37330237222265716, "learning_rate": 6.385727231723927e-07, "loss": 0.0355, "step": 2098 }, { "epoch": 0.92, "grad_norm": 0.39711418303543966, "learning_rate": 6.314557940542388e-07, "loss": 0.0493, "step": 2099 }, { "epoch": 0.92, "grad_norm": 0.38007936075603643, "learning_rate": 6.243781106275148e-07, "loss": 0.0336, "step": 2100 }, { "epoch": 0.92, "grad_norm": 0.3816871060322049, "learning_rate": 6.173396872334402e-07, "loss": 0.046, "step": 2101 }, { "epoch": 0.92, "grad_norm": 0.3660251807529797, "learning_rate": 6.103405381336913e-07, "loss": 0.0514, "step": 2102 }, { "epoch": 0.92, "grad_norm": 0.3906599029206667, "learning_rate": 6.033806775103545e-07, "loss": 0.0366, "step": 2103 }, { "epoch": 0.92, "grad_norm": 0.3336244171483368, "learning_rate": 5.964601194659114e-07, "loss": 0.0361, "step": 2104 }, { "epoch": 0.92, "grad_norm": 0.3501479832270445, "learning_rate": 5.89578878023207e-07, "loss": 0.0394, "step": 2105 }, { "epoch": 0.93, "grad_norm": 0.4133526457356156, "learning_rate": 5.827369671254234e-07, "loss": 0.0356, "step": 2106 }, { "epoch": 0.93, "grad_norm": 0.3647256664188715, "learning_rate": 5.759344006360424e-07, "loss": 0.0347, "step": 2107 }, { "epoch": 0.93, "grad_norm": 0.38153247530791445, "learning_rate": 5.691711923388244e-07, "loss": 0.0431, "step": 2108 }, { "epoch": 0.93, "grad_norm": 0.393730862470311, "learning_rate": 5.624473559377874e-07, "loss": 0.0389, "step": 2109 }, { "epoch": 0.93, "grad_norm": 0.4578178678855361, "learning_rate": 5.55762905057169e-07, "loss": 0.0456, "step": 2110 }, { "epoch": 0.93, "grad_norm": 0.40474020566865054, "learning_rate": 5.491178532413944e-07, "loss": 0.0379, "step": 2111 }, { "epoch": 0.93, "grad_norm": 0.42306458567340355, "learning_rate": 5.425122139550621e-07, "loss": 0.0478, "step": 2112 }, { "epoch": 0.93, "grad_norm": 0.47477210751806853, "learning_rate": 5.359460005829098e-07, "loss": 0.0481, "step": 2113 }, { "epoch": 0.93, "grad_norm": 0.35595644517805836, "learning_rate": 5.294192264297926e-07, "loss": 0.0364, "step": 2114 }, { "epoch": 0.93, "grad_norm": 0.3608796919731402, "learning_rate": 5.229319047206361e-07, "loss": 0.0337, "step": 2115 }, { "epoch": 0.93, "grad_norm": 0.39282133625079424, "learning_rate": 5.164840486004475e-07, "loss": 0.0408, "step": 2116 }, { "epoch": 0.93, "grad_norm": 0.4058159747154884, "learning_rate": 5.100756711342469e-07, "loss": 0.0409, "step": 2117 }, { "epoch": 0.93, "grad_norm": 0.49653724310892566, "learning_rate": 5.037067853070743e-07, "loss": 0.0432, "step": 2118 }, { "epoch": 0.93, "grad_norm": 0.3714615254239154, "learning_rate": 4.973774040239377e-07, "loss": 0.0384, "step": 2119 }, { "epoch": 0.93, "grad_norm": 0.4618872539974985, "learning_rate": 4.91087540109807e-07, "loss": 0.0529, "step": 2120 }, { "epoch": 0.93, "grad_norm": 0.5019407159481163, "learning_rate": 4.848372063095785e-07, "loss": 0.0514, "step": 2121 }, { "epoch": 0.93, "grad_norm": 0.3514482418462704, "learning_rate": 4.78626415288046e-07, "loss": 0.0417, "step": 2122 }, { "epoch": 0.93, "grad_norm": 0.34749636027261865, "learning_rate": 4.7245517962988485e-07, "loss": 0.0317, "step": 2123 }, { "epoch": 0.93, "grad_norm": 0.39070077161227, "learning_rate": 4.663235118396148e-07, "loss": 0.0376, "step": 2124 }, { "epoch": 0.93, "grad_norm": 0.42724640764497396, "learning_rate": 4.6023142434158617e-07, "loss": 0.0483, "step": 2125 }, { "epoch": 0.93, "grad_norm": 0.4173245858405827, "learning_rate": 4.54178929479947e-07, "loss": 0.0418, "step": 2126 }, { "epoch": 0.93, "grad_norm": 0.37950183818834343, "learning_rate": 4.481660395186227e-07, "loss": 0.0453, "step": 2127 }, { "epoch": 0.93, "grad_norm": 0.42450250861771244, "learning_rate": 4.421927666412851e-07, "loss": 0.0463, "step": 2128 }, { "epoch": 0.94, "grad_norm": 0.40168832892983347, "learning_rate": 4.362591229513324e-07, "loss": 0.0483, "step": 2129 }, { "epoch": 0.94, "grad_norm": 0.3781506781883569, "learning_rate": 4.303651204718695e-07, "loss": 0.046, "step": 2130 }, { "epoch": 0.94, "grad_norm": 0.3509469365271963, "learning_rate": 4.245107711456675e-07, "loss": 0.0424, "step": 2131 }, { "epoch": 0.94, "grad_norm": 0.4030341179003194, "learning_rate": 4.186960868351575e-07, "loss": 0.0402, "step": 2132 }, { "epoch": 0.94, "grad_norm": 0.35841005810600346, "learning_rate": 4.1292107932240146e-07, "loss": 0.0372, "step": 2133 }, { "epoch": 0.94, "grad_norm": 0.3856517830671835, "learning_rate": 4.07185760309059e-07, "loss": 0.0407, "step": 2134 }, { "epoch": 0.94, "grad_norm": 0.4056888703138134, "learning_rate": 4.0149014141636965e-07, "loss": 0.0377, "step": 2135 }, { "epoch": 0.94, "grad_norm": 0.37999125602349343, "learning_rate": 3.9583423418513734e-07, "loss": 0.0488, "step": 2136 }, { "epoch": 0.94, "grad_norm": 0.3848530432833172, "learning_rate": 3.9021805007569693e-07, "loss": 0.037, "step": 2137 }, { "epoch": 0.94, "grad_norm": 0.3293412804218251, "learning_rate": 3.8464160046789435e-07, "loss": 0.0292, "step": 2138 }, { "epoch": 0.94, "grad_norm": 0.3972367162698117, "learning_rate": 3.7910489666105556e-07, "loss": 0.0423, "step": 2139 }, { "epoch": 0.94, "grad_norm": 0.3736620120390431, "learning_rate": 3.73607949873982e-07, "loss": 0.0352, "step": 2140 }, { "epoch": 0.94, "grad_norm": 0.37343712810673413, "learning_rate": 3.6815077124491504e-07, "loss": 0.0401, "step": 2141 }, { "epoch": 0.94, "grad_norm": 0.3778437041024785, "learning_rate": 3.6273337183151183e-07, "loss": 0.0445, "step": 2142 }, { "epoch": 0.94, "grad_norm": 0.3830234958174708, "learning_rate": 3.5735576261082484e-07, "loss": 0.0353, "step": 2143 }, { "epoch": 0.94, "grad_norm": 0.44130197491781986, "learning_rate": 3.5201795447928675e-07, "loss": 0.0406, "step": 2144 }, { "epoch": 0.94, "grad_norm": 0.40167937006720356, "learning_rate": 3.4671995825268145e-07, "loss": 0.0473, "step": 2145 }, { "epoch": 0.94, "grad_norm": 0.4358159418751104, "learning_rate": 3.414617846661217e-07, "loss": 0.0428, "step": 2146 }, { "epoch": 0.94, "grad_norm": 0.3566047625356842, "learning_rate": 3.362434443740292e-07, "loss": 0.0388, "step": 2147 }, { "epoch": 0.94, "grad_norm": 0.4509704602647693, "learning_rate": 3.310649479501171e-07, "loss": 0.0428, "step": 2148 }, { "epoch": 0.94, "grad_norm": 0.3968323369261339, "learning_rate": 3.259263058873563e-07, "loss": 0.0523, "step": 2149 }, { "epoch": 0.94, "grad_norm": 0.3611410326767456, "learning_rate": 3.208275285979734e-07, "loss": 0.0344, "step": 2150 }, { "epoch": 0.94, "grad_norm": 0.44573347371156985, "learning_rate": 3.1576862641341076e-07, "loss": 0.0381, "step": 2151 }, { "epoch": 0.95, "grad_norm": 0.3451213791079416, "learning_rate": 3.107496095843154e-07, "loss": 0.0398, "step": 2152 }, { "epoch": 0.95, "grad_norm": 0.41642347232208377, "learning_rate": 3.057704882805146e-07, "loss": 0.0365, "step": 2153 }, { "epoch": 0.95, "grad_norm": 0.3667901506711488, "learning_rate": 3.008312725910023e-07, "loss": 0.0312, "step": 2154 }, { "epoch": 0.95, "grad_norm": 0.47065992671552953, "learning_rate": 2.959319725239085e-07, "loss": 0.0409, "step": 2155 }, { "epoch": 0.95, "grad_norm": 0.40225855842053976, "learning_rate": 2.910725980064832e-07, "loss": 0.0499, "step": 2156 }, { "epoch": 0.95, "grad_norm": 0.432181748429848, "learning_rate": 2.8625315888508144e-07, "loss": 0.0521, "step": 2157 }, { "epoch": 0.95, "grad_norm": 0.37772774044549795, "learning_rate": 2.8147366492513817e-07, "loss": 0.0423, "step": 2158 }, { "epoch": 0.95, "grad_norm": 0.37219872849291813, "learning_rate": 2.7673412581114003e-07, "loss": 0.0402, "step": 2159 }, { "epoch": 0.95, "grad_norm": 0.46251010948164023, "learning_rate": 2.7203455114662937e-07, "loss": 0.0485, "step": 2160 }, { "epoch": 0.95, "grad_norm": 0.39716016556798694, "learning_rate": 2.6737495045415784e-07, "loss": 0.0416, "step": 2161 }, { "epoch": 0.95, "grad_norm": 0.36134513608737084, "learning_rate": 2.6275533317528634e-07, "loss": 0.0391, "step": 2162 }, { "epoch": 0.95, "grad_norm": 0.36448899365514836, "learning_rate": 2.5817570867055606e-07, "loss": 0.0352, "step": 2163 }, { "epoch": 0.95, "grad_norm": 0.47671454771297106, "learning_rate": 2.536360862194753e-07, "loss": 0.0508, "step": 2164 }, { "epoch": 0.95, "grad_norm": 0.3972364724414506, "learning_rate": 2.4913647502049496e-07, "loss": 0.04, "step": 2165 }, { "epoch": 0.95, "grad_norm": 0.4326171875, "learning_rate": 2.44676884190993e-07, "loss": 0.0489, "step": 2166 }, { "epoch": 0.95, "grad_norm": 0.391693370835355, "learning_rate": 2.402573227672589e-07, "loss": 0.0435, "step": 2167 }, { "epoch": 0.95, "grad_norm": 0.36070699143150026, "learning_rate": 2.3587779970446478e-07, "loss": 0.0419, "step": 2168 }, { "epoch": 0.95, "grad_norm": 0.34756975490980907, "learning_rate": 2.3153832387666552e-07, "loss": 0.0307, "step": 2169 }, { "epoch": 0.95, "grad_norm": 0.49555106028798557, "learning_rate": 2.2723890407675864e-07, "loss": 0.0525, "step": 2170 }, { "epoch": 0.95, "grad_norm": 0.3844842801942118, "learning_rate": 2.2297954901648655e-07, "loss": 0.0465, "step": 2171 }, { "epoch": 0.95, "grad_norm": 0.3942759414839151, "learning_rate": 2.1876026732640777e-07, "loss": 0.0524, "step": 2172 }, { "epoch": 0.95, "grad_norm": 0.39074871011644363, "learning_rate": 2.1458106755587682e-07, "loss": 0.0548, "step": 2173 }, { "epoch": 0.95, "grad_norm": 0.4364771295686608, "learning_rate": 2.1044195817303992e-07, "loss": 0.0428, "step": 2174 }, { "epoch": 0.96, "grad_norm": 0.4427888366956021, "learning_rate": 2.0634294756480377e-07, "loss": 0.0372, "step": 2175 }, { "epoch": 0.96, "grad_norm": 0.33529004091984516, "learning_rate": 2.0228404403682456e-07, "loss": 0.031, "step": 2176 }, { "epoch": 0.96, "grad_norm": 0.4135179177326444, "learning_rate": 1.9826525581349675e-07, "loss": 0.0368, "step": 2177 }, { "epoch": 0.96, "grad_norm": 0.33990698259012886, "learning_rate": 1.9428659103792436e-07, "loss": 0.0371, "step": 2178 }, { "epoch": 0.96, "grad_norm": 0.4149975206117217, "learning_rate": 1.9034805777191634e-07, "loss": 0.0425, "step": 2179 }, { "epoch": 0.96, "grad_norm": 0.41887256906658354, "learning_rate": 1.8644966399595566e-07, "loss": 0.0448, "step": 2180 }, { "epoch": 0.96, "grad_norm": 0.37201588149932896, "learning_rate": 1.8259141760920807e-07, "loss": 0.0402, "step": 2181 }, { "epoch": 0.96, "grad_norm": 0.3716253989699012, "learning_rate": 1.787733264294733e-07, "loss": 0.0464, "step": 2182 }, { "epoch": 0.96, "grad_norm": 0.4102387481555161, "learning_rate": 1.7499539819319623e-07, "loss": 0.0383, "step": 2183 }, { "epoch": 0.96, "grad_norm": 0.37423672085419746, "learning_rate": 1.7125764055544225e-07, "loss": 0.0451, "step": 2184 }, { "epoch": 0.96, "grad_norm": 0.43550511660345786, "learning_rate": 1.675600610898709e-07, "loss": 0.0437, "step": 2185 }, { "epoch": 0.96, "grad_norm": 0.456637547126102, "learning_rate": 1.6390266728874227e-07, "loss": 0.0424, "step": 2186 }, { "epoch": 0.96, "grad_norm": 0.3943714346928557, "learning_rate": 1.6028546656287947e-07, "loss": 0.0404, "step": 2187 }, { "epoch": 0.96, "grad_norm": 0.386966144536053, "learning_rate": 1.5670846624167512e-07, "loss": 0.0381, "step": 2188 }, { "epoch": 0.96, "grad_norm": 0.36220746650988483, "learning_rate": 1.531716735730515e-07, "loss": 0.041, "step": 2189 }, { "epoch": 0.96, "grad_norm": 0.36312491631958305, "learning_rate": 1.4967509572347384e-07, "loss": 0.0334, "step": 2190 }, { "epoch": 0.96, "grad_norm": 0.362846154977465, "learning_rate": 1.4621873977791246e-07, "loss": 0.0362, "step": 2191 }, { "epoch": 0.96, "grad_norm": 0.34242943132450154, "learning_rate": 1.4280261273984075e-07, "loss": 0.0465, "step": 2192 }, { "epoch": 0.96, "grad_norm": 0.37663302888867783, "learning_rate": 1.3942672153121728e-07, "loss": 0.0478, "step": 2193 }, { "epoch": 0.96, "grad_norm": 0.41585942591040337, "learning_rate": 1.360910729924725e-07, "loss": 0.039, "step": 2194 }, { "epoch": 0.96, "grad_norm": 0.3987552086720145, "learning_rate": 1.3279567388249536e-07, "loss": 0.0367, "step": 2195 }, { "epoch": 0.96, "grad_norm": 0.4428620426889157, "learning_rate": 1.2954053087862018e-07, "loss": 0.0558, "step": 2196 }, { "epoch": 0.97, "grad_norm": 0.3829874106210562, "learning_rate": 1.2632565057660862e-07, "loss": 0.0373, "step": 2197 }, { "epoch": 0.97, "grad_norm": 0.3962153757774383, "learning_rate": 1.2315103949064322e-07, "loss": 0.0403, "step": 2198 }, { "epoch": 0.97, "grad_norm": 0.4444801180483453, "learning_rate": 1.2001670405330735e-07, "loss": 0.0428, "step": 2199 }, { "epoch": 0.97, "grad_norm": 0.4608567862236696, "learning_rate": 1.169226506155785e-07, "loss": 0.0458, "step": 2200 }, { "epoch": 0.97, "grad_norm": 0.43077169098485574, "learning_rate": 1.138688854468084e-07, "loss": 0.0443, "step": 2201 }, { "epoch": 0.97, "grad_norm": 0.32832541475977833, "learning_rate": 1.1085541473472294e-07, "loss": 0.0397, "step": 2202 }, { "epoch": 0.97, "grad_norm": 0.397855576807219, "learning_rate": 1.0788224458538665e-07, "loss": 0.0499, "step": 2203 }, { "epoch": 0.97, "grad_norm": 0.4302144374214242, "learning_rate": 1.0494938102321827e-07, "loss": 0.0548, "step": 2204 }, { "epoch": 0.97, "grad_norm": 0.3830795329871273, "learning_rate": 1.0205682999095967e-07, "loss": 0.0364, "step": 2205 }, { "epoch": 0.97, "grad_norm": 0.3714232136692346, "learning_rate": 9.920459734966914e-08, "loss": 0.0368, "step": 2206 }, { "epoch": 0.97, "grad_norm": 0.37858859245638166, "learning_rate": 9.639268887870589e-08, "loss": 0.0462, "step": 2207 }, { "epoch": 0.97, "grad_norm": 0.4120367628010104, "learning_rate": 9.362111027572785e-08, "loss": 0.0452, "step": 2208 }, { "epoch": 0.97, "grad_norm": 0.3611897177706527, "learning_rate": 9.088986715667159e-08, "loss": 0.0349, "step": 2209 }, { "epoch": 0.97, "grad_norm": 0.44895101114956426, "learning_rate": 8.819896505574354e-08, "loss": 0.0486, "step": 2210 }, { "epoch": 0.97, "grad_norm": 0.3816381659374841, "learning_rate": 8.55484094254111e-08, "loss": 0.0436, "step": 2211 }, { "epoch": 0.97, "grad_norm": 0.42831063318339757, "learning_rate": 8.293820563638477e-08, "loss": 0.045, "step": 2212 }, { "epoch": 0.97, "grad_norm": 0.38434401519068995, "learning_rate": 8.036835897761164e-08, "loss": 0.0378, "step": 2213 }, { "epoch": 0.97, "grad_norm": 0.4004487530452058, "learning_rate": 7.783887465627305e-08, "loss": 0.0406, "step": 2214 }, { "epoch": 0.97, "grad_norm": 0.3641943582282156, "learning_rate": 7.534975779775577e-08, "loss": 0.0399, "step": 2215 }, { "epoch": 0.97, "grad_norm": 0.3555677506636987, "learning_rate": 7.290101344565648e-08, "loss": 0.0345, "step": 2216 }, { "epoch": 0.97, "grad_norm": 0.3676569757806996, "learning_rate": 7.049264656176613e-08, "loss": 0.0361, "step": 2217 }, { "epoch": 0.97, "grad_norm": 0.37000653467336053, "learning_rate": 6.812466202606117e-08, "loss": 0.0426, "step": 2218 }, { "epoch": 0.97, "grad_norm": 0.42905572048201757, "learning_rate": 6.579706463669234e-08, "loss": 0.0483, "step": 2219 }, { "epoch": 0.98, "grad_norm": 0.39150556495945044, "learning_rate": 6.350985910997364e-08, "loss": 0.0326, "step": 2220 }, { "epoch": 0.98, "grad_norm": 0.3828744838124531, "learning_rate": 6.126305008037791e-08, "loss": 0.0325, "step": 2221 }, { "epoch": 0.98, "grad_norm": 0.500288403304247, "learning_rate": 5.90566421005212e-08, "loss": 0.0577, "step": 2222 }, { "epoch": 0.98, "grad_norm": 0.429255840919404, "learning_rate": 5.6890639641160594e-08, "loss": 0.0461, "step": 2223 }, { "epoch": 0.98, "grad_norm": 0.36787070974841835, "learning_rate": 5.476504709117425e-08, "loss": 0.0354, "step": 2224 }, { "epoch": 0.98, "grad_norm": 0.381303401240094, "learning_rate": 5.2679868757568034e-08, "loss": 0.041, "step": 2225 }, { "epoch": 0.98, "grad_norm": 0.3751556550275032, "learning_rate": 5.063510886545331e-08, "loss": 0.0441, "step": 2226 }, { "epoch": 0.98, "grad_norm": 0.4633457988010123, "learning_rate": 4.8630771558040304e-08, "loss": 0.0483, "step": 2227 }, { "epoch": 0.98, "grad_norm": 0.42923962920362985, "learning_rate": 4.666686089663808e-08, "loss": 0.0466, "step": 2228 }, { "epoch": 0.98, "grad_norm": 0.41745007499125764, "learning_rate": 4.474338086063901e-08, "loss": 0.0482, "step": 2229 }, { "epoch": 0.98, "grad_norm": 0.35254845646312166, "learning_rate": 4.286033534751211e-08, "loss": 0.0395, "step": 2230 }, { "epoch": 0.98, "grad_norm": 0.3673869666635836, "learning_rate": 4.1017728172796365e-08, "loss": 0.0373, "step": 2231 }, { "epoch": 0.98, "grad_norm": 0.36017244430840556, "learning_rate": 3.921556307009189e-08, "loss": 0.0356, "step": 2232 }, { "epoch": 0.98, "grad_norm": 0.388458595991975, "learning_rate": 3.7453843691048765e-08, "loss": 0.0332, "step": 2233 }, { "epoch": 0.98, "grad_norm": 0.35075233144862134, "learning_rate": 3.573257360537152e-08, "loss": 0.0492, "step": 2234 }, { "epoch": 0.98, "grad_norm": 0.35234142480324054, "learning_rate": 3.405175630079693e-08, "loss": 0.0359, "step": 2235 }, { "epoch": 0.98, "grad_norm": 0.36712597270625, "learning_rate": 3.2411395183098435e-08, "loss": 0.0415, "step": 2236 }, { "epoch": 0.98, "grad_norm": 0.3689134494154929, "learning_rate": 3.081149357607061e-08, "loss": 0.0402, "step": 2237 }, { "epoch": 0.98, "grad_norm": 0.3799972753991779, "learning_rate": 2.9252054721531364e-08, "loss": 0.0395, "step": 2238 }, { "epoch": 0.98, "grad_norm": 0.3917828375753828, "learning_rate": 2.7733081779304226e-08, "loss": 0.0427, "step": 2239 }, { "epoch": 0.98, "grad_norm": 0.3908160314749639, "learning_rate": 2.625457782722718e-08, "loss": 0.0467, "step": 2240 }, { "epoch": 0.98, "grad_norm": 0.408853605800495, "learning_rate": 2.481654586112825e-08, "loss": 0.0452, "step": 2241 }, { "epoch": 0.98, "grad_norm": 0.3944861320153985, "learning_rate": 2.3418988794836615e-08, "loss": 0.0376, "step": 2242 }, { "epoch": 0.99, "grad_norm": 0.3804063303226683, "learning_rate": 2.2061909460162623e-08, "loss": 0.0466, "step": 2243 }, { "epoch": 0.99, "grad_norm": 0.4218525527351027, "learning_rate": 2.0745310606899994e-08, "loss": 0.0388, "step": 2244 }, { "epoch": 0.99, "grad_norm": 0.4021741268777111, "learning_rate": 1.94691949028214e-08, "loss": 0.0483, "step": 2245 }, { "epoch": 0.99, "grad_norm": 0.3580160153578766, "learning_rate": 1.8233564933669568e-08, "loss": 0.0466, "step": 2246 }, { "epoch": 0.99, "grad_norm": 0.39321515603610774, "learning_rate": 1.7038423203146194e-08, "loss": 0.0368, "step": 2247 }, { "epoch": 0.99, "grad_norm": 0.3858969076330753, "learning_rate": 1.5883772132923026e-08, "loss": 0.0492, "step": 2248 }, { "epoch": 0.99, "grad_norm": 0.3922273007748656, "learning_rate": 1.4769614062619675e-08, "loss": 0.0333, "step": 2249 }, { "epoch": 0.99, "grad_norm": 0.4454848474640077, "learning_rate": 1.3695951249810269e-08, "loss": 0.0469, "step": 2250 }, { "epoch": 0.99, "grad_norm": 0.4161376774083763, "learning_rate": 1.266278587001235e-08, "loss": 0.0506, "step": 2251 }, { "epoch": 0.99, "grad_norm": 0.4436814738808929, "learning_rate": 1.1670120016689102e-08, "loss": 0.0608, "step": 2252 }, { "epoch": 0.99, "grad_norm": 0.40464986604065906, "learning_rate": 1.0717955701238236e-08, "loss": 0.0436, "step": 2253 }, { "epoch": 0.99, "grad_norm": 0.38543526716282994, "learning_rate": 9.806294852992004e-09, "loss": 0.0388, "step": 2254 }, { "epoch": 0.99, "grad_norm": 0.41113060160062825, "learning_rate": 8.935139319208308e-09, "loss": 0.0445, "step": 2255 }, { "epoch": 0.99, "grad_norm": 0.35077874452507535, "learning_rate": 8.104490865079585e-09, "loss": 0.0409, "step": 2256 }, { "epoch": 0.99, "grad_norm": 0.3675898113278081, "learning_rate": 7.314351173710599e-09, "loss": 0.0363, "step": 2257 }, { "epoch": 0.99, "grad_norm": 0.3716170385966182, "learning_rate": 6.564721846129552e-09, "loss": 0.0489, "step": 2258 }, { "epoch": 0.99, "grad_norm": 0.4317668022670095, "learning_rate": 5.855604401281412e-09, "loss": 0.0456, "step": 2259 }, { "epoch": 0.99, "grad_norm": 0.3690063595688973, "learning_rate": 5.187000276021259e-09, "loss": 0.0476, "step": 2260 }, { "epoch": 0.99, "grad_norm": 0.3588300387719545, "learning_rate": 4.558910825114282e-09, "loss": 0.0435, "step": 2261 }, { "epoch": 0.99, "grad_norm": 0.39710455811059664, "learning_rate": 3.97133732123356e-09, "loss": 0.0486, "step": 2262 }, { "epoch": 0.99, "grad_norm": 0.3786892220230853, "learning_rate": 3.424280954953396e-09, "loss": 0.0438, "step": 2263 }, { "epoch": 0.99, "grad_norm": 0.37573373061876114, "learning_rate": 2.917742834753767e-09, "loss": 0.0411, "step": 2264 }, { "epoch": 0.99, "grad_norm": 0.5451290871480735, "learning_rate": 2.451723987011434e-09, "loss": 0.0491, "step": 2265 }, { "epoch": 1.0, "grad_norm": 0.3930245513430096, "learning_rate": 2.0262253559999443e-09, "loss": 0.0338, "step": 2266 }, { "epoch": 1.0, "grad_norm": 0.4287001449860118, "learning_rate": 1.6412478038918544e-09, "loss": 0.0517, "step": 2267 }, { "epoch": 1.0, "grad_norm": 0.3922541026064373, "learning_rate": 1.296792110749845e-09, "loss": 0.0387, "step": 2268 }, { "epoch": 1.0, "grad_norm": 0.38646489298035486, "learning_rate": 9.928589745311634e-10, "loss": 0.0424, "step": 2269 }, { "epoch": 1.0, "grad_norm": 0.44154268029959864, "learning_rate": 7.294490110854036e-10, "loss": 0.0451, "step": 2270 }, { "epoch": 1.0, "grad_norm": 0.5360758878021937, "learning_rate": 5.065627541456231e-10, "loss": 0.0539, "step": 2271 }, { "epoch": 1.0, "grad_norm": 0.3922425159302, "learning_rate": 3.2420065533944613e-10, "loss": 0.0361, "step": 2272 }, { "epoch": 1.0, "grad_norm": 0.3551363386353439, "learning_rate": 1.823630841779611e-10, "loss": 0.0439, "step": 2273 }, { "epoch": 1.0, "grad_norm": 0.34980829003809644, "learning_rate": 8.105032806460245e-11, "loss": 0.036, "step": 2274 }, { "epoch": 1.0, "grad_norm": 0.39997250194207, "learning_rate": 2.026259227960736e-11, "loss": 0.0366, "step": 2275 }, { "epoch": 1.0, "grad_norm": 0.34213744498515103, "learning_rate": 0.0, "loss": 0.0346, "step": 2276 }, { "epoch": 1.0, "step": 2276, "total_flos": 4.4944666178721363e+21, "train_loss": 0.19645894658203467, "train_runtime": 8777.3981, "train_samples_per_second": 33.197, "train_steps_per_second": 0.259 } ], "logging_steps": 1.0, "max_steps": 2276, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.4944666178721363e+21, "train_batch_size": 4, "trial_name": null, "trial_params": null }