{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004357298474945534, "grad_norm": 0.9746105074882507, "learning_rate": 2.4000000000000003e-06, "loss": 1.7828, "step": 1 }, { "epoch": 0.008714596949891068, "grad_norm": 0.9253537654876709, "learning_rate": 4.800000000000001e-06, "loss": 1.8214, "step": 2 }, { "epoch": 0.013071895424836602, "grad_norm": 1.5627340078353882, "learning_rate": 7.2e-06, "loss": 1.8438, "step": 3 }, { "epoch": 0.017429193899782137, "grad_norm": 0.38599923253059387, "learning_rate": 9.600000000000001e-06, "loss": 1.7347, "step": 4 }, { "epoch": 0.02178649237472767, "grad_norm": 0.5854593515396118, "learning_rate": 1.2e-05, "loss": 1.5746, "step": 5 }, { "epoch": 0.026143790849673203, "grad_norm": 0.5655810236930847, "learning_rate": 1.44e-05, "loss": 1.6704, "step": 6 }, { "epoch": 0.030501089324618737, "grad_norm": 0.5204870700836182, "learning_rate": 1.6800000000000002e-05, "loss": 1.7218, "step": 7 }, { "epoch": 0.034858387799564274, "grad_norm": 0.3172350227832794, "learning_rate": 1.9200000000000003e-05, "loss": 1.7356, "step": 8 }, { "epoch": 0.0392156862745098, "grad_norm": 0.3486512005329132, "learning_rate": 2.16e-05, "loss": 1.6989, "step": 9 }, { "epoch": 0.04357298474945534, "grad_norm": 0.3535781800746918, "learning_rate": 2.4e-05, "loss": 1.8387, "step": 10 }, { "epoch": 0.04793028322440087, "grad_norm": 0.5419184565544128, "learning_rate": 2.64e-05, "loss": 1.6227, "step": 11 }, { "epoch": 0.05228758169934641, "grad_norm": 0.48862531781196594, "learning_rate": 2.88e-05, "loss": 1.7614, "step": 12 }, { "epoch": 0.05664488017429194, "grad_norm": 0.2664097547531128, "learning_rate": 3.12e-05, "loss": 1.6985, "step": 13 }, { "epoch": 0.06100217864923747, "grad_norm": 0.28547024726867676, "learning_rate": 3.3600000000000004e-05, "loss": 1.6369, "step": 14 }, { "epoch": 0.06535947712418301, "grad_norm": 0.7763476371765137, "learning_rate": 3.6e-05, "loss": 1.6875, "step": 15 }, { "epoch": 0.06971677559912855, "grad_norm": 0.40384408831596375, "learning_rate": 3.8400000000000005e-05, "loss": 1.5499, "step": 16 }, { "epoch": 0.07407407407407407, "grad_norm": 0.2866835594177246, "learning_rate": 4.08e-05, "loss": 1.7804, "step": 17 }, { "epoch": 0.0784313725490196, "grad_norm": 0.36669981479644775, "learning_rate": 4.32e-05, "loss": 1.6886, "step": 18 }, { "epoch": 0.08278867102396514, "grad_norm": 0.31070730090141296, "learning_rate": 4.5600000000000004e-05, "loss": 1.5843, "step": 19 }, { "epoch": 0.08714596949891068, "grad_norm": 0.2698839604854584, "learning_rate": 4.8e-05, "loss": 1.6562, "step": 20 }, { "epoch": 0.0915032679738562, "grad_norm": 0.3030396103858948, "learning_rate": 5.04e-05, "loss": 1.7683, "step": 21 }, { "epoch": 0.09586056644880174, "grad_norm": 0.34498634934425354, "learning_rate": 5.28e-05, "loss": 1.7619, "step": 22 }, { "epoch": 0.10021786492374728, "grad_norm": 0.315149188041687, "learning_rate": 5.520000000000001e-05, "loss": 1.6201, "step": 23 }, { "epoch": 0.10457516339869281, "grad_norm": 0.2636476159095764, "learning_rate": 5.76e-05, "loss": 1.6439, "step": 24 }, { "epoch": 0.10893246187363835, "grad_norm": 0.5684914588928223, "learning_rate": 6e-05, "loss": 1.823, "step": 25 }, { "epoch": 0.11328976034858387, "grad_norm": 0.3564894497394562, "learning_rate": 5.9999210388787696e-05, "loss": 1.6807, "step": 26 }, { "epoch": 0.11764705882352941, "grad_norm": 0.3562137186527252, "learning_rate": 5.999684159671647e-05, "loss": 1.7543, "step": 27 }, { "epoch": 0.12200435729847495, "grad_norm": 0.32931092381477356, "learning_rate": 5.999289374848134e-05, "loss": 1.7032, "step": 28 }, { "epoch": 0.12636165577342048, "grad_norm": 0.3593999743461609, "learning_rate": 5.998736705189997e-05, "loss": 1.5749, "step": 29 }, { "epoch": 0.13071895424836602, "grad_norm": 0.31765711307525635, "learning_rate": 5.998026179790181e-05, "loss": 1.5925, "step": 30 }, { "epoch": 0.13507625272331156, "grad_norm": 1.1995874643325806, "learning_rate": 5.9971578360512726e-05, "loss": 1.6874, "step": 31 }, { "epoch": 0.1394335511982571, "grad_norm": 0.3488420248031616, "learning_rate": 5.996131719683537e-05, "loss": 1.8334, "step": 32 }, { "epoch": 0.1437908496732026, "grad_norm": 1.2170417308807373, "learning_rate": 5.994947884702506e-05, "loss": 1.8111, "step": 33 }, { "epoch": 0.14814814814814814, "grad_norm": 7.140843868255615, "learning_rate": 5.993606393426137e-05, "loss": 1.7542, "step": 34 }, { "epoch": 0.15250544662309368, "grad_norm": 6.034117221832275, "learning_rate": 5.992107316471536e-05, "loss": 1.6763, "step": 35 }, { "epoch": 0.1568627450980392, "grad_norm": 24.70130729675293, "learning_rate": 5.990450732751232e-05, "loss": 1.7738, "step": 36 }, { "epoch": 0.16122004357298475, "grad_norm": 2.477736234664917, "learning_rate": 5.988636729469032e-05, "loss": 1.8413, "step": 37 }, { "epoch": 0.1655773420479303, "grad_norm": 0.5539727210998535, "learning_rate": 5.986665402115423e-05, "loss": 1.7711, "step": 38 }, { "epoch": 0.16993464052287582, "grad_norm": 0.651229739189148, "learning_rate": 5.9845368544625516e-05, "loss": 1.809, "step": 39 }, { "epoch": 0.17429193899782136, "grad_norm": 0.4992523491382599, "learning_rate": 5.9822511985587577e-05, "loss": 1.9164, "step": 40 }, { "epoch": 0.1786492374727669, "grad_norm": 0.5860515832901001, "learning_rate": 5.979808554722675e-05, "loss": 1.737, "step": 41 }, { "epoch": 0.1830065359477124, "grad_norm": 0.9775006175041199, "learning_rate": 5.977209051536903e-05, "loss": 1.6852, "step": 42 }, { "epoch": 0.18736383442265794, "grad_norm": 0.5079925656318665, "learning_rate": 5.974452825841231e-05, "loss": 1.6264, "step": 43 }, { "epoch": 0.19172113289760348, "grad_norm": 0.4939415752887726, "learning_rate": 5.97154002272544e-05, "loss": 1.6138, "step": 44 }, { "epoch": 0.19607843137254902, "grad_norm": 0.4919627606868744, "learning_rate": 5.968470795521663e-05, "loss": 1.6991, "step": 45 }, { "epoch": 0.20043572984749455, "grad_norm": 0.9349490404129028, "learning_rate": 5.965245305796316e-05, "loss": 1.7488, "step": 46 }, { "epoch": 0.2047930283224401, "grad_norm": 8.618950843811035, "learning_rate": 5.9618637233415866e-05, "loss": 1.7847, "step": 47 }, { "epoch": 0.20915032679738563, "grad_norm": 0.656200110912323, "learning_rate": 5.958326226166505e-05, "loss": 1.7535, "step": 48 }, { "epoch": 0.21350762527233116, "grad_norm": 0.49261513352394104, "learning_rate": 5.954633000487565e-05, "loss": 1.6509, "step": 49 }, { "epoch": 0.2178649237472767, "grad_norm": 0.7735104560852051, "learning_rate": 5.950784240718929e-05, "loss": 1.7014, "step": 50 }, { "epoch": 0.2222222222222222, "grad_norm": 0.5278734564781189, "learning_rate": 5.946780149462187e-05, "loss": 1.6646, "step": 51 }, { "epoch": 0.22657952069716775, "grad_norm": 10.276693344116211, "learning_rate": 5.942620937495696e-05, "loss": 1.9713, "step": 52 }, { "epoch": 0.23093681917211328, "grad_norm": 0.9025694727897644, "learning_rate": 5.938306823763481e-05, "loss": 1.7381, "step": 53 }, { "epoch": 0.23529411764705882, "grad_norm": 0.6549501419067383, "learning_rate": 5.933838035363717e-05, "loss": 1.5728, "step": 54 }, { "epoch": 0.23965141612200436, "grad_norm": 0.6484580039978027, "learning_rate": 5.9292148075367635e-05, "loss": 1.7029, "step": 55 }, { "epoch": 0.2440087145969499, "grad_norm": 0.5284441113471985, "learning_rate": 5.924437383652789e-05, "loss": 1.6064, "step": 56 }, { "epoch": 0.24836601307189543, "grad_norm": 0.4835830330848694, "learning_rate": 5.9195060151989595e-05, "loss": 1.6933, "step": 57 }, { "epoch": 0.25272331154684097, "grad_norm": 0.4144735336303711, "learning_rate": 5.914420961766194e-05, "loss": 1.7435, "step": 58 }, { "epoch": 0.2570806100217865, "grad_norm": 0.41090619564056396, "learning_rate": 5.909182491035509e-05, "loss": 1.6487, "step": 59 }, { "epoch": 0.26143790849673204, "grad_norm": 0.35188132524490356, "learning_rate": 5.9037908787639174e-05, "loss": 1.7431, "step": 60 }, { "epoch": 0.2657952069716776, "grad_norm": 0.5362684726715088, "learning_rate": 5.898246408769921e-05, "loss": 1.7848, "step": 61 }, { "epoch": 0.2701525054466231, "grad_norm": 0.37840691208839417, "learning_rate": 5.892549372918564e-05, "loss": 1.807, "step": 62 }, { "epoch": 0.27450980392156865, "grad_norm": 0.6966099143028259, "learning_rate": 5.8867000711060704e-05, "loss": 1.6143, "step": 63 }, { "epoch": 0.2788671023965142, "grad_norm": 1.4057034254074097, "learning_rate": 5.8806988112440624e-05, "loss": 1.5769, "step": 64 }, { "epoch": 0.28322440087145967, "grad_norm": 0.38457629084587097, "learning_rate": 5.8745459092433436e-05, "loss": 1.6125, "step": 65 }, { "epoch": 0.2875816993464052, "grad_norm": 0.3964170813560486, "learning_rate": 5.868241688997275e-05, "loss": 1.9211, "step": 66 }, { "epoch": 0.29193899782135074, "grad_norm": 0.4357544183731079, "learning_rate": 5.861786482364723e-05, "loss": 1.7145, "step": 67 }, { "epoch": 0.2962962962962963, "grad_norm": 0.4696052670478821, "learning_rate": 5.8551806291525885e-05, "loss": 1.5768, "step": 68 }, { "epoch": 0.3006535947712418, "grad_norm": 0.5432698130607605, "learning_rate": 5.848424477097924e-05, "loss": 1.7934, "step": 69 }, { "epoch": 0.30501089324618735, "grad_norm": 0.44644972681999207, "learning_rate": 5.8415183818496234e-05, "loss": 1.7114, "step": 70 }, { "epoch": 0.3093681917211329, "grad_norm": 0.5297689437866211, "learning_rate": 5.8344627069497025e-05, "loss": 1.6654, "step": 71 }, { "epoch": 0.3137254901960784, "grad_norm": 0.5777753591537476, "learning_rate": 5.827257823814162e-05, "loss": 1.7446, "step": 72 }, { "epoch": 0.31808278867102396, "grad_norm": 5.469757556915283, "learning_rate": 5.819904111713436e-05, "loss": 1.7395, "step": 73 }, { "epoch": 0.3224400871459695, "grad_norm": 0.5288973450660706, "learning_rate": 5.812401957752426e-05, "loss": 1.8596, "step": 74 }, { "epoch": 0.32679738562091504, "grad_norm": 1.6414694786071777, "learning_rate": 5.804751756850124e-05, "loss": 1.6706, "step": 75 }, { "epoch": 0.3311546840958606, "grad_norm": 0.9940863847732544, "learning_rate": 5.7969539117188256e-05, "loss": 1.7504, "step": 76 }, { "epoch": 0.3355119825708061, "grad_norm": 4.836872577667236, "learning_rate": 5.789008832842924e-05, "loss": 1.8491, "step": 77 }, { "epoch": 0.33986928104575165, "grad_norm": 1.7528856992721558, "learning_rate": 5.780916938457314e-05, "loss": 1.8635, "step": 78 }, { "epoch": 0.3442265795206972, "grad_norm": 1.1119537353515625, "learning_rate": 5.772678654525362e-05, "loss": 1.7444, "step": 79 }, { "epoch": 0.3485838779956427, "grad_norm": 1.9335803985595703, "learning_rate": 5.764294414716494e-05, "loss": 1.6888, "step": 80 }, { "epoch": 0.35294117647058826, "grad_norm": 2.479970932006836, "learning_rate": 5.755764660383358e-05, "loss": 1.5989, "step": 81 }, { "epoch": 0.3572984749455338, "grad_norm": 0.6857799887657166, "learning_rate": 5.7470898405386005e-05, "loss": 1.7211, "step": 82 }, { "epoch": 0.3616557734204793, "grad_norm": 0.5701134204864502, "learning_rate": 5.738270411831222e-05, "loss": 1.5853, "step": 83 }, { "epoch": 0.3660130718954248, "grad_norm": 1.7818999290466309, "learning_rate": 5.729306838522541e-05, "loss": 1.6756, "step": 84 }, { "epoch": 0.37037037037037035, "grad_norm": 0.6445415019989014, "learning_rate": 5.720199592461757e-05, "loss": 1.7658, "step": 85 }, { "epoch": 0.3747276688453159, "grad_norm": 0.7761105298995972, "learning_rate": 5.7109491530611106e-05, "loss": 1.6389, "step": 86 }, { "epoch": 0.3790849673202614, "grad_norm": 0.40940380096435547, "learning_rate": 5.701556007270647e-05, "loss": 1.6816, "step": 87 }, { "epoch": 0.38344226579520696, "grad_norm": 0.640964925289154, "learning_rate": 5.692020649552581e-05, "loss": 1.5405, "step": 88 }, { "epoch": 0.3877995642701525, "grad_norm": 0.5489770174026489, "learning_rate": 5.6823435818552696e-05, "loss": 1.7949, "step": 89 }, { "epoch": 0.39215686274509803, "grad_norm": 0.7703227400779724, "learning_rate": 5.6725253135867915e-05, "loss": 1.7931, "step": 90 }, { "epoch": 0.39651416122004357, "grad_norm": 0.5832772254943848, "learning_rate": 5.662566361588127e-05, "loss": 1.8679, "step": 91 }, { "epoch": 0.4008714596949891, "grad_norm": 0.8693870902061462, "learning_rate": 5.652467250105952e-05, "loss": 1.6827, "step": 92 }, { "epoch": 0.40522875816993464, "grad_norm": 2.1690566539764404, "learning_rate": 5.642228510765046e-05, "loss": 1.7037, "step": 93 }, { "epoch": 0.4095860566448802, "grad_norm": 1.1100988388061523, "learning_rate": 5.6318506825403015e-05, "loss": 1.6876, "step": 94 }, { "epoch": 0.4139433551198257, "grad_norm": 10.220698356628418, "learning_rate": 5.6213343117283506e-05, "loss": 1.8567, "step": 95 }, { "epoch": 0.41830065359477125, "grad_norm": 4.02601432800293, "learning_rate": 5.610679951918817e-05, "loss": 1.7431, "step": 96 }, { "epoch": 0.4226579520697168, "grad_norm": 9.665276527404785, "learning_rate": 5.5998881639651626e-05, "loss": 1.6413, "step": 97 }, { "epoch": 0.42701525054466233, "grad_norm": 16.3509578704834, "learning_rate": 5.588959515955173e-05, "loss": 2.2493, "step": 98 }, { "epoch": 0.43137254901960786, "grad_norm": 24.08732795715332, "learning_rate": 5.577894583181048e-05, "loss": 2.2252, "step": 99 }, { "epoch": 0.4357298474945534, "grad_norm": 17.57210350036621, "learning_rate": 5.566693948109122e-05, "loss": 1.7175, "step": 100 }, { "epoch": 0.4400871459694989, "grad_norm": 43.53101348876953, "learning_rate": 5.5553582003491944e-05, "loss": 1.9579, "step": 101 }, { "epoch": 0.4444444444444444, "grad_norm": 27.57504653930664, "learning_rate": 5.543887936623503e-05, "loss": 1.9186, "step": 102 }, { "epoch": 0.44880174291938996, "grad_norm": 18.363325119018555, "learning_rate": 5.532283760735302e-05, "loss": 1.8096, "step": 103 }, { "epoch": 0.4531590413943355, "grad_norm": 27.843307495117188, "learning_rate": 5.520546283537086e-05, "loss": 1.8707, "step": 104 }, { "epoch": 0.45751633986928103, "grad_norm": 11.314703941345215, "learning_rate": 5.508676122898427e-05, "loss": 1.5912, "step": 105 }, { "epoch": 0.46187363834422657, "grad_norm": 9.792496681213379, "learning_rate": 5.496673903673455e-05, "loss": 1.6825, "step": 106 }, { "epoch": 0.4662309368191721, "grad_norm": 11.959925651550293, "learning_rate": 5.484540257667961e-05, "loss": 1.7789, "step": 107 }, { "epoch": 0.47058823529411764, "grad_norm": 2.4276187419891357, "learning_rate": 5.47227582360614e-05, "loss": 1.6628, "step": 108 }, { "epoch": 0.4749455337690632, "grad_norm": 2.457439661026001, "learning_rate": 5.459881247096969e-05, "loss": 1.6083, "step": 109 }, { "epoch": 0.4793028322440087, "grad_norm": 19.251781463623047, "learning_rate": 5.447357180600219e-05, "loss": 1.6644, "step": 110 }, { "epoch": 0.48366013071895425, "grad_norm": 1.5374093055725098, "learning_rate": 5.434704283392115e-05, "loss": 1.6576, "step": 111 }, { "epoch": 0.4880174291938998, "grad_norm": 1.1958122253417969, "learning_rate": 5.421923221530622e-05, "loss": 1.5076, "step": 112 }, { "epoch": 0.4923747276688453, "grad_norm": 1.085802435874939, "learning_rate": 5.409014667820389e-05, "loss": 1.7292, "step": 113 }, { "epoch": 0.49673202614379086, "grad_norm": 1.2091395854949951, "learning_rate": 5.395979301777334e-05, "loss": 1.5753, "step": 114 }, { "epoch": 0.5010893246187363, "grad_norm": 2.773833990097046, "learning_rate": 5.3828178095928684e-05, "loss": 1.7763, "step": 115 }, { "epoch": 0.5054466230936819, "grad_norm": 0.7476174831390381, "learning_rate": 5.369530884097779e-05, "loss": 1.7815, "step": 116 }, { "epoch": 0.5098039215686274, "grad_norm": 0.7140160202980042, "learning_rate": 5.356119224725757e-05, "loss": 1.7438, "step": 117 }, { "epoch": 0.514161220043573, "grad_norm": 0.6035992503166199, "learning_rate": 5.3425835374765745e-05, "loss": 1.7846, "step": 118 }, { "epoch": 0.5185185185185185, "grad_norm": 3.483454465866089, "learning_rate": 5.328924534878927e-05, "loss": 1.8446, "step": 119 }, { "epoch": 0.5228758169934641, "grad_norm": 0.5726216435432434, "learning_rate": 5.3151429359529226e-05, "loss": 1.692, "step": 120 }, { "epoch": 0.5272331154684096, "grad_norm": 0.7607144117355347, "learning_rate": 5.3012394661722275e-05, "loss": 1.7304, "step": 121 }, { "epoch": 0.5315904139433552, "grad_norm": 0.586825966835022, "learning_rate": 5.2872148574258864e-05, "loss": 1.7993, "step": 122 }, { "epoch": 0.5359477124183006, "grad_norm": 0.4952510893344879, "learning_rate": 5.273069847979786e-05, "loss": 1.6976, "step": 123 }, { "epoch": 0.5403050108932462, "grad_norm": 0.5482230186462402, "learning_rate": 5.258805182437795e-05, "loss": 1.6702, "step": 124 }, { "epoch": 0.5446623093681917, "grad_norm": 1.4653637409210205, "learning_rate": 5.244421611702573e-05, "loss": 1.7064, "step": 125 }, { "epoch": 0.5490196078431373, "grad_norm": 0.7224717140197754, "learning_rate": 5.229919892936033e-05, "loss": 1.6133, "step": 126 }, { "epoch": 0.5533769063180828, "grad_norm": 2.9934468269348145, "learning_rate": 5.2153007895194915e-05, "loss": 1.8177, "step": 127 }, { "epoch": 0.5577342047930284, "grad_norm": 0.53426593542099, "learning_rate": 5.2005650710134804e-05, "loss": 1.775, "step": 128 }, { "epoch": 0.5620915032679739, "grad_norm": 0.5842443108558655, "learning_rate": 5.185713513117236e-05, "loss": 1.7364, "step": 129 }, { "epoch": 0.5664488017429193, "grad_norm": 0.7634817361831665, "learning_rate": 5.1707468976278674e-05, "loss": 1.6766, "step": 130 }, { "epoch": 0.5708061002178649, "grad_norm": 2.6095964908599854, "learning_rate": 5.1556660123992014e-05, "loss": 1.5793, "step": 131 }, { "epoch": 0.5751633986928104, "grad_norm": 0.6185954809188843, "learning_rate": 5.140471651300309e-05, "loss": 1.7295, "step": 132 }, { "epoch": 0.579520697167756, "grad_norm": 0.7229522466659546, "learning_rate": 5.125164614173716e-05, "loss": 1.6844, "step": 133 }, { "epoch": 0.5838779956427015, "grad_norm": 1.0191963911056519, "learning_rate": 5.109745706793299e-05, "loss": 1.6971, "step": 134 }, { "epoch": 0.5882352941176471, "grad_norm": 7.9863972663879395, "learning_rate": 5.094215740821867e-05, "loss": 1.6789, "step": 135 }, { "epoch": 0.5925925925925926, "grad_norm": 0.5257764458656311, "learning_rate": 5.07857553376844e-05, "loss": 1.6463, "step": 136 }, { "epoch": 0.5969498910675382, "grad_norm": 0.6847944855690002, "learning_rate": 5.062825908945204e-05, "loss": 1.7736, "step": 137 }, { "epoch": 0.6013071895424836, "grad_norm": 0.46256768703460693, "learning_rate": 5.0469676954241857e-05, "loss": 1.721, "step": 138 }, { "epoch": 0.6056644880174292, "grad_norm": 0.5772983431816101, "learning_rate": 5.0310017279935964e-05, "loss": 1.8956, "step": 139 }, { "epoch": 0.6100217864923747, "grad_norm": 1.1113687753677368, "learning_rate": 5.0149288471138975e-05, "loss": 1.8724, "step": 140 }, { "epoch": 0.6143790849673203, "grad_norm": 1.8465428352355957, "learning_rate": 4.9987498988735525e-05, "loss": 1.6231, "step": 141 }, { "epoch": 0.6187363834422658, "grad_norm": 0.6225913166999817, "learning_rate": 4.982465734944489e-05, "loss": 1.7372, "step": 142 }, { "epoch": 0.6230936819172114, "grad_norm": 0.6797170042991638, "learning_rate": 4.9660772125372715e-05, "loss": 1.6512, "step": 143 }, { "epoch": 0.6274509803921569, "grad_norm": 0.7226108312606812, "learning_rate": 4.949585194355966e-05, "loss": 1.7288, "step": 144 }, { "epoch": 0.6318082788671024, "grad_norm": 2.4343645572662354, "learning_rate": 4.9329905485527394e-05, "loss": 1.5985, "step": 145 }, { "epoch": 0.6361655773420479, "grad_norm": 0.8588735461235046, "learning_rate": 4.916294148682151e-05, "loss": 1.7933, "step": 146 }, { "epoch": 0.6405228758169934, "grad_norm": 0.9162130951881409, "learning_rate": 4.899496873655169e-05, "loss": 1.7511, "step": 147 }, { "epoch": 0.644880174291939, "grad_norm": 1.1275595426559448, "learning_rate": 4.882599607692908e-05, "loss": 1.7821, "step": 148 }, { "epoch": 0.6492374727668845, "grad_norm": 0.596320390701294, "learning_rate": 4.865603240280079e-05, "loss": 1.4973, "step": 149 }, { "epoch": 0.6535947712418301, "grad_norm": 2.232231855392456, "learning_rate": 4.8485086661181656e-05, "loss": 1.6625, "step": 150 }, { "epoch": 0.6579520697167756, "grad_norm": 3.6543140411376953, "learning_rate": 4.8313167850783304e-05, "loss": 1.7756, "step": 151 }, { "epoch": 0.6623093681917211, "grad_norm": 32.78754425048828, "learning_rate": 4.8140285021540423e-05, "loss": 1.7519, "step": 152 }, { "epoch": 0.6666666666666666, "grad_norm": 10.278705596923828, "learning_rate": 4.796644727413438e-05, "loss": 1.7621, "step": 153 }, { "epoch": 0.6710239651416122, "grad_norm": 6.011966705322266, "learning_rate": 4.779166375951412e-05, "loss": 1.7506, "step": 154 }, { "epoch": 0.6753812636165577, "grad_norm": 8.796587944030762, "learning_rate": 4.7615943678414515e-05, "loss": 1.8206, "step": 155 }, { "epoch": 0.6797385620915033, "grad_norm": 9.715092658996582, "learning_rate": 4.743929628087197e-05, "loss": 1.736, "step": 156 }, { "epoch": 0.6840958605664488, "grad_norm": 20.890012741088867, "learning_rate": 4.726173086573755e-05, "loss": 1.792, "step": 157 }, { "epoch": 0.6884531590413944, "grad_norm": 6.689200401306152, "learning_rate": 4.708325678018744e-05, "loss": 1.8754, "step": 158 }, { "epoch": 0.6928104575163399, "grad_norm": 10.521119117736816, "learning_rate": 4.6903883419230883e-05, "loss": 1.6421, "step": 159 }, { "epoch": 0.6971677559912854, "grad_norm": 2.2947447299957275, "learning_rate": 4.67236202252157e-05, "loss": 1.7782, "step": 160 }, { "epoch": 0.7015250544662309, "grad_norm": 2.0073139667510986, "learning_rate": 4.6542476687331165e-05, "loss": 1.7282, "step": 161 }, { "epoch": 0.7058823529411765, "grad_norm": 2.2821898460388184, "learning_rate": 4.636046234110852e-05, "loss": 1.7421, "step": 162 }, { "epoch": 0.710239651416122, "grad_norm": 0.8621974587440491, "learning_rate": 4.6177586767918984e-05, "loss": 1.7407, "step": 163 }, { "epoch": 0.7145969498910676, "grad_norm": 3.2683534622192383, "learning_rate": 4.599385959446945e-05, "loss": 1.8106, "step": 164 }, { "epoch": 0.7189542483660131, "grad_norm": 2.4297866821289062, "learning_rate": 4.5809290492295645e-05, "loss": 1.5172, "step": 165 }, { "epoch": 0.7233115468409586, "grad_norm": 3.033637046813965, "learning_rate": 4.5623889177253074e-05, "loss": 1.7526, "step": 166 }, { "epoch": 0.7276688453159041, "grad_norm": 1.2864795923233032, "learning_rate": 4.5437665409005564e-05, "loss": 1.7886, "step": 167 }, { "epoch": 0.7320261437908496, "grad_norm": 2.133450984954834, "learning_rate": 4.525062899051145e-05, "loss": 1.7397, "step": 168 }, { "epoch": 0.7363834422657952, "grad_norm": 0.7829226851463318, "learning_rate": 4.506278976750763e-05, "loss": 1.7315, "step": 169 }, { "epoch": 0.7407407407407407, "grad_norm": 0.8621643781661987, "learning_rate": 4.487415762799118e-05, "loss": 1.7408, "step": 170 }, { "epoch": 0.7450980392156863, "grad_norm": 0.6500158905982971, "learning_rate": 4.468474250169896e-05, "loss": 1.7465, "step": 171 }, { "epoch": 0.7494553376906318, "grad_norm": 0.5809956192970276, "learning_rate": 4.4494554359584785e-05, "loss": 1.7293, "step": 172 }, { "epoch": 0.7538126361655774, "grad_norm": 0.7333821058273315, "learning_rate": 4.430360321329463e-05, "loss": 1.727, "step": 173 }, { "epoch": 0.7581699346405228, "grad_norm": 0.5068430304527283, "learning_rate": 4.411189911463955e-05, "loss": 1.7167, "step": 174 }, { "epoch": 0.7625272331154684, "grad_norm": 0.6892985701560974, "learning_rate": 4.3919452155066614e-05, "loss": 1.5829, "step": 175 }, { "epoch": 0.7668845315904139, "grad_norm": 1.0960310697555542, "learning_rate": 4.372627246512761e-05, "loss": 1.7752, "step": 176 }, { "epoch": 0.7712418300653595, "grad_norm": 0.9808456897735596, "learning_rate": 4.3532370213945826e-05, "loss": 1.6206, "step": 177 }, { "epoch": 0.775599128540305, "grad_norm": 0.9465530514717102, "learning_rate": 4.333775560868071e-05, "loss": 1.7664, "step": 178 }, { "epoch": 0.7799564270152506, "grad_norm": 2.3225557804107666, "learning_rate": 4.3142438893990536e-05, "loss": 1.7564, "step": 179 }, { "epoch": 0.7843137254901961, "grad_norm": 0.833987295627594, "learning_rate": 4.294643035149318e-05, "loss": 1.7599, "step": 180 }, { "epoch": 0.7886710239651417, "grad_norm": 7.097892761230469, "learning_rate": 4.274974029922482e-05, "loss": 1.7297, "step": 181 }, { "epoch": 0.7930283224400871, "grad_norm": 1.9574705362319946, "learning_rate": 4.255237909109685e-05, "loss": 1.6733, "step": 182 }, { "epoch": 0.7973856209150327, "grad_norm": 5.98021125793457, "learning_rate": 4.235435711635076e-05, "loss": 1.7801, "step": 183 }, { "epoch": 0.8017429193899782, "grad_norm": 1.4386473894119263, "learning_rate": 4.2155684799011346e-05, "loss": 1.7036, "step": 184 }, { "epoch": 0.8061002178649237, "grad_norm": 1.5464740991592407, "learning_rate": 4.195637259733789e-05, "loss": 1.7301, "step": 185 }, { "epoch": 0.8104575163398693, "grad_norm": 2.3751494884490967, "learning_rate": 4.1756431003273664e-05, "loss": 1.7411, "step": 186 }, { "epoch": 0.8148148148148148, "grad_norm": 0.7723326086997986, "learning_rate": 4.155587054189365e-05, "loss": 1.6975, "step": 187 }, { "epoch": 0.8191721132897604, "grad_norm": 0.8427643775939941, "learning_rate": 4.135470177085042e-05, "loss": 1.7452, "step": 188 }, { "epoch": 0.8235294117647058, "grad_norm": 0.6999509930610657, "learning_rate": 4.11529352798185e-05, "loss": 1.8235, "step": 189 }, { "epoch": 0.8278867102396514, "grad_norm": 0.6350328326225281, "learning_rate": 4.0950581689936744e-05, "loss": 1.8013, "step": 190 }, { "epoch": 0.8322440087145969, "grad_norm": 0.5478256344795227, "learning_rate": 4.074765165324942e-05, "loss": 1.8239, "step": 191 }, { "epoch": 0.8366013071895425, "grad_norm": 0.7619976997375488, "learning_rate": 4.0544155852145334e-05, "loss": 1.6571, "step": 192 }, { "epoch": 0.840958605664488, "grad_norm": 0.6814029216766357, "learning_rate": 4.034010499879557e-05, "loss": 1.8015, "step": 193 }, { "epoch": 0.8453159041394336, "grad_norm": 1.5520319938659668, "learning_rate": 4.0135509834589566e-05, "loss": 1.7239, "step": 194 }, { "epoch": 0.8496732026143791, "grad_norm": 0.7724117040634155, "learning_rate": 3.9930381129569696e-05, "loss": 1.6609, "step": 195 }, { "epoch": 0.8540305010893247, "grad_norm": 1.1484628915786743, "learning_rate": 3.972472968186434e-05, "loss": 1.7557, "step": 196 }, { "epoch": 0.8583877995642701, "grad_norm": 1.2805622816085815, "learning_rate": 3.95185663171194e-05, "loss": 1.7035, "step": 197 }, { "epoch": 0.8627450980392157, "grad_norm": 0.890356183052063, "learning_rate": 3.931190188792853e-05, "loss": 1.6805, "step": 198 }, { "epoch": 0.8671023965141612, "grad_norm": 1.8623896837234497, "learning_rate": 3.910474727326175e-05, "loss": 1.6048, "step": 199 }, { "epoch": 0.8714596949891068, "grad_norm": 1.6572378873825073, "learning_rate": 3.8897113377892805e-05, "loss": 1.6146, "step": 200 }, { "epoch": 0.8758169934640523, "grad_norm": 0.8090646862983704, "learning_rate": 3.8689011131825186e-05, "loss": 1.6052, "step": 201 }, { "epoch": 0.8801742919389978, "grad_norm": 1.0492998361587524, "learning_rate": 3.8480451489716636e-05, "loss": 1.8196, "step": 202 }, { "epoch": 0.8845315904139434, "grad_norm": 0.5649470686912537, "learning_rate": 3.827144543030266e-05, "loss": 1.544, "step": 203 }, { "epoch": 0.8888888888888888, "grad_norm": 0.9283618330955505, "learning_rate": 3.806200395581841e-05, "loss": 1.6018, "step": 204 }, { "epoch": 0.8932461873638344, "grad_norm": 1.2240904569625854, "learning_rate": 3.785213809141969e-05, "loss": 1.8455, "step": 205 }, { "epoch": 0.8976034858387799, "grad_norm": 1.7425321340560913, "learning_rate": 3.764185888460246e-05, "loss": 1.9087, "step": 206 }, { "epoch": 0.9019607843137255, "grad_norm": 6.965508460998535, "learning_rate": 3.743117740462135e-05, "loss": 1.8282, "step": 207 }, { "epoch": 0.906318082788671, "grad_norm": 1.315513253211975, "learning_rate": 3.722010474190695e-05, "loss": 1.8563, "step": 208 }, { "epoch": 0.9106753812636166, "grad_norm": 0.5694398283958435, "learning_rate": 3.700865200748199e-05, "loss": 1.7312, "step": 209 }, { "epoch": 0.9150326797385621, "grad_norm": 1.3388159275054932, "learning_rate": 3.679683033237648e-05, "loss": 1.7612, "step": 210 }, { "epoch": 0.9193899782135077, "grad_norm": 1.2813358306884766, "learning_rate": 3.658465086704174e-05, "loss": 1.6871, "step": 211 }, { "epoch": 0.9237472766884531, "grad_norm": 0.9567870497703552, "learning_rate": 3.6372124780763396e-05, "loss": 1.731, "step": 212 }, { "epoch": 0.9281045751633987, "grad_norm": 1.2562271356582642, "learning_rate": 3.6159263261073515e-05, "loss": 1.6309, "step": 213 }, { "epoch": 0.9324618736383442, "grad_norm": 1.0945274829864502, "learning_rate": 3.59460775131616e-05, "loss": 1.8533, "step": 214 }, { "epoch": 0.9368191721132898, "grad_norm": 0.546955406665802, "learning_rate": 3.5732578759284776e-05, "loss": 1.6818, "step": 215 }, { "epoch": 0.9411764705882353, "grad_norm": 0.6648678183555603, "learning_rate": 3.551877823817702e-05, "loss": 1.8012, "step": 216 }, { "epoch": 0.9455337690631809, "grad_norm": 0.9706017374992371, "learning_rate": 3.53046872044576e-05, "loss": 1.766, "step": 217 }, { "epoch": 0.9498910675381264, "grad_norm": 0.685961127281189, "learning_rate": 3.5090316928038546e-05, "loss": 1.735, "step": 218 }, { "epoch": 0.954248366013072, "grad_norm": 1.0843380689620972, "learning_rate": 3.487567869353144e-05, "loss": 1.5208, "step": 219 }, { "epoch": 0.9586056644880174, "grad_norm": 0.7091177701950073, "learning_rate": 3.466078379965341e-05, "loss": 1.8162, "step": 220 }, { "epoch": 0.9629629629629629, "grad_norm": 0.535933792591095, "learning_rate": 3.4445643558632295e-05, "loss": 1.7281, "step": 221 }, { "epoch": 0.9673202614379085, "grad_norm": 0.6523289680480957, "learning_rate": 3.423026929561117e-05, "loss": 1.7759, "step": 222 }, { "epoch": 0.971677559912854, "grad_norm": 0.6564977765083313, "learning_rate": 3.401467234805227e-05, "loss": 1.5996, "step": 223 }, { "epoch": 0.9760348583877996, "grad_norm": 0.7078843712806702, "learning_rate": 3.379886406514004e-05, "loss": 1.6532, "step": 224 }, { "epoch": 0.9803921568627451, "grad_norm": 0.7516049742698669, "learning_rate": 3.358285580718383e-05, "loss": 1.5464, "step": 225 }, { "epoch": 0.9847494553376906, "grad_norm": 0.7365531325340271, "learning_rate": 3.336665894501977e-05, "loss": 1.7292, "step": 226 }, { "epoch": 0.9891067538126361, "grad_norm": 0.9807084202766418, "learning_rate": 3.315028485941233e-05, "loss": 1.4769, "step": 227 }, { "epoch": 0.9934640522875817, "grad_norm": 0.7513118386268616, "learning_rate": 3.293374494045507e-05, "loss": 1.8098, "step": 228 }, { "epoch": 0.9978213507625272, "grad_norm": 0.7991335988044739, "learning_rate": 3.271705058697122e-05, "loss": 1.7686, "step": 229 }, { "epoch": 1.0, "grad_norm": 1.0507432222366333, "learning_rate": 3.2500213205913506e-05, "loss": 0.9588, "step": 230 } ], "logging_steps": 1, "max_steps": 458, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 23, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.9962116695030497e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }