{ "best_metric": 2.4329843521118164, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.028667670035117897, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014333835017558947, "grad_norm": 4.095588207244873, "learning_rate": 2.9999999999999997e-05, "loss": 6.7525, "step": 1 }, { "epoch": 0.00014333835017558947, "eval_loss": 6.518734931945801, "eval_runtime": 4.4502, "eval_samples_per_second": 11.235, "eval_steps_per_second": 1.573, "step": 1 }, { "epoch": 0.00028667670035117894, "grad_norm": 3.9740452766418457, "learning_rate": 5.9999999999999995e-05, "loss": 6.4827, "step": 2 }, { "epoch": 0.00043001505052676843, "grad_norm": 4.090057373046875, "learning_rate": 8.999999999999999e-05, "loss": 6.803, "step": 3 }, { "epoch": 0.0005733534007023579, "grad_norm": 4.096340656280518, "learning_rate": 0.00011999999999999999, "loss": 6.3331, "step": 4 }, { "epoch": 0.0007166917508779474, "grad_norm": 3.8779053688049316, "learning_rate": 0.00015, "loss": 5.8767, "step": 5 }, { "epoch": 0.0008600301010535369, "grad_norm": 3.677866220474243, "learning_rate": 0.00017999999999999998, "loss": 5.0901, "step": 6 }, { "epoch": 0.0010033684512291263, "grad_norm": 3.940865993499756, "learning_rate": 0.00020999999999999998, "loss": 4.4307, "step": 7 }, { "epoch": 0.0011467068014047158, "grad_norm": 3.4478397369384766, "learning_rate": 0.00023999999999999998, "loss": 3.9003, "step": 8 }, { "epoch": 0.0012900451515803052, "grad_norm": 1.962295651435852, "learning_rate": 0.00027, "loss": 3.133, "step": 9 }, { "epoch": 0.0014333835017558947, "grad_norm": 2.3240482807159424, "learning_rate": 0.0003, "loss": 2.9516, "step": 10 }, { "epoch": 0.0015767218519314842, "grad_norm": 1.8678102493286133, "learning_rate": 0.0002999794957488703, "loss": 2.8182, "step": 11 }, { "epoch": 0.0017200602021070737, "grad_norm": 1.6835949420928955, "learning_rate": 0.0002999179886011389, "loss": 2.6626, "step": 12 }, { "epoch": 0.0018633985522826632, "grad_norm": 1.4440191984176636, "learning_rate": 0.0002998154953722457, "loss": 2.6, "step": 13 }, { "epoch": 0.0020067369024582525, "grad_norm": 3.3333661556243896, "learning_rate": 0.00029967204408281613, "loss": 2.6397, "step": 14 }, { "epoch": 0.0021500752526338422, "grad_norm": 2.430225372314453, "learning_rate": 0.00029948767395100045, "loss": 2.7674, "step": 15 }, { "epoch": 0.0022934136028094315, "grad_norm": 1.6387251615524292, "learning_rate": 0.0002992624353817517, "loss": 2.5239, "step": 16 }, { "epoch": 0.002436751952985021, "grad_norm": 1.1381381750106812, "learning_rate": 0.0002989963899530457, "loss": 2.5054, "step": 17 }, { "epoch": 0.0025800903031606105, "grad_norm": 1.326271414756775, "learning_rate": 0.00029868961039904624, "loss": 2.5822, "step": 18 }, { "epoch": 0.0027234286533362, "grad_norm": 1.0840177536010742, "learning_rate": 0.00029834218059022024, "loss": 2.4767, "step": 19 }, { "epoch": 0.0028667670035117895, "grad_norm": 1.4158834218978882, "learning_rate": 0.00029795419551040833, "loss": 2.484, "step": 20 }, { "epoch": 0.003010105353687379, "grad_norm": 1.3069770336151123, "learning_rate": 0.00029752576123085736, "loss": 2.4704, "step": 21 }, { "epoch": 0.0031534437038629685, "grad_norm": 0.8556431531906128, "learning_rate": 0.0002970569948812214, "loss": 2.5037, "step": 22 }, { "epoch": 0.003296782054038558, "grad_norm": 0.8674086332321167, "learning_rate": 0.0002965480246175399, "loss": 2.5533, "step": 23 }, { "epoch": 
0.0034401204042141475, "grad_norm": 0.8356068134307861, "learning_rate": 0.0002959989895872009, "loss": 2.5738, "step": 24 }, { "epoch": 0.003583458754389737, "grad_norm": 1.8884528875350952, "learning_rate": 0.0002954100398908995, "loss": 2.4843, "step": 25 }, { "epoch": 0.003583458754389737, "eval_loss": 2.4727981090545654, "eval_runtime": 4.572, "eval_samples_per_second": 10.936, "eval_steps_per_second": 1.531, "step": 25 }, { "epoch": 0.0037267971045653265, "grad_norm": 0.7543926239013672, "learning_rate": 0.0002947813365416023, "loss": 2.4307, "step": 26 }, { "epoch": 0.0038701354547409157, "grad_norm": 0.5183073282241821, "learning_rate": 0.0002941130514205272, "loss": 2.4546, "step": 27 }, { "epoch": 0.004013473804916505, "grad_norm": 2.17268443107605, "learning_rate": 0.0002934053672301536, "loss": 2.4966, "step": 28 }, { "epoch": 0.004156812155092095, "grad_norm": 0.5066993236541748, "learning_rate": 0.00029265847744427303, "loss": 2.4952, "step": 29 }, { "epoch": 0.0043001505052676844, "grad_norm": 0.4776642918586731, "learning_rate": 0.00029187258625509513, "loss": 2.5126, "step": 30 }, { "epoch": 0.004443488855443274, "grad_norm": 0.6363259553909302, "learning_rate": 0.00029104790851742417, "loss": 2.5075, "step": 31 }, { "epoch": 0.004586827205618863, "grad_norm": 0.4743508994579315, "learning_rate": 0.0002901846696899191, "loss": 2.4578, "step": 32 }, { "epoch": 0.004730165555794453, "grad_norm": 0.3669068217277527, "learning_rate": 0.00028928310577345606, "loss": 2.4547, "step": 33 }, { "epoch": 0.004873503905970042, "grad_norm": 0.48330530524253845, "learning_rate": 0.0002883434632466077, "loss": 2.6293, "step": 34 }, { "epoch": 0.005016842256145632, "grad_norm": 0.44809746742248535, "learning_rate": 0.00028736599899825856, "loss": 2.4301, "step": 35 }, { "epoch": 0.005160180606321221, "grad_norm": 0.5167154669761658, "learning_rate": 0.00028635098025737434, "loss": 2.5549, "step": 36 }, { "epoch": 0.005303518956496811, "grad_norm": 0.5323503613471985, "learning_rate": 0.00028529868451994384, "loss": 2.4602, "step": 37 }, { "epoch": 0.0054468573066724, "grad_norm": 0.49367713928222656, "learning_rate": 0.0002842093994731145, "loss": 2.3723, "step": 38 }, { "epoch": 0.005590195656847989, "grad_norm": 0.4007982909679413, "learning_rate": 0.00028308342291654174, "loss": 2.404, "step": 39 }, { "epoch": 0.005733534007023579, "grad_norm": 0.5526387095451355, "learning_rate": 0.00028192106268097334, "loss": 2.5208, "step": 40 }, { "epoch": 0.005876872357199169, "grad_norm": 0.6063050627708435, "learning_rate": 0.00028072263654409154, "loss": 2.5084, "step": 41 }, { "epoch": 0.006020210707374758, "grad_norm": 0.5957720875740051, "learning_rate": 0.0002794884721436361, "loss": 2.4823, "step": 42 }, { "epoch": 0.006163549057550347, "grad_norm": 0.4266456663608551, "learning_rate": 0.00027821890688783083, "loss": 2.4275, "step": 43 }, { "epoch": 0.006306887407725937, "grad_norm": 0.6097196340560913, "learning_rate": 0.0002769142878631403, "loss": 2.4463, "step": 44 }, { "epoch": 0.006450225757901527, "grad_norm": 0.4464048445224762, "learning_rate": 0.00027557497173937923, "loss": 2.441, "step": 45 }, { "epoch": 0.006593564108077116, "grad_norm": 0.7037805914878845, "learning_rate": 0.000274201324672203, "loss": 2.5366, "step": 46 }, { "epoch": 0.006736902458252705, "grad_norm": 0.5217775702476501, "learning_rate": 0.00027279372220300385, "loss": 2.4081, "step": 47 }, { "epoch": 0.006880240808428295, "grad_norm": 0.4792337417602539, "learning_rate": 0.0002713525491562421, "loss": 
2.5082, "step": 48 }, { "epoch": 0.007023579158603885, "grad_norm": 0.3848729729652405, "learning_rate": 0.00026987819953423867, "loss": 2.5108, "step": 49 }, { "epoch": 0.007166917508779474, "grad_norm": 0.4266701638698578, "learning_rate": 0.00026837107640945905, "loss": 2.4282, "step": 50 }, { "epoch": 0.007166917508779474, "eval_loss": 2.515470027923584, "eval_runtime": 4.5689, "eval_samples_per_second": 10.944, "eval_steps_per_second": 1.532, "step": 50 }, { "epoch": 0.007310255858955063, "grad_norm": 0.4437111020088196, "learning_rate": 0.0002668315918143169, "loss": 2.3658, "step": 51 }, { "epoch": 0.007453594209130653, "grad_norm": 0.3480053246021271, "learning_rate": 0.00026526016662852886, "loss": 2.4018, "step": 52 }, { "epoch": 0.007596932559306243, "grad_norm": 0.45664915442466736, "learning_rate": 0.00026365723046405023, "loss": 2.4212, "step": 53 }, { "epoch": 0.0077402709094818315, "grad_norm": 0.30473992228507996, "learning_rate": 0.0002620232215476231, "loss": 2.3651, "step": 54 }, { "epoch": 0.007883609259657422, "grad_norm": 0.3738688826560974, "learning_rate": 0.0002603585866009697, "loss": 2.3824, "step": 55 }, { "epoch": 0.00802694760983301, "grad_norm": 0.4079970717430115, "learning_rate": 0.00025866378071866334, "loss": 2.5106, "step": 56 }, { "epoch": 0.0081702859600086, "grad_norm": 0.6227981448173523, "learning_rate": 0.00025693926724370956, "loss": 2.5133, "step": 57 }, { "epoch": 0.00831362431018419, "grad_norm": 0.5902373194694519, "learning_rate": 0.00025518551764087326, "loss": 2.4108, "step": 58 }, { "epoch": 0.00845696266035978, "grad_norm": 0.3553559482097626, "learning_rate": 0.00025340301136778483, "loss": 2.4769, "step": 59 }, { "epoch": 0.008600301010535369, "grad_norm": 0.30603644251823425, "learning_rate": 0.00025159223574386114, "loss": 2.3449, "step": 60 }, { "epoch": 0.008743639360710959, "grad_norm": 0.38652244210243225, "learning_rate": 0.0002497536858170772, "loss": 2.4605, "step": 61 }, { "epoch": 0.008886977710886548, "grad_norm": 0.8515211939811707, "learning_rate": 0.00024788786422862526, "loss": 2.5039, "step": 62 }, { "epoch": 0.009030316061062138, "grad_norm": 0.48116958141326904, "learning_rate": 0.00024599528107549745, "loss": 2.4173, "step": 63 }, { "epoch": 0.009173654411237726, "grad_norm": 0.394767701625824, "learning_rate": 0.00024407645377103054, "loss": 2.4306, "step": 64 }, { "epoch": 0.009316992761413316, "grad_norm": 0.3945147693157196, "learning_rate": 0.00024213190690345018, "loss": 2.4364, "step": 65 }, { "epoch": 0.009460331111588905, "grad_norm": 3.018016815185547, "learning_rate": 0.00024016217209245374, "loss": 2.458, "step": 66 }, { "epoch": 0.009603669461764495, "grad_norm": 0.4277852773666382, "learning_rate": 0.00023816778784387094, "loss": 2.5136, "step": 67 }, { "epoch": 0.009747007811940085, "grad_norm": 0.4447251856327057, "learning_rate": 0.0002361492994024415, "loss": 2.4805, "step": 68 }, { "epoch": 0.009890346162115675, "grad_norm": 0.48989495635032654, "learning_rate": 0.0002341072586027509, "loss": 2.4012, "step": 69 }, { "epoch": 0.010033684512291264, "grad_norm": 1.8352046012878418, "learning_rate": 0.00023204222371836405, "loss": 2.5, "step": 70 }, { "epoch": 0.010177022862466852, "grad_norm": 0.4361478388309479, "learning_rate": 0.00022995475930919905, "loss": 2.3609, "step": 71 }, { "epoch": 0.010320361212642442, "grad_norm": 0.3762364387512207, "learning_rate": 0.00022784543606718227, "loss": 2.4299, "step": 72 }, { "epoch": 0.010463699562818032, "grad_norm": 0.5353628993034363, "learning_rate": 
0.00022571483066022657, "loss": 2.4636, "step": 73 }, { "epoch": 0.010607037912993621, "grad_norm": 0.4717017710208893, "learning_rate": 0.0002235635255745762, "loss": 2.3873, "step": 74 }, { "epoch": 0.010750376263169211, "grad_norm": 0.4951079189777374, "learning_rate": 0.00022139210895556104, "loss": 2.3833, "step": 75 }, { "epoch": 0.010750376263169211, "eval_loss": 2.5027923583984375, "eval_runtime": 4.575, "eval_samples_per_second": 10.929, "eval_steps_per_second": 1.53, "step": 75 }, { "epoch": 0.0108937146133448, "grad_norm": 1.1163285970687866, "learning_rate": 0.00021920117444680317, "loss": 2.4308, "step": 76 }, { "epoch": 0.01103705296352039, "grad_norm": 0.5007464289665222, "learning_rate": 0.00021699132102792097, "loss": 2.4557, "step": 77 }, { "epoch": 0.011180391313695979, "grad_norm": 0.48471787571907043, "learning_rate": 0.0002147631528507739, "loss": 2.4868, "step": 78 }, { "epoch": 0.011323729663871568, "grad_norm": 0.4553197920322418, "learning_rate": 0.00021251727907429355, "loss": 2.4538, "step": 79 }, { "epoch": 0.011467068014047158, "grad_norm": 0.49402543902397156, "learning_rate": 0.0002102543136979454, "loss": 2.4061, "step": 80 }, { "epoch": 0.011610406364222748, "grad_norm": 0.43397530913352966, "learning_rate": 0.0002079748753938678, "loss": 2.5284, "step": 81 }, { "epoch": 0.011753744714398337, "grad_norm": 0.39252761006355286, "learning_rate": 0.0002056795873377331, "loss": 2.4795, "step": 82 }, { "epoch": 0.011897083064573927, "grad_norm": 0.4337272644042969, "learning_rate": 0.00020336907703837748, "loss": 2.4098, "step": 83 }, { "epoch": 0.012040421414749517, "grad_norm": 0.5519135594367981, "learning_rate": 0.00020104397616624645, "loss": 2.4282, "step": 84 }, { "epoch": 0.012183759764925106, "grad_norm": 0.46673280000686646, "learning_rate": 0.00019870492038070252, "loss": 2.483, "step": 85 }, { "epoch": 0.012327098115100694, "grad_norm": 0.42640626430511475, "learning_rate": 0.0001963525491562421, "loss": 2.5023, "step": 86 }, { "epoch": 0.012470436465276284, "grad_norm": 0.4718412458896637, "learning_rate": 0.0001939875056076697, "loss": 2.5087, "step": 87 }, { "epoch": 0.012613774815451874, "grad_norm": 0.5302379727363586, "learning_rate": 0.00019161043631427666, "loss": 2.4402, "step": 88 }, { "epoch": 0.012757113165627464, "grad_norm": 0.3727835416793823, "learning_rate": 0.00018922199114307294, "loss": 2.4963, "step": 89 }, { "epoch": 0.012900451515803053, "grad_norm": 0.9599354863166809, "learning_rate": 0.00018682282307111987, "loss": 2.437, "step": 90 }, { "epoch": 0.013043789865978643, "grad_norm": 0.40621206164360046, "learning_rate": 0.00018441358800701273, "loss": 2.3686, "step": 91 }, { "epoch": 0.013187128216154233, "grad_norm": 0.40959206223487854, "learning_rate": 0.00018199494461156203, "loss": 2.4784, "step": 92 }, { "epoch": 0.01333046656632982, "grad_norm": 0.4783549904823303, "learning_rate": 0.000179567554117722, "loss": 2.5026, "step": 93 }, { "epoch": 0.01347380491650541, "grad_norm": 0.4089532792568207, "learning_rate": 0.00017713208014981648, "loss": 2.4641, "step": 94 }, { "epoch": 0.013617143266681, "grad_norm": 0.40697500109672546, "learning_rate": 0.00017468918854211007, "loss": 2.4023, "step": 95 }, { "epoch": 0.01376048161685659, "grad_norm": 0.44492053985595703, "learning_rate": 0.00017223954715677627, "loss": 2.4689, "step": 96 }, { "epoch": 0.01390381996703218, "grad_norm": 0.4386192560195923, "learning_rate": 0.00016978382570131034, "loss": 2.3685, "step": 97 }, { "epoch": 0.01404715831720777, "grad_norm": 
0.9035962820053101, "learning_rate": 0.00016732269554543794, "loss": 2.434, "step": 98 }, { "epoch": 0.014190496667383359, "grad_norm": 0.5646589994430542, "learning_rate": 0.00016485682953756942, "loss": 2.5078, "step": 99 }, { "epoch": 0.014333835017558949, "grad_norm": 0.6752585172653198, "learning_rate": 0.00016238690182084986, "loss": 2.3888, "step": 100 }, { "epoch": 0.014333835017558949, "eval_loss": 2.4761135578155518, "eval_runtime": 4.5655, "eval_samples_per_second": 10.952, "eval_steps_per_second": 1.533, "step": 100 }, { "epoch": 0.014477173367734537, "grad_norm": 0.6777223348617554, "learning_rate": 0.0001599135876488549, "loss": 2.4187, "step": 101 }, { "epoch": 0.014620511717910126, "grad_norm": 0.6800750494003296, "learning_rate": 0.00015743756320098332, "loss": 2.4502, "step": 102 }, { "epoch": 0.014763850068085716, "grad_norm": 0.52974534034729, "learning_rate": 0.0001549595053975962, "loss": 2.4709, "step": 103 }, { "epoch": 0.014907188418261306, "grad_norm": 0.502795934677124, "learning_rate": 0.00015248009171495378, "loss": 2.3035, "step": 104 }, { "epoch": 0.015050526768436896, "grad_norm": 0.6192975044250488, "learning_rate": 0.00015, "loss": 2.6338, "step": 105 }, { "epoch": 0.015193865118612485, "grad_norm": 0.4301483929157257, "learning_rate": 0.00014751990828504622, "loss": 2.3471, "step": 106 }, { "epoch": 0.015337203468788075, "grad_norm": 0.4026905596256256, "learning_rate": 0.00014504049460240375, "loss": 2.4869, "step": 107 }, { "epoch": 0.015480541818963663, "grad_norm": 0.33096885681152344, "learning_rate": 0.00014256243679901663, "loss": 2.3908, "step": 108 }, { "epoch": 0.015623880169139253, "grad_norm": 0.3435579836368561, "learning_rate": 0.00014008641235114508, "loss": 2.4443, "step": 109 }, { "epoch": 0.015767218519314844, "grad_norm": 0.5849462151527405, "learning_rate": 0.00013761309817915014, "loss": 2.357, "step": 110 }, { "epoch": 0.015910556869490434, "grad_norm": 0.415574848651886, "learning_rate": 0.00013514317046243058, "loss": 2.4204, "step": 111 }, { "epoch": 0.01605389521966602, "grad_norm": 0.5506088137626648, "learning_rate": 0.00013267730445456208, "loss": 2.4529, "step": 112 }, { "epoch": 0.01619723356984161, "grad_norm": 0.4681348204612732, "learning_rate": 0.00013021617429868963, "loss": 2.4031, "step": 113 }, { "epoch": 0.0163405719200172, "grad_norm": 0.33748188614845276, "learning_rate": 0.00012776045284322368, "loss": 2.3714, "step": 114 }, { "epoch": 0.01648391027019279, "grad_norm": 0.36723262071609497, "learning_rate": 0.00012531081145788987, "loss": 2.4555, "step": 115 }, { "epoch": 0.01662724862036838, "grad_norm": 0.454904168844223, "learning_rate": 0.00012286791985018355, "loss": 2.4527, "step": 116 }, { "epoch": 0.01677058697054397, "grad_norm": 0.4296797215938568, "learning_rate": 0.00012043244588227796, "loss": 2.4918, "step": 117 }, { "epoch": 0.01691392532071956, "grad_norm": 0.433634877204895, "learning_rate": 0.00011800505538843798, "loss": 2.5004, "step": 118 }, { "epoch": 0.017057263670895148, "grad_norm": 0.3770894706249237, "learning_rate": 0.00011558641199298727, "loss": 2.4621, "step": 119 }, { "epoch": 0.017200602021070738, "grad_norm": 0.44861501455307007, "learning_rate": 0.00011317717692888012, "loss": 2.3952, "step": 120 }, { "epoch": 0.017343940371246327, "grad_norm": 0.4860612154006958, "learning_rate": 0.00011077800885692702, "loss": 2.3897, "step": 121 }, { "epoch": 0.017487278721421917, "grad_norm": 0.3798341751098633, "learning_rate": 0.00010838956368572334, "loss": 2.4805, "step": 122 }, { 
"epoch": 0.017630617071597507, "grad_norm": 0.3528059720993042, "learning_rate": 0.0001060124943923303, "loss": 2.3983, "step": 123 }, { "epoch": 0.017773955421773097, "grad_norm": 0.37472400069236755, "learning_rate": 0.0001036474508437579, "loss": 2.3448, "step": 124 }, { "epoch": 0.017917293771948686, "grad_norm": 0.450785756111145, "learning_rate": 0.00010129507961929748, "loss": 2.4047, "step": 125 }, { "epoch": 0.017917293771948686, "eval_loss": 2.4208545684814453, "eval_runtime": 4.5669, "eval_samples_per_second": 10.948, "eval_steps_per_second": 1.533, "step": 125 }, { "epoch": 0.018060632122124276, "grad_norm": 0.42055612802505493, "learning_rate": 9.895602383375353e-05, "loss": 2.3805, "step": 126 }, { "epoch": 0.018203970472299862, "grad_norm": 0.4635365605354309, "learning_rate": 9.663092296162251e-05, "loss": 2.3714, "step": 127 }, { "epoch": 0.018347308822475452, "grad_norm": 0.36117833852767944, "learning_rate": 9.432041266226686e-05, "loss": 2.4163, "step": 128 }, { "epoch": 0.01849064717265104, "grad_norm": 0.3636980950832367, "learning_rate": 9.202512460613219e-05, "loss": 2.362, "step": 129 }, { "epoch": 0.01863398552282663, "grad_norm": 0.4152284860610962, "learning_rate": 8.97456863020546e-05, "loss": 2.4751, "step": 130 }, { "epoch": 0.01877732387300222, "grad_norm": 0.5936709642410278, "learning_rate": 8.748272092570646e-05, "loss": 2.4146, "step": 131 }, { "epoch": 0.01892066222317781, "grad_norm": 0.43432939052581787, "learning_rate": 8.523684714922608e-05, "loss": 2.3515, "step": 132 }, { "epoch": 0.0190640005733534, "grad_norm": 0.5816676020622253, "learning_rate": 8.300867897207903e-05, "loss": 2.4006, "step": 133 }, { "epoch": 0.01920733892352899, "grad_norm": 0.4577726423740387, "learning_rate": 8.079882555319684e-05, "loss": 2.4041, "step": 134 }, { "epoch": 0.01935067727370458, "grad_norm": 0.5329848527908325, "learning_rate": 7.860789104443896e-05, "loss": 2.4021, "step": 135 }, { "epoch": 0.01949401562388017, "grad_norm": 0.4278225302696228, "learning_rate": 7.643647442542382e-05, "loss": 2.3308, "step": 136 }, { "epoch": 0.01963735397405576, "grad_norm": 0.42905911803245544, "learning_rate": 7.428516933977347e-05, "loss": 2.4348, "step": 137 }, { "epoch": 0.01978069232423135, "grad_norm": 0.4884713292121887, "learning_rate": 7.215456393281776e-05, "loss": 2.3535, "step": 138 }, { "epoch": 0.01992403067440694, "grad_norm": 0.5074369311332703, "learning_rate": 7.004524069080096e-05, "loss": 2.4361, "step": 139 }, { "epoch": 0.02006736902458253, "grad_norm": 0.5184245109558105, "learning_rate": 6.795777628163599e-05, "loss": 2.48, "step": 140 }, { "epoch": 0.02021070737475812, "grad_norm": 0.4724279046058655, "learning_rate": 6.58927413972491e-05, "loss": 2.3833, "step": 141 }, { "epoch": 0.020354045724933705, "grad_norm": 0.42493098974227905, "learning_rate": 6.385070059755846e-05, "loss": 2.4109, "step": 142 }, { "epoch": 0.020497384075109294, "grad_norm": 0.46368831396102905, "learning_rate": 6.183221215612904e-05, "loss": 2.403, "step": 143 }, { "epoch": 0.020640722425284884, "grad_norm": 0.5204209685325623, "learning_rate": 5.983782790754623e-05, "loss": 2.4407, "step": 144 }, { "epoch": 0.020784060775460474, "grad_norm": 0.5628048777580261, "learning_rate": 5.786809309654982e-05, "loss": 2.4423, "step": 145 }, { "epoch": 0.020927399125636063, "grad_norm": 0.43388831615448, "learning_rate": 5.592354622896944e-05, "loss": 2.4248, "step": 146 }, { "epoch": 0.021070737475811653, "grad_norm": 0.43663108348846436, "learning_rate": 5.40047189245025e-05, 
"loss": 2.4316, "step": 147 }, { "epoch": 0.021214075825987243, "grad_norm": 0.4974912703037262, "learning_rate": 5.211213577137469e-05, "loss": 2.4239, "step": 148 }, { "epoch": 0.021357414176162832, "grad_norm": 0.48720306158065796, "learning_rate": 5.024631418292274e-05, "loss": 2.4115, "step": 149 }, { "epoch": 0.021500752526338422, "grad_norm": 0.4307795763015747, "learning_rate": 4.840776425613886e-05, "loss": 2.4406, "step": 150 }, { "epoch": 0.021500752526338422, "eval_loss": 2.442666530609131, "eval_runtime": 4.5664, "eval_samples_per_second": 10.95, "eval_steps_per_second": 1.533, "step": 150 }, { "epoch": 0.021644090876514012, "grad_norm": 0.42068323493003845, "learning_rate": 4.659698863221513e-05, "loss": 2.343, "step": 151 }, { "epoch": 0.0217874292266896, "grad_norm": 0.436985045671463, "learning_rate": 4.481448235912671e-05, "loss": 2.3775, "step": 152 }, { "epoch": 0.02193076757686519, "grad_norm": 0.43744659423828125, "learning_rate": 4.306073275629044e-05, "loss": 2.366, "step": 153 }, { "epoch": 0.02207410592704078, "grad_norm": 0.42853182554244995, "learning_rate": 4.133621928133665e-05, "loss": 2.3583, "step": 154 }, { "epoch": 0.02221744427721637, "grad_norm": 0.43902191519737244, "learning_rate": 3.964141339903026e-05, "loss": 2.3905, "step": 155 }, { "epoch": 0.022360782627391957, "grad_norm": 0.46048280596733093, "learning_rate": 3.797677845237696e-05, "loss": 2.4043, "step": 156 }, { "epoch": 0.022504120977567547, "grad_norm": 0.3899269998073578, "learning_rate": 3.634276953594982e-05, "loss": 2.3547, "step": 157 }, { "epoch": 0.022647459327743136, "grad_norm": 0.37371909618377686, "learning_rate": 3.473983337147118e-05, "loss": 2.3287, "step": 158 }, { "epoch": 0.022790797677918726, "grad_norm": 0.404742956161499, "learning_rate": 3.316840818568315e-05, "loss": 2.3772, "step": 159 }, { "epoch": 0.022934136028094316, "grad_norm": 0.38302767276763916, "learning_rate": 3.162892359054098e-05, "loss": 2.3533, "step": 160 }, { "epoch": 0.023077474378269906, "grad_norm": 0.3811044991016388, "learning_rate": 3.0121800465761293e-05, "loss": 2.3194, "step": 161 }, { "epoch": 0.023220812728445495, "grad_norm": 0.5596585869789124, "learning_rate": 2.8647450843757897e-05, "loss": 2.4266, "step": 162 }, { "epoch": 0.023364151078621085, "grad_norm": 0.5400771498680115, "learning_rate": 2.7206277796996144e-05, "loss": 2.3642, "step": 163 }, { "epoch": 0.023507489428796675, "grad_norm": 0.4592722952365875, "learning_rate": 2.5798675327796993e-05, "loss": 2.327, "step": 164 }, { "epoch": 0.023650827778972264, "grad_norm": 0.47848427295684814, "learning_rate": 2.4425028260620715e-05, "loss": 2.4503, "step": 165 }, { "epoch": 0.023794166129147854, "grad_norm": 0.45060548186302185, "learning_rate": 2.3085712136859668e-05, "loss": 2.3749, "step": 166 }, { "epoch": 0.023937504479323444, "grad_norm": 0.4228126108646393, "learning_rate": 2.178109311216913e-05, "loss": 2.4633, "step": 167 }, { "epoch": 0.024080842829499034, "grad_norm": 0.5396104454994202, "learning_rate": 2.0511527856363912e-05, "loss": 2.4974, "step": 168 }, { "epoch": 0.024224181179674623, "grad_norm": 0.47498688101768494, "learning_rate": 1.927736345590839e-05, "loss": 2.5275, "step": 169 }, { "epoch": 0.024367519529850213, "grad_norm": 0.5508636832237244, "learning_rate": 1.8078937319026654e-05, "loss": 2.4751, "step": 170 }, { "epoch": 0.0245108578800258, "grad_norm": 0.4817218780517578, "learning_rate": 1.6916577083458228e-05, "loss": 2.5245, "step": 171 }, { "epoch": 0.02465419623020139, "grad_norm": 
0.4276881217956543, "learning_rate": 1.579060052688548e-05, "loss": 2.403, "step": 172 }, { "epoch": 0.02479753458037698, "grad_norm": 0.4423224627971649, "learning_rate": 1.4701315480056164e-05, "loss": 2.4871, "step": 173 }, { "epoch": 0.02494087293055257, "grad_norm": 0.4730720818042755, "learning_rate": 1.3649019742625623e-05, "loss": 2.4419, "step": 174 }, { "epoch": 0.025084211280728158, "grad_norm": 0.4102948009967804, "learning_rate": 1.2634001001741373e-05, "loss": 2.3216, "step": 175 }, { "epoch": 0.025084211280728158, "eval_loss": 2.4315061569213867, "eval_runtime": 4.5667, "eval_samples_per_second": 10.949, "eval_steps_per_second": 1.533, "step": 175 }, { "epoch": 0.025227549630903748, "grad_norm": 0.43688416481018066, "learning_rate": 1.1656536753392287e-05, "loss": 2.3618, "step": 176 }, { "epoch": 0.025370887981079338, "grad_norm": 0.4183753728866577, "learning_rate": 1.0716894226543953e-05, "loss": 2.4004, "step": 177 }, { "epoch": 0.025514226331254927, "grad_norm": 0.3798510432243347, "learning_rate": 9.815330310080887e-06, "loss": 2.3351, "step": 178 }, { "epoch": 0.025657564681430517, "grad_norm": 0.4367026090621948, "learning_rate": 8.952091482575824e-06, "loss": 2.339, "step": 179 }, { "epoch": 0.025800903031606107, "grad_norm": 0.43516334891319275, "learning_rate": 8.127413744904804e-06, "loss": 2.3865, "step": 180 }, { "epoch": 0.025944241381781696, "grad_norm": 0.39649835228919983, "learning_rate": 7.34152255572697e-06, "loss": 2.4152, "step": 181 }, { "epoch": 0.026087579731957286, "grad_norm": 0.4496510326862335, "learning_rate": 6.594632769846353e-06, "loss": 2.4076, "step": 182 }, { "epoch": 0.026230918082132876, "grad_norm": 0.36271652579307556, "learning_rate": 5.886948579472778e-06, "loss": 2.349, "step": 183 }, { "epoch": 0.026374256432308466, "grad_norm": 0.43088510632514954, "learning_rate": 5.218663458397715e-06, "loss": 2.412, "step": 184 }, { "epoch": 0.026517594782484055, "grad_norm": 0.4110885262489319, "learning_rate": 4.589960109100444e-06, "loss": 2.3669, "step": 185 }, { "epoch": 0.02666093313265964, "grad_norm": 0.44089290499687195, "learning_rate": 4.001010412799138e-06, "loss": 2.3644, "step": 186 }, { "epoch": 0.02680427148283523, "grad_norm": 0.46191123127937317, "learning_rate": 3.451975382460109e-06, "loss": 2.4576, "step": 187 }, { "epoch": 0.02694760983301082, "grad_norm": 0.43181681632995605, "learning_rate": 2.9430051187785962e-06, "loss": 2.3365, "step": 188 }, { "epoch": 0.02709094818318641, "grad_norm": 0.5321618914604187, "learning_rate": 2.4742387691426445e-06, "loss": 2.3837, "step": 189 }, { "epoch": 0.027234286533362, "grad_norm": 0.4526923894882202, "learning_rate": 2.0458044895916513e-06, "loss": 2.3506, "step": 190 }, { "epoch": 0.02737762488353759, "grad_norm": 0.42679014801979065, "learning_rate": 1.6578194097797258e-06, "loss": 2.4344, "step": 191 }, { "epoch": 0.02752096323371318, "grad_norm": 0.5031974911689758, "learning_rate": 1.3103896009537207e-06, "loss": 2.3758, "step": 192 }, { "epoch": 0.02766430158388877, "grad_norm": 0.4715871214866638, "learning_rate": 1.0036100469542786e-06, "loss": 2.4062, "step": 193 }, { "epoch": 0.02780763993406436, "grad_norm": 0.5107702612876892, "learning_rate": 7.375646182482875e-07, "loss": 2.4455, "step": 194 }, { "epoch": 0.02795097828423995, "grad_norm": 0.44611087441444397, "learning_rate": 5.123260489995229e-07, "loss": 2.456, "step": 195 }, { "epoch": 0.02809431663441554, "grad_norm": 0.4629242718219757, "learning_rate": 3.2795591718381975e-07, "loss": 2.3595, "step": 196 }, { 
"epoch": 0.02823765498459113, "grad_norm": 0.4822857975959778, "learning_rate": 1.8450462775428942e-07, "loss": 2.4289, "step": 197 }, { "epoch": 0.028380993334766718, "grad_norm": 0.4529360234737396, "learning_rate": 8.201139886109264e-08, "loss": 2.3392, "step": 198 }, { "epoch": 0.028524331684942308, "grad_norm": 0.43792641162872314, "learning_rate": 2.0504251129649374e-08, "loss": 2.4626, "step": 199 }, { "epoch": 0.028667670035117897, "grad_norm": 0.4136735796928406, "learning_rate": 0.0, "loss": 2.35, "step": 200 }, { "epoch": 0.028667670035117897, "eval_loss": 2.4329843521118164, "eval_runtime": 4.566, "eval_samples_per_second": 10.95, "eval_steps_per_second": 1.533, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.39939410345984e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }