{ "best_metric": 1.0438764095306396, "best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-8900", "epoch": 2.7285921625544267, "eval_steps": 100, "global_step": 9400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02902757619738752, "grad_norm": 1.131932258605957, "learning_rate": 9.970972423802612e-05, "loss": 2.8244, "step": 100 }, { "epoch": 0.02902757619738752, "eval_loss": 1.2662084102630615, "eval_runtime": 213.5614, "eval_samples_per_second": 213.049, "eval_steps_per_second": 3.329, "step": 100 }, { "epoch": 0.05805515239477504, "grad_norm": 1.0239707231521606, "learning_rate": 9.941944847605225e-05, "loss": 2.7081, "step": 200 }, { "epoch": 0.05805515239477504, "eval_loss": 1.2453378438949585, "eval_runtime": 212.9056, "eval_samples_per_second": 213.705, "eval_steps_per_second": 3.34, "step": 200 }, { "epoch": 0.08708272859216255, "grad_norm": 1.1205116510391235, "learning_rate": 9.912917271407838e-05, "loss": 2.642, "step": 300 }, { "epoch": 0.08708272859216255, "eval_loss": 1.2237757444381714, "eval_runtime": 214.4447, "eval_samples_per_second": 212.171, "eval_steps_per_second": 3.316, "step": 300 }, { "epoch": 0.11611030478955008, "grad_norm": 1.0193355083465576, "learning_rate": 9.883889695210451e-05, "loss": 2.6037, "step": 400 }, { "epoch": 0.11611030478955008, "eval_loss": 1.2148627042770386, "eval_runtime": 213.5123, "eval_samples_per_second": 213.098, "eval_steps_per_second": 3.33, "step": 400 }, { "epoch": 0.14513788098693758, "grad_norm": 1.05299711227417, "learning_rate": 9.854862119013063e-05, "loss": 2.5791, "step": 500 }, { "epoch": 0.14513788098693758, "eval_loss": 1.2020208835601807, "eval_runtime": 213.769, "eval_samples_per_second": 212.842, "eval_steps_per_second": 3.326, "step": 500 }, { "epoch": 0.1741654571843251, "grad_norm": 1.0508314371109009, "learning_rate": 9.825834542815675e-05, "loss": 2.5464, "step": 600 }, { "epoch": 0.1741654571843251, "eval_loss": 1.1960116624832153, "eval_runtime": 214.1083, "eval_samples_per_second": 212.505, "eval_steps_per_second": 3.321, "step": 600 }, { "epoch": 0.20319303338171263, "grad_norm": 1.158460021018982, "learning_rate": 9.796806966618288e-05, "loss": 2.5391, "step": 700 }, { "epoch": 0.20319303338171263, "eval_loss": 1.186664342880249, "eval_runtime": 213.4364, "eval_samples_per_second": 213.174, "eval_steps_per_second": 3.331, "step": 700 }, { "epoch": 0.23222060957910015, "grad_norm": 1.0704821348190308, "learning_rate": 9.767779390420901e-05, "loss": 2.4944, "step": 800 }, { "epoch": 0.23222060957910015, "eval_loss": 1.1850290298461914, "eval_runtime": 213.63, "eval_samples_per_second": 212.98, "eval_steps_per_second": 3.328, "step": 800 }, { "epoch": 0.2612481857764877, "grad_norm": 1.0562227964401245, "learning_rate": 9.738751814223513e-05, "loss": 2.4879, "step": 900 }, { "epoch": 0.2612481857764877, "eval_loss": 1.1725127696990967, "eval_runtime": 213.7307, "eval_samples_per_second": 212.88, "eval_steps_per_second": 3.327, "step": 900 }, { "epoch": 0.29027576197387517, "grad_norm": 1.136777639389038, "learning_rate": 9.709724238026126e-05, "loss": 2.4647, "step": 1000 }, { "epoch": 0.29027576197387517, "eval_loss": 1.1709253787994385, "eval_runtime": 213.2147, "eval_samples_per_second": 213.395, "eval_steps_per_second": 3.335, "step": 1000 }, { "epoch": 0.3193033381712627, "grad_norm": 1.0949931144714355, "learning_rate": 9.680696661828737e-05, "loss": 2.4441, "step": 1100 }, { "epoch": 0.3193033381712627, "eval_loss": 1.1647560596466064, "eval_runtime": 213.5056, "eval_samples_per_second": 213.104, "eval_steps_per_second": 3.33, "step": 1100 }, { "epoch": 0.3483309143686502, "grad_norm": 1.2719751596450806, "learning_rate": 9.65166908563135e-05, "loss": 2.432, "step": 1200 }, { "epoch": 0.3483309143686502, "eval_loss": 1.1668621301651, "eval_runtime": 213.8017, "eval_samples_per_second": 212.809, "eval_steps_per_second": 3.326, "step": 1200 }, { "epoch": 0.37735849056603776, "grad_norm": 1.1357173919677734, "learning_rate": 9.622641509433963e-05, "loss": 2.4173, "step": 1300 }, { "epoch": 0.37735849056603776, "eval_loss": 1.1585583686828613, "eval_runtime": 212.8448, "eval_samples_per_second": 213.766, "eval_steps_per_second": 3.34, "step": 1300 }, { "epoch": 0.40638606676342526, "grad_norm": 1.1240577697753906, "learning_rate": 9.593613933236575e-05, "loss": 2.4029, "step": 1400 }, { "epoch": 0.40638606676342526, "eval_loss": 1.1513617038726807, "eval_runtime": 214.5547, "eval_samples_per_second": 212.063, "eval_steps_per_second": 3.314, "step": 1400 }, { "epoch": 0.43541364296081275, "grad_norm": 1.074048399925232, "learning_rate": 9.564586357039188e-05, "loss": 2.3964, "step": 1500 }, { "epoch": 0.43541364296081275, "eval_loss": 1.1514214277267456, "eval_runtime": 213.8115, "eval_samples_per_second": 212.8, "eval_steps_per_second": 3.325, "step": 1500 }, { "epoch": 0.4644412191582003, "grad_norm": 1.2565686702728271, "learning_rate": 9.5355587808418e-05, "loss": 2.3548, "step": 1600 }, { "epoch": 0.4644412191582003, "eval_loss": 1.1476994752883911, "eval_runtime": 214.3759, "eval_samples_per_second": 212.239, "eval_steps_per_second": 3.317, "step": 1600 }, { "epoch": 0.4934687953555878, "grad_norm": 1.1474090814590454, "learning_rate": 9.506531204644412e-05, "loss": 2.36, "step": 1700 }, { "epoch": 0.4934687953555878, "eval_loss": 1.1446571350097656, "eval_runtime": 213.458, "eval_samples_per_second": 213.152, "eval_steps_per_second": 3.331, "step": 1700 }, { "epoch": 0.5224963715529753, "grad_norm": 1.2290916442871094, "learning_rate": 9.477503628447025e-05, "loss": 2.3438, "step": 1800 }, { "epoch": 0.5224963715529753, "eval_loss": 1.1393438577651978, "eval_runtime": 213.014, "eval_samples_per_second": 213.596, "eval_steps_per_second": 3.338, "step": 1800 }, { "epoch": 0.5515239477503628, "grad_norm": 1.1700950860977173, "learning_rate": 9.448476052249638e-05, "loss": 2.3416, "step": 1900 }, { "epoch": 0.5515239477503628, "eval_loss": 1.1348192691802979, "eval_runtime": 213.2252, "eval_samples_per_second": 213.385, "eval_steps_per_second": 3.335, "step": 1900 }, { "epoch": 0.5805515239477503, "grad_norm": 1.1090705394744873, "learning_rate": 9.419448476052251e-05, "loss": 2.3289, "step": 2000 }, { "epoch": 0.5805515239477503, "eval_loss": 1.130873203277588, "eval_runtime": 212.7564, "eval_samples_per_second": 213.855, "eval_steps_per_second": 3.342, "step": 2000 }, { "epoch": 0.6095791001451378, "grad_norm": 1.17753267288208, "learning_rate": 9.390420899854863e-05, "loss": 2.3218, "step": 2100 }, { "epoch": 0.6095791001451378, "eval_loss": 1.1335190534591675, "eval_runtime": 212.7619, "eval_samples_per_second": 213.849, "eval_steps_per_second": 3.342, "step": 2100 }, { "epoch": 0.6386066763425254, "grad_norm": 1.087358832359314, "learning_rate": 9.361393323657474e-05, "loss": 2.3072, "step": 2200 }, { "epoch": 0.6386066763425254, "eval_loss": 1.1303313970565796, "eval_runtime": 213.3449, "eval_samples_per_second": 213.265, "eval_steps_per_second": 3.333, "step": 2200 }, { "epoch": 0.6676342525399129, "grad_norm": 1.1286981105804443, "learning_rate": 9.332365747460087e-05, "loss": 2.2881, "step": 2300 }, { "epoch": 0.6676342525399129, "eval_loss": 1.1234804391860962, "eval_runtime": 213.3465, "eval_samples_per_second": 213.263, "eval_steps_per_second": 3.333, "step": 2300 }, { "epoch": 0.6966618287373004, "grad_norm": 1.1590163707733154, "learning_rate": 9.3033381712627e-05, "loss": 2.2751, "step": 2400 }, { "epoch": 0.6966618287373004, "eval_loss": 1.120328664779663, "eval_runtime": 213.9246, "eval_samples_per_second": 212.687, "eval_steps_per_second": 3.324, "step": 2400 }, { "epoch": 0.7256894049346879, "grad_norm": 1.3988169431686401, "learning_rate": 9.274310595065312e-05, "loss": 2.2666, "step": 2500 }, { "epoch": 0.7256894049346879, "eval_loss": 1.1266223192214966, "eval_runtime": 214.3634, "eval_samples_per_second": 212.252, "eval_steps_per_second": 3.317, "step": 2500 }, { "epoch": 0.7547169811320755, "grad_norm": 1.239560842514038, "learning_rate": 9.245283018867925e-05, "loss": 2.2702, "step": 2600 }, { "epoch": 0.7547169811320755, "eval_loss": 1.1224210262298584, "eval_runtime": 213.2424, "eval_samples_per_second": 213.367, "eval_steps_per_second": 3.334, "step": 2600 }, { "epoch": 0.783744557329463, "grad_norm": 1.1289948225021362, "learning_rate": 9.216255442670537e-05, "loss": 2.256, "step": 2700 }, { "epoch": 0.783744557329463, "eval_loss": 1.1150513887405396, "eval_runtime": 213.4486, "eval_samples_per_second": 213.161, "eval_steps_per_second": 3.331, "step": 2700 }, { "epoch": 0.8127721335268505, "grad_norm": 1.1463016271591187, "learning_rate": 9.18722786647315e-05, "loss": 2.2483, "step": 2800 }, { "epoch": 0.8127721335268505, "eval_loss": 1.1185483932495117, "eval_runtime": 212.704, "eval_samples_per_second": 213.908, "eval_steps_per_second": 3.343, "step": 2800 }, { "epoch": 0.841799709724238, "grad_norm": 1.1233168840408325, "learning_rate": 9.158200290275763e-05, "loss": 2.2328, "step": 2900 }, { "epoch": 0.841799709724238, "eval_loss": 1.1085420846939087, "eval_runtime": 213.7255, "eval_samples_per_second": 212.885, "eval_steps_per_second": 3.327, "step": 2900 }, { "epoch": 0.8708272859216255, "grad_norm": 1.1887527704238892, "learning_rate": 9.129172714078375e-05, "loss": 2.235, "step": 3000 }, { "epoch": 0.8708272859216255, "eval_loss": 1.1104073524475098, "eval_runtime": 213.9252, "eval_samples_per_second": 212.687, "eval_steps_per_second": 3.324, "step": 3000 }, { "epoch": 0.8998548621190131, "grad_norm": 1.2834577560424805, "learning_rate": 9.100145137880988e-05, "loss": 2.2209, "step": 3100 }, { "epoch": 0.8998548621190131, "eval_loss": 1.1137757301330566, "eval_runtime": 213.6201, "eval_samples_per_second": 212.99, "eval_steps_per_second": 3.328, "step": 3100 }, { "epoch": 0.9288824383164006, "grad_norm": 1.3034873008728027, "learning_rate": 9.0711175616836e-05, "loss": 2.2185, "step": 3200 }, { "epoch": 0.9288824383164006, "eval_loss": 1.107863187789917, "eval_runtime": 213.1098, "eval_samples_per_second": 213.5, "eval_steps_per_second": 3.336, "step": 3200 }, { "epoch": 0.9579100145137881, "grad_norm": 1.1802492141723633, "learning_rate": 9.042089985486212e-05, "loss": 2.2147, "step": 3300 }, { "epoch": 0.9579100145137881, "eval_loss": 1.1041762828826904, "eval_runtime": 213.2962, "eval_samples_per_second": 213.314, "eval_steps_per_second": 3.333, "step": 3300 }, { "epoch": 0.9869375907111756, "grad_norm": 1.2992894649505615, "learning_rate": 9.013062409288826e-05, "loss": 2.216, "step": 3400 }, { "epoch": 0.9869375907111756, "eval_loss": 1.1009138822555542, "eval_runtime": 213.7998, "eval_samples_per_second": 212.811, "eval_steps_per_second": 3.326, "step": 3400 }, { "epoch": 1.0159651669085632, "grad_norm": 1.1432065963745117, "learning_rate": 8.984034833091437e-05, "loss": 2.1952, "step": 3500 }, { "epoch": 1.0159651669085632, "eval_loss": 1.106726884841919, "eval_runtime": 213.7054, "eval_samples_per_second": 212.905, "eval_steps_per_second": 3.327, "step": 3500 }, { "epoch": 1.0449927431059507, "grad_norm": 1.1603158712387085, "learning_rate": 8.95500725689405e-05, "loss": 2.2019, "step": 3600 }, { "epoch": 1.0449927431059507, "eval_loss": 1.1014330387115479, "eval_runtime": 213.1977, "eval_samples_per_second": 213.412, "eval_steps_per_second": 3.335, "step": 3600 }, { "epoch": 1.0740203193033382, "grad_norm": 1.2428488731384277, "learning_rate": 8.925979680696662e-05, "loss": 2.1959, "step": 3700 }, { "epoch": 1.0740203193033382, "eval_loss": 1.1004406213760376, "eval_runtime": 213.3658, "eval_samples_per_second": 213.244, "eval_steps_per_second": 3.332, "step": 3700 }, { "epoch": 1.1030478955007257, "grad_norm": 1.1615545749664307, "learning_rate": 8.896952104499274e-05, "loss": 2.1776, "step": 3800 }, { "epoch": 1.1030478955007257, "eval_loss": 1.0938160419464111, "eval_runtime": 213.3987, "eval_samples_per_second": 213.211, "eval_steps_per_second": 3.332, "step": 3800 }, { "epoch": 1.1320754716981132, "grad_norm": 1.1921610832214355, "learning_rate": 8.867924528301888e-05, "loss": 2.1762, "step": 3900 }, { "epoch": 1.1320754716981132, "eval_loss": 1.0960694551467896, "eval_runtime": 213.1832, "eval_samples_per_second": 213.427, "eval_steps_per_second": 3.335, "step": 3900 }, { "epoch": 1.1611030478955007, "grad_norm": 1.1980363130569458, "learning_rate": 8.8388969521045e-05, "loss": 2.1717, "step": 4000 }, { "epoch": 1.1611030478955007, "eval_loss": 1.0951919555664062, "eval_runtime": 213.4024, "eval_samples_per_second": 213.207, "eval_steps_per_second": 3.332, "step": 4000 }, { "epoch": 1.1901306240928882, "grad_norm": 1.217236042022705, "learning_rate": 8.809869375907113e-05, "loss": 2.1534, "step": 4100 }, { "epoch": 1.1901306240928882, "eval_loss": 1.0937577486038208, "eval_runtime": 213.8113, "eval_samples_per_second": 212.8, "eval_steps_per_second": 3.325, "step": 4100 }, { "epoch": 1.2191582002902757, "grad_norm": 1.2121118307113647, "learning_rate": 8.780841799709725e-05, "loss": 2.1639, "step": 4200 }, { "epoch": 1.2191582002902757, "eval_loss": 1.0909945964813232, "eval_runtime": 212.8308, "eval_samples_per_second": 213.78, "eval_steps_per_second": 3.341, "step": 4200 }, { "epoch": 1.2481857764876634, "grad_norm": 1.17587411403656, "learning_rate": 8.751814223512336e-05, "loss": 2.146, "step": 4300 }, { "epoch": 1.2481857764876634, "eval_loss": 1.0888868570327759, "eval_runtime": 213.8752, "eval_samples_per_second": 212.736, "eval_steps_per_second": 3.324, "step": 4300 }, { "epoch": 1.2772133526850509, "grad_norm": 1.2848412990570068, "learning_rate": 8.722786647314949e-05, "loss": 2.1357, "step": 4400 }, { "epoch": 1.2772133526850509, "eval_loss": 1.091068983078003, "eval_runtime": 213.4081, "eval_samples_per_second": 213.202, "eval_steps_per_second": 3.332, "step": 4400 }, { "epoch": 1.3062409288824384, "grad_norm": 1.2059731483459473, "learning_rate": 8.693759071117562e-05, "loss": 2.1456, "step": 4500 }, { "epoch": 1.3062409288824384, "eval_loss": 1.0857021808624268, "eval_runtime": 213.7314, "eval_samples_per_second": 212.879, "eval_steps_per_second": 3.327, "step": 4500 }, { "epoch": 1.3352685050798259, "grad_norm": 1.226241946220398, "learning_rate": 8.664731494920174e-05, "loss": 2.1453, "step": 4600 }, { "epoch": 1.3352685050798259, "eval_loss": 1.0845140218734741, "eval_runtime": 213.4698, "eval_samples_per_second": 213.14, "eval_steps_per_second": 3.331, "step": 4600 }, { "epoch": 1.3642960812772134, "grad_norm": 1.1810499429702759, "learning_rate": 8.635703918722787e-05, "loss": 2.1425, "step": 4700 }, { "epoch": 1.3642960812772134, "eval_loss": 1.0831544399261475, "eval_runtime": 214.2077, "eval_samples_per_second": 212.406, "eval_steps_per_second": 3.319, "step": 4700 }, { "epoch": 1.3933236574746009, "grad_norm": 1.155281662940979, "learning_rate": 8.606676342525399e-05, "loss": 2.1173, "step": 4800 }, { "epoch": 1.3933236574746009, "eval_loss": 1.0785441398620605, "eval_runtime": 213.6973, "eval_samples_per_second": 212.913, "eval_steps_per_second": 3.327, "step": 4800 }, { "epoch": 1.4223512336719883, "grad_norm": 1.2070744037628174, "learning_rate": 8.577648766328012e-05, "loss": 2.1183, "step": 4900 }, { "epoch": 1.4223512336719883, "eval_loss": 1.0808286666870117, "eval_runtime": 213.4564, "eval_samples_per_second": 213.154, "eval_steps_per_second": 3.331, "step": 4900 }, { "epoch": 1.4513788098693758, "grad_norm": 1.1901525259017944, "learning_rate": 8.548621190130625e-05, "loss": 2.1274, "step": 5000 }, { "epoch": 1.4513788098693758, "eval_loss": 1.0827044248580933, "eval_runtime": 212.5592, "eval_samples_per_second": 214.053, "eval_steps_per_second": 3.345, "step": 5000 }, { "epoch": 1.4804063860667633, "grad_norm": 1.1999766826629639, "learning_rate": 8.519593613933237e-05, "loss": 2.1145, "step": 5100 }, { "epoch": 1.4804063860667633, "eval_loss": 1.078644037246704, "eval_runtime": 213.0532, "eval_samples_per_second": 213.557, "eval_steps_per_second": 3.337, "step": 5100 }, { "epoch": 1.509433962264151, "grad_norm": 1.2294871807098389, "learning_rate": 8.49056603773585e-05, "loss": 2.1067, "step": 5200 }, { "epoch": 1.509433962264151, "eval_loss": 1.0794402360916138, "eval_runtime": 212.9617, "eval_samples_per_second": 213.649, "eval_steps_per_second": 3.339, "step": 5200 }, { "epoch": 1.5384615384615383, "grad_norm": 1.2571580410003662, "learning_rate": 8.461538461538461e-05, "loss": 2.1032, "step": 5300 }, { "epoch": 1.5384615384615383, "eval_loss": 1.0783346891403198, "eval_runtime": 213.4656, "eval_samples_per_second": 213.144, "eval_steps_per_second": 3.331, "step": 5300 }, { "epoch": 1.567489114658926, "grad_norm": 1.2078722715377808, "learning_rate": 8.432510885341074e-05, "loss": 2.0912, "step": 5400 }, { "epoch": 1.567489114658926, "eval_loss": 1.0764219760894775, "eval_runtime": 213.826, "eval_samples_per_second": 212.785, "eval_steps_per_second": 3.325, "step": 5400 }, { "epoch": 1.5965166908563135, "grad_norm": 1.272294521331787, "learning_rate": 8.403483309143688e-05, "loss": 2.0784, "step": 5500 }, { "epoch": 1.5965166908563135, "eval_loss": 1.0817687511444092, "eval_runtime": 213.443, "eval_samples_per_second": 213.167, "eval_steps_per_second": 3.331, "step": 5500 }, { "epoch": 1.625544267053701, "grad_norm": 1.2367442846298218, "learning_rate": 8.374455732946299e-05, "loss": 2.0997, "step": 5600 }, { "epoch": 1.625544267053701, "eval_loss": 1.079858660697937, "eval_runtime": 213.7339, "eval_samples_per_second": 212.877, "eval_steps_per_second": 3.327, "step": 5600 }, { "epoch": 1.6545718432510885, "grad_norm": 1.2720229625701904, "learning_rate": 8.345428156748912e-05, "loss": 2.093, "step": 5700 }, { "epoch": 1.6545718432510885, "eval_loss": 1.0779507160186768, "eval_runtime": 213.2034, "eval_samples_per_second": 213.407, "eval_steps_per_second": 3.335, "step": 5700 }, { "epoch": 1.683599419448476, "grad_norm": 1.1694726943969727, "learning_rate": 8.316400580551524e-05, "loss": 2.0822, "step": 5800 }, { "epoch": 1.683599419448476, "eval_loss": 1.068250060081482, "eval_runtime": 213.1022, "eval_samples_per_second": 213.508, "eval_steps_per_second": 3.336, "step": 5800 }, { "epoch": 1.7126269956458637, "grad_norm": 1.2155323028564453, "learning_rate": 8.287373004354137e-05, "loss": 2.0792, "step": 5900 }, { "epoch": 1.7126269956458637, "eval_loss": 1.0666776895523071, "eval_runtime": 213.4935, "eval_samples_per_second": 213.117, "eval_steps_per_second": 3.33, "step": 5900 }, { "epoch": 1.741654571843251, "grad_norm": 1.3163602352142334, "learning_rate": 8.25834542815675e-05, "loss": 2.0712, "step": 6000 }, { "epoch": 1.741654571843251, "eval_loss": 1.0677340030670166, "eval_runtime": 213.751, "eval_samples_per_second": 212.86, "eval_steps_per_second": 3.326, "step": 6000 }, { "epoch": 1.7706821480406387, "grad_norm": 1.1972286701202393, "learning_rate": 8.229317851959362e-05, "loss": 2.0679, "step": 6100 }, { "epoch": 1.7706821480406387, "eval_loss": 1.0662775039672852, "eval_runtime": 213.7705, "eval_samples_per_second": 212.84, "eval_steps_per_second": 3.326, "step": 6100 }, { "epoch": 1.799709724238026, "grad_norm": 1.189395546913147, "learning_rate": 8.200290275761974e-05, "loss": 2.0753, "step": 6200 }, { "epoch": 1.799709724238026, "eval_loss": 1.0646038055419922, "eval_runtime": 213.3945, "eval_samples_per_second": 213.215, "eval_steps_per_second": 3.332, "step": 6200 }, { "epoch": 1.8287373004354137, "grad_norm": 1.2696415185928345, "learning_rate": 8.171262699564587e-05, "loss": 2.063, "step": 6300 }, { "epoch": 1.8287373004354137, "eval_loss": 1.0669814348220825, "eval_runtime": 213.7587, "eval_samples_per_second": 212.852, "eval_steps_per_second": 3.326, "step": 6300 }, { "epoch": 1.8577648766328012, "grad_norm": 1.241452693939209, "learning_rate": 8.142235123367198e-05, "loss": 2.0508, "step": 6400 }, { "epoch": 1.8577648766328012, "eval_loss": 1.072275996208191, "eval_runtime": 213.3197, "eval_samples_per_second": 213.29, "eval_steps_per_second": 3.333, "step": 6400 }, { "epoch": 1.8867924528301887, "grad_norm": 1.22267484664917, "learning_rate": 8.113207547169813e-05, "loss": 2.07, "step": 6500 }, { "epoch": 1.8867924528301887, "eval_loss": 1.0654535293579102, "eval_runtime": 214.0386, "eval_samples_per_second": 212.574, "eval_steps_per_second": 3.322, "step": 6500 }, { "epoch": 1.9158200290275762, "grad_norm": 1.2704839706420898, "learning_rate": 8.084179970972424e-05, "loss": 2.0646, "step": 6600 }, { "epoch": 1.9158200290275762, "eval_loss": 1.0614382028579712, "eval_runtime": 213.4971, "eval_samples_per_second": 213.113, "eval_steps_per_second": 3.33, "step": 6600 }, { "epoch": 1.9448476052249637, "grad_norm": 1.3870867490768433, "learning_rate": 8.055152394775036e-05, "loss": 2.0598, "step": 6700 }, { "epoch": 1.9448476052249637, "eval_loss": 1.067047357559204, "eval_runtime": 214.0952, "eval_samples_per_second": 212.518, "eval_steps_per_second": 3.321, "step": 6700 }, { "epoch": 1.9738751814223512, "grad_norm": 1.3581643104553223, "learning_rate": 8.026124818577649e-05, "loss": 2.0501, "step": 6800 }, { "epoch": 1.9738751814223512, "eval_loss": 1.0663081407546997, "eval_runtime": 213.8995, "eval_samples_per_second": 212.712, "eval_steps_per_second": 3.324, "step": 6800 }, { "epoch": 2.0029027576197387, "grad_norm": 1.3438752889633179, "learning_rate": 7.997097242380261e-05, "loss": 2.0332, "step": 6900 }, { "epoch": 2.0029027576197387, "eval_loss": 1.059921383857727, "eval_runtime": 213.0183, "eval_samples_per_second": 213.592, "eval_steps_per_second": 3.338, "step": 6900 }, { "epoch": 2.0319303338171264, "grad_norm": 1.3646849393844604, "learning_rate": 7.968069666182875e-05, "loss": 2.0463, "step": 7000 }, { "epoch": 2.0319303338171264, "eval_loss": 1.0679893493652344, "eval_runtime": 213.3912, "eval_samples_per_second": 213.219, "eval_steps_per_second": 3.332, "step": 7000 }, { "epoch": 2.0609579100145137, "grad_norm": 1.2047359943389893, "learning_rate": 7.939042089985487e-05, "loss": 2.0376, "step": 7100 }, { "epoch": 2.0609579100145137, "eval_loss": 1.0566322803497314, "eval_runtime": 213.6266, "eval_samples_per_second": 212.984, "eval_steps_per_second": 3.328, "step": 7100 }, { "epoch": 2.0899854862119014, "grad_norm": 1.2285219430923462, "learning_rate": 7.910014513788099e-05, "loss": 2.0327, "step": 7200 }, { "epoch": 2.0899854862119014, "eval_loss": 1.058618426322937, "eval_runtime": 213.6922, "eval_samples_per_second": 212.918, "eval_steps_per_second": 3.327, "step": 7200 }, { "epoch": 2.1190130624092887, "grad_norm": 1.2674715518951416, "learning_rate": 7.880986937590712e-05, "loss": 2.0347, "step": 7300 }, { "epoch": 2.1190130624092887, "eval_loss": 1.0599507093429565, "eval_runtime": 213.5256, "eval_samples_per_second": 213.085, "eval_steps_per_second": 3.33, "step": 7300 }, { "epoch": 2.1480406386066764, "grad_norm": 1.3713229894638062, "learning_rate": 7.851959361393323e-05, "loss": 2.0321, "step": 7400 }, { "epoch": 2.1480406386066764, "eval_loss": 1.0617178678512573, "eval_runtime": 213.0273, "eval_samples_per_second": 213.583, "eval_steps_per_second": 3.338, "step": 7400 }, { "epoch": 2.1770682148040637, "grad_norm": 1.292090654373169, "learning_rate": 7.822931785195937e-05, "loss": 2.01, "step": 7500 }, { "epoch": 2.1770682148040637, "eval_loss": 1.0593364238739014, "eval_runtime": 213.421, "eval_samples_per_second": 213.189, "eval_steps_per_second": 3.331, "step": 7500 }, { "epoch": 2.2060957910014514, "grad_norm": 1.1819452047348022, "learning_rate": 7.79390420899855e-05, "loss": 2.0209, "step": 7600 }, { "epoch": 2.2060957910014514, "eval_loss": 1.0524711608886719, "eval_runtime": 214.0149, "eval_samples_per_second": 212.597, "eval_steps_per_second": 3.322, "step": 7600 }, { "epoch": 2.235123367198839, "grad_norm": 1.2881128787994385, "learning_rate": 7.764876632801161e-05, "loss": 2.0085, "step": 7700 }, { "epoch": 2.235123367198839, "eval_loss": 1.0567752122879028, "eval_runtime": 213.6228, "eval_samples_per_second": 212.988, "eval_steps_per_second": 3.328, "step": 7700 }, { "epoch": 2.2641509433962264, "grad_norm": 1.2962584495544434, "learning_rate": 7.735849056603774e-05, "loss": 2.0204, "step": 7800 }, { "epoch": 2.2641509433962264, "eval_loss": 1.0586293935775757, "eval_runtime": 213.3516, "eval_samples_per_second": 213.258, "eval_steps_per_second": 3.333, "step": 7800 }, { "epoch": 2.293178519593614, "grad_norm": 1.2214884757995605, "learning_rate": 7.706821480406386e-05, "loss": 2.0184, "step": 7900 }, { "epoch": 2.293178519593614, "eval_loss": 1.0525050163269043, "eval_runtime": 212.5483, "eval_samples_per_second": 214.064, "eval_steps_per_second": 3.345, "step": 7900 }, { "epoch": 2.3222060957910013, "grad_norm": 1.2622853517532349, "learning_rate": 7.677793904208999e-05, "loss": 2.0162, "step": 8000 }, { "epoch": 2.3222060957910013, "eval_loss": 1.0512940883636475, "eval_runtime": 212.6462, "eval_samples_per_second": 213.966, "eval_steps_per_second": 3.344, "step": 8000 }, { "epoch": 2.351233671988389, "grad_norm": 1.2338088750839233, "learning_rate": 7.648766328011612e-05, "loss": 2.0029, "step": 8100 }, { "epoch": 2.351233671988389, "eval_loss": 1.0521414279937744, "eval_runtime": 213.5358, "eval_samples_per_second": 213.074, "eval_steps_per_second": 3.33, "step": 8100 }, { "epoch": 2.3802612481857763, "grad_norm": 1.2111109495162964, "learning_rate": 7.619738751814224e-05, "loss": 2.0101, "step": 8200 }, { "epoch": 2.3802612481857763, "eval_loss": 1.0501890182495117, "eval_runtime": 213.0351, "eval_samples_per_second": 213.575, "eval_steps_per_second": 3.337, "step": 8200 }, { "epoch": 2.409288824383164, "grad_norm": 1.2333025932312012, "learning_rate": 7.590711175616836e-05, "loss": 2.0, "step": 8300 }, { "epoch": 2.409288824383164, "eval_loss": 1.051579236984253, "eval_runtime": 213.5529, "eval_samples_per_second": 213.057, "eval_steps_per_second": 3.329, "step": 8300 }, { "epoch": 2.4383164005805513, "grad_norm": 1.3394699096679688, "learning_rate": 7.561683599419449e-05, "loss": 1.9986, "step": 8400 }, { "epoch": 2.4383164005805513, "eval_loss": 1.0520364046096802, "eval_runtime": 212.3818, "eval_samples_per_second": 214.232, "eval_steps_per_second": 3.348, "step": 8400 }, { "epoch": 2.467343976777939, "grad_norm": 1.334936261177063, "learning_rate": 7.532656023222062e-05, "loss": 1.993, "step": 8500 }, { "epoch": 2.467343976777939, "eval_loss": 1.0490361452102661, "eval_runtime": 213.9415, "eval_samples_per_second": 212.67, "eval_steps_per_second": 3.323, "step": 8500 }, { "epoch": 2.4963715529753268, "grad_norm": 1.3085263967514038, "learning_rate": 7.503628447024675e-05, "loss": 1.9771, "step": 8600 }, { "epoch": 2.4963715529753268, "eval_loss": 1.0522186756134033, "eval_runtime": 212.3302, "eval_samples_per_second": 214.284, "eval_steps_per_second": 3.349, "step": 8600 }, { "epoch": 2.525399129172714, "grad_norm": 1.4204107522964478, "learning_rate": 7.474600870827286e-05, "loss": 1.9848, "step": 8700 }, { "epoch": 2.525399129172714, "eval_loss": 1.0486035346984863, "eval_runtime": 213.5477, "eval_samples_per_second": 213.062, "eval_steps_per_second": 3.329, "step": 8700 }, { "epoch": 2.5544267053701017, "grad_norm": 1.2411503791809082, "learning_rate": 7.445573294629898e-05, "loss": 2.0016, "step": 8800 }, { "epoch": 2.5544267053701017, "eval_loss": 1.0516774654388428, "eval_runtime": 213.2425, "eval_samples_per_second": 213.367, "eval_steps_per_second": 3.334, "step": 8800 }, { "epoch": 2.583454281567489, "grad_norm": 1.2166720628738403, "learning_rate": 7.416545718432511e-05, "loss": 1.9761, "step": 8900 }, { "epoch": 2.583454281567489, "eval_loss": 1.0438764095306396, "eval_runtime": 213.2447, "eval_samples_per_second": 213.365, "eval_steps_per_second": 3.334, "step": 8900 }, { "epoch": 2.6124818577648767, "grad_norm": 1.307707667350769, "learning_rate": 7.387518142235124e-05, "loss": 1.9753, "step": 9000 }, { "epoch": 2.6124818577648767, "eval_loss": 1.0445740222930908, "eval_runtime": 212.5813, "eval_samples_per_second": 214.031, "eval_steps_per_second": 3.345, "step": 9000 }, { "epoch": 2.641509433962264, "grad_norm": 1.3446862697601318, "learning_rate": 7.358490566037736e-05, "loss": 1.9795, "step": 9100 }, { "epoch": 2.641509433962264, "eval_loss": 1.0461750030517578, "eval_runtime": 213.2022, "eval_samples_per_second": 213.408, "eval_steps_per_second": 3.335, "step": 9100 }, { "epoch": 2.6705370101596517, "grad_norm": 1.25364351272583, "learning_rate": 7.329462989840349e-05, "loss": 1.966, "step": 9200 }, { "epoch": 2.6705370101596517, "eval_loss": 1.0489540100097656, "eval_runtime": 213.3373, "eval_samples_per_second": 213.273, "eval_steps_per_second": 3.333, "step": 9200 }, { "epoch": 2.699564586357039, "grad_norm": 1.317325472831726, "learning_rate": 7.300435413642961e-05, "loss": 1.9853, "step": 9300 }, { "epoch": 2.699564586357039, "eval_loss": 1.04426109790802, "eval_runtime": 212.5953, "eval_samples_per_second": 214.017, "eval_steps_per_second": 3.344, "step": 9300 }, { "epoch": 2.7285921625544267, "grad_norm": 1.2580476999282837, "learning_rate": 7.271407837445574e-05, "loss": 1.9873, "step": 9400 }, { "epoch": 2.7285921625544267, "eval_loss": 1.0441796779632568, "eval_runtime": 213.1744, "eval_samples_per_second": 213.436, "eval_steps_per_second": 3.335, "step": 9400 } ], "logging_steps": 100, "max_steps": 34450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.403409048272896e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }