{ "best_metric": 1.0211207866668701, "best_model_checkpoint": "mgh6/TCS_MLM_50/checkpoint-12500", "epoch": 3.7735849056603774, "eval_steps": 100, "global_step": 13000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02902757619738752, "grad_norm": 1.1695395708084106, "learning_rate": 9.970972423802612e-05, "loss": 2.8263, "step": 100 }, { "epoch": 0.02902757619738752, "eval_loss": 1.2625532150268555, "eval_runtime": 213.9369, "eval_samples_per_second": 212.651, "eval_steps_per_second": 3.323, "step": 100 }, { "epoch": 0.05805515239477504, "grad_norm": 1.1860003471374512, "learning_rate": 9.941944847605225e-05, "loss": 2.7152, "step": 200 }, { "epoch": 0.05805515239477504, "eval_loss": 1.2428085803985596, "eval_runtime": 214.2707, "eval_samples_per_second": 212.32, "eval_steps_per_second": 3.318, "step": 200 }, { "epoch": 0.08708272859216255, "grad_norm": 1.138083577156067, "learning_rate": 9.912917271407838e-05, "loss": 2.6496, "step": 300 }, { "epoch": 0.08708272859216255, "eval_loss": 1.2260128259658813, "eval_runtime": 214.1595, "eval_samples_per_second": 212.43, "eval_steps_per_second": 3.32, "step": 300 }, { "epoch": 0.11611030478955008, "grad_norm": 1.118958592414856, "learning_rate": 9.883889695210451e-05, "loss": 2.6016, "step": 400 }, { "epoch": 0.11611030478955008, "eval_loss": 1.2176499366760254, "eval_runtime": 214.7144, "eval_samples_per_second": 211.881, "eval_steps_per_second": 3.311, "step": 400 }, { "epoch": 0.14513788098693758, "grad_norm": 1.0901985168457031, "learning_rate": 9.854862119013063e-05, "loss": 2.5744, "step": 500 }, { "epoch": 0.14513788098693758, "eval_loss": 1.200432300567627, "eval_runtime": 214.0554, "eval_samples_per_second": 212.534, "eval_steps_per_second": 3.322, "step": 500 }, { "epoch": 0.1741654571843251, "grad_norm": 1.1364562511444092, "learning_rate": 9.825834542815675e-05, "loss": 2.5412, "step": 600 }, { "epoch": 0.1741654571843251, "eval_loss": 1.19223153591156, "eval_runtime": 214.4792, "eval_samples_per_second": 212.114, "eval_steps_per_second": 3.315, "step": 600 }, { "epoch": 0.20319303338171263, "grad_norm": 1.1283212900161743, "learning_rate": 9.796806966618288e-05, "loss": 2.5318, "step": 700 }, { "epoch": 0.20319303338171263, "eval_loss": 1.1891732215881348, "eval_runtime": 214.0534, "eval_samples_per_second": 212.536, "eval_steps_per_second": 3.322, "step": 700 }, { "epoch": 0.23222060957910015, "grad_norm": 1.123288631439209, "learning_rate": 9.767779390420901e-05, "loss": 2.4897, "step": 800 }, { "epoch": 0.23222060957910015, "eval_loss": 1.1818548440933228, "eval_runtime": 214.4047, "eval_samples_per_second": 212.187, "eval_steps_per_second": 3.316, "step": 800 }, { "epoch": 0.2612481857764877, "grad_norm": 1.2019628286361694, "learning_rate": 9.738751814223513e-05, "loss": 2.4833, "step": 900 }, { "epoch": 0.2612481857764877, "eval_loss": 1.169024109840393, "eval_runtime": 214.4504, "eval_samples_per_second": 212.142, "eval_steps_per_second": 3.315, "step": 900 }, { "epoch": 0.29027576197387517, "grad_norm": 1.1262823343276978, "learning_rate": 9.709724238026126e-05, "loss": 2.4637, "step": 1000 }, { "epoch": 0.29027576197387517, "eval_loss": 1.1714328527450562, "eval_runtime": 213.9877, "eval_samples_per_second": 212.601, "eval_steps_per_second": 3.323, "step": 1000 }, { "epoch": 0.3193033381712627, "grad_norm": 1.1507551670074463, "learning_rate": 9.680696661828737e-05, "loss": 2.4408, "step": 1100 }, { "epoch": 0.3193033381712627, "eval_loss": 1.165282130241394, "eval_runtime": 214.1235, "eval_samples_per_second": 212.466, "eval_steps_per_second": 3.321, "step": 1100 }, { "epoch": 0.3483309143686502, "grad_norm": 1.1482508182525635, "learning_rate": 9.65166908563135e-05, "loss": 2.4353, "step": 1200 }, { "epoch": 0.3483309143686502, "eval_loss": 1.162631630897522, "eval_runtime": 214.0787, "eval_samples_per_second": 212.511, "eval_steps_per_second": 3.321, "step": 1200 }, { "epoch": 0.37735849056603776, "grad_norm": 1.1512328386306763, "learning_rate": 9.622641509433963e-05, "loss": 2.404, "step": 1300 }, { "epoch": 0.37735849056603776, "eval_loss": 1.1568048000335693, "eval_runtime": 214.1359, "eval_samples_per_second": 212.454, "eval_steps_per_second": 3.32, "step": 1300 }, { "epoch": 0.40638606676342526, "grad_norm": 1.1897668838500977, "learning_rate": 9.593613933236575e-05, "loss": 2.3905, "step": 1400 }, { "epoch": 0.40638606676342526, "eval_loss": 1.1544520854949951, "eval_runtime": 214.4841, "eval_samples_per_second": 212.109, "eval_steps_per_second": 3.315, "step": 1400 }, { "epoch": 0.43541364296081275, "grad_norm": 1.1775190830230713, "learning_rate": 9.564586357039188e-05, "loss": 2.3792, "step": 1500 }, { "epoch": 0.43541364296081275, "eval_loss": 1.1439248323440552, "eval_runtime": 213.8637, "eval_samples_per_second": 212.724, "eval_steps_per_second": 3.325, "step": 1500 }, { "epoch": 0.4644412191582003, "grad_norm": 1.1436594724655151, "learning_rate": 9.5355587808418e-05, "loss": 2.3592, "step": 1600 }, { "epoch": 0.4644412191582003, "eval_loss": 1.1428674459457397, "eval_runtime": 214.3984, "eval_samples_per_second": 212.194, "eval_steps_per_second": 3.316, "step": 1600 }, { "epoch": 0.4934687953555878, "grad_norm": 1.1852160692214966, "learning_rate": 9.506531204644412e-05, "loss": 2.3539, "step": 1700 }, { "epoch": 0.4934687953555878, "eval_loss": 1.1439515352249146, "eval_runtime": 214.0153, "eval_samples_per_second": 212.574, "eval_steps_per_second": 3.322, "step": 1700 }, { "epoch": 0.5224963715529753, "grad_norm": 1.2375448942184448, "learning_rate": 9.477503628447025e-05, "loss": 2.3489, "step": 1800 }, { "epoch": 0.5224963715529753, "eval_loss": 1.1391005516052246, "eval_runtime": 214.5494, "eval_samples_per_second": 212.044, "eval_steps_per_second": 3.314, "step": 1800 }, { "epoch": 0.5515239477503628, "grad_norm": 1.1505770683288574, "learning_rate": 9.448476052249638e-05, "loss": 2.3336, "step": 1900 }, { "epoch": 0.5515239477503628, "eval_loss": 1.1322171688079834, "eval_runtime": 213.2802, "eval_samples_per_second": 213.306, "eval_steps_per_second": 3.334, "step": 1900 }, { "epoch": 0.5805515239477503, "grad_norm": 1.1152174472808838, "learning_rate": 9.419448476052251e-05, "loss": 2.3321, "step": 2000 }, { "epoch": 0.5805515239477503, "eval_loss": 1.1339818239212036, "eval_runtime": 213.6442, "eval_samples_per_second": 212.943, "eval_steps_per_second": 3.328, "step": 2000 }, { "epoch": 0.6095791001451378, "grad_norm": 1.1027612686157227, "learning_rate": 9.390420899854863e-05, "loss": 2.3039, "step": 2100 }, { "epoch": 0.6095791001451378, "eval_loss": 1.1304194927215576, "eval_runtime": 213.0147, "eval_samples_per_second": 213.572, "eval_steps_per_second": 3.338, "step": 2100 }, { "epoch": 0.6386066763425254, "grad_norm": 1.1585232019424438, "learning_rate": 9.361393323657474e-05, "loss": 2.3101, "step": 2200 }, { "epoch": 0.6386066763425254, "eval_loss": 1.1316287517547607, "eval_runtime": 214.3522, "eval_samples_per_second": 212.239, "eval_steps_per_second": 3.317, "step": 2200 }, { "epoch": 0.6676342525399129, "grad_norm": 1.1749528646469116, "learning_rate": 9.332365747460087e-05, "loss": 2.3048, "step": 2300 }, { "epoch": 0.6676342525399129, "eval_loss": 1.1262996196746826, "eval_runtime": 214.6823, "eval_samples_per_second": 211.913, "eval_steps_per_second": 3.312, "step": 2300 }, { "epoch": 0.6966618287373004, "grad_norm": 1.1533962488174438, "learning_rate": 9.3033381712627e-05, "loss": 2.2808, "step": 2400 }, { "epoch": 0.6966618287373004, "eval_loss": 1.1249016523361206, "eval_runtime": 214.0327, "eval_samples_per_second": 212.556, "eval_steps_per_second": 3.322, "step": 2400 }, { "epoch": 0.7256894049346879, "grad_norm": 1.1524910926818848, "learning_rate": 9.274310595065312e-05, "loss": 2.2865, "step": 2500 }, { "epoch": 0.7256894049346879, "eval_loss": 1.1257150173187256, "eval_runtime": 213.721, "eval_samples_per_second": 212.866, "eval_steps_per_second": 3.327, "step": 2500 }, { "epoch": 0.7547169811320755, "grad_norm": 1.1282308101654053, "learning_rate": 9.245283018867925e-05, "loss": 2.2654, "step": 2600 }, { "epoch": 0.7547169811320755, "eval_loss": 1.1186834573745728, "eval_runtime": 214.3515, "eval_samples_per_second": 212.24, "eval_steps_per_second": 3.317, "step": 2600 }, { "epoch": 0.783744557329463, "grad_norm": 1.239816427230835, "learning_rate": 9.216255442670537e-05, "loss": 2.2564, "step": 2700 }, { "epoch": 0.783744557329463, "eval_loss": 1.1156889200210571, "eval_runtime": 213.6092, "eval_samples_per_second": 212.978, "eval_steps_per_second": 3.329, "step": 2700 }, { "epoch": 0.8127721335268505, "grad_norm": 1.2036716938018799, "learning_rate": 9.18722786647315e-05, "loss": 2.2453, "step": 2800 }, { "epoch": 0.8127721335268505, "eval_loss": 1.118744134902954, "eval_runtime": 214.1658, "eval_samples_per_second": 212.424, "eval_steps_per_second": 3.32, "step": 2800 }, { "epoch": 0.841799709724238, "grad_norm": 1.2474415302276611, "learning_rate": 9.158200290275763e-05, "loss": 2.2402, "step": 2900 }, { "epoch": 0.841799709724238, "eval_loss": 1.1129833459854126, "eval_runtime": 214.2278, "eval_samples_per_second": 212.363, "eval_steps_per_second": 3.319, "step": 2900 }, { "epoch": 0.8708272859216255, "grad_norm": 1.2137649059295654, "learning_rate": 9.129172714078375e-05, "loss": 2.2243, "step": 3000 }, { "epoch": 0.8708272859216255, "eval_loss": 1.1113933324813843, "eval_runtime": 213.6564, "eval_samples_per_second": 212.931, "eval_steps_per_second": 3.328, "step": 3000 }, { "epoch": 0.8998548621190131, "grad_norm": 1.2188935279846191, "learning_rate": 9.100145137880988e-05, "loss": 2.2324, "step": 3100 }, { "epoch": 0.8998548621190131, "eval_loss": 1.1111185550689697, "eval_runtime": 214.2911, "eval_samples_per_second": 212.3, "eval_steps_per_second": 3.318, "step": 3100 }, { "epoch": 0.9288824383164006, "grad_norm": 1.2475199699401855, "learning_rate": 9.0711175616836e-05, "loss": 2.2329, "step": 3200 }, { "epoch": 0.9288824383164006, "eval_loss": 1.1126782894134521, "eval_runtime": 214.309, "eval_samples_per_second": 212.282, "eval_steps_per_second": 3.318, "step": 3200 }, { "epoch": 0.9579100145137881, "grad_norm": 1.1850870847702026, "learning_rate": 9.042089985486212e-05, "loss": 2.2292, "step": 3300 }, { "epoch": 0.9579100145137881, "eval_loss": 1.1046797037124634, "eval_runtime": 214.0726, "eval_samples_per_second": 212.517, "eval_steps_per_second": 3.321, "step": 3300 }, { "epoch": 0.9869375907111756, "grad_norm": 1.1915068626403809, "learning_rate": 9.013062409288826e-05, "loss": 2.2169, "step": 3400 }, { "epoch": 0.9869375907111756, "eval_loss": 1.1029560565948486, "eval_runtime": 214.518, "eval_samples_per_second": 212.075, "eval_steps_per_second": 3.314, "step": 3400 }, { "epoch": 1.0159651669085632, "grad_norm": 1.3059227466583252, "learning_rate": 8.984034833091437e-05, "loss": 2.2112, "step": 3500 }, { "epoch": 1.0159651669085632, "eval_loss": 1.1044234037399292, "eval_runtime": 214.4346, "eval_samples_per_second": 212.158, "eval_steps_per_second": 3.316, "step": 3500 }, { "epoch": 1.0449927431059507, "grad_norm": 1.3193408250808716, "learning_rate": 8.95500725689405e-05, "loss": 2.186, "step": 3600 }, { "epoch": 1.0449927431059507, "eval_loss": 1.1042026281356812, "eval_runtime": 214.3105, "eval_samples_per_second": 212.281, "eval_steps_per_second": 3.318, "step": 3600 }, { "epoch": 1.0740203193033382, "grad_norm": 1.1710057258605957, "learning_rate": 8.925979680696662e-05, "loss": 2.1882, "step": 3700 }, { "epoch": 1.0740203193033382, "eval_loss": 1.0975611209869385, "eval_runtime": 214.3954, "eval_samples_per_second": 212.197, "eval_steps_per_second": 3.316, "step": 3700 }, { "epoch": 1.1030478955007257, "grad_norm": 1.1426420211791992, "learning_rate": 8.896952104499274e-05, "loss": 2.1697, "step": 3800 }, { "epoch": 1.1030478955007257, "eval_loss": 1.0976998805999756, "eval_runtime": 212.987, "eval_samples_per_second": 213.6, "eval_steps_per_second": 3.338, "step": 3800 }, { "epoch": 1.1320754716981132, "grad_norm": 1.1272858381271362, "learning_rate": 8.867924528301888e-05, "loss": 2.1836, "step": 3900 }, { "epoch": 1.1320754716981132, "eval_loss": 1.0982595682144165, "eval_runtime": 214.3281, "eval_samples_per_second": 212.263, "eval_steps_per_second": 3.317, "step": 3900 }, { "epoch": 1.1611030478955007, "grad_norm": 1.141606330871582, "learning_rate": 8.8388969521045e-05, "loss": 2.1668, "step": 4000 }, { "epoch": 1.1611030478955007, "eval_loss": 1.0947861671447754, "eval_runtime": 214.7163, "eval_samples_per_second": 211.88, "eval_steps_per_second": 3.311, "step": 4000 }, { "epoch": 1.1901306240928882, "grad_norm": 1.197513222694397, "learning_rate": 8.809869375907113e-05, "loss": 2.1537, "step": 4100 }, { "epoch": 1.1901306240928882, "eval_loss": 1.0914397239685059, "eval_runtime": 214.3796, "eval_samples_per_second": 212.212, "eval_steps_per_second": 3.317, "step": 4100 }, { "epoch": 1.2191582002902757, "grad_norm": 1.2622817754745483, "learning_rate": 8.780841799709725e-05, "loss": 2.1609, "step": 4200 }, { "epoch": 1.2191582002902757, "eval_loss": 1.0932444334030151, "eval_runtime": 214.4603, "eval_samples_per_second": 212.133, "eval_steps_per_second": 3.315, "step": 4200 }, { "epoch": 1.2481857764876634, "grad_norm": 1.1745682954788208, "learning_rate": 8.751814223512336e-05, "loss": 2.1448, "step": 4300 }, { "epoch": 1.2481857764876634, "eval_loss": 1.0895456075668335, "eval_runtime": 214.0416, "eval_samples_per_second": 212.548, "eval_steps_per_second": 3.322, "step": 4300 }, { "epoch": 1.2772133526850509, "grad_norm": 1.1918201446533203, "learning_rate": 8.722786647314949e-05, "loss": 2.1552, "step": 4400 }, { "epoch": 1.2772133526850509, "eval_loss": 1.089804768562317, "eval_runtime": 214.182, "eval_samples_per_second": 212.408, "eval_steps_per_second": 3.32, "step": 4400 }, { "epoch": 1.3062409288824384, "grad_norm": 1.2561489343643188, "learning_rate": 8.693759071117562e-05, "loss": 2.1421, "step": 4500 }, { "epoch": 1.3062409288824384, "eval_loss": 1.0928661823272705, "eval_runtime": 214.3073, "eval_samples_per_second": 212.284, "eval_steps_per_second": 3.318, "step": 4500 }, { "epoch": 1.3352685050798259, "grad_norm": 1.1966407299041748, "learning_rate": 8.664731494920174e-05, "loss": 2.1426, "step": 4600 }, { "epoch": 1.3352685050798259, "eval_loss": 1.0840479135513306, "eval_runtime": 214.538, "eval_samples_per_second": 212.056, "eval_steps_per_second": 3.314, "step": 4600 }, { "epoch": 1.3642960812772134, "grad_norm": 1.20412278175354, "learning_rate": 8.635703918722787e-05, "loss": 2.1256, "step": 4700 }, { "epoch": 1.3642960812772134, "eval_loss": 1.085578441619873, "eval_runtime": 214.0855, "eval_samples_per_second": 212.504, "eval_steps_per_second": 3.321, "step": 4700 }, { "epoch": 1.3933236574746009, "grad_norm": 1.1835148334503174, "learning_rate": 8.606676342525399e-05, "loss": 2.1398, "step": 4800 }, { "epoch": 1.3933236574746009, "eval_loss": 1.0841166973114014, "eval_runtime": 213.7271, "eval_samples_per_second": 212.86, "eval_steps_per_second": 3.327, "step": 4800 }, { "epoch": 1.4223512336719883, "grad_norm": 1.1613247394561768, "learning_rate": 8.577648766328012e-05, "loss": 2.1202, "step": 4900 }, { "epoch": 1.4223512336719883, "eval_loss": 1.085669994354248, "eval_runtime": 215.8008, "eval_samples_per_second": 210.815, "eval_steps_per_second": 3.295, "step": 4900 }, { "epoch": 1.4513788098693758, "grad_norm": 1.1468629837036133, "learning_rate": 8.548621190130625e-05, "loss": 2.1123, "step": 5000 }, { "epoch": 1.4513788098693758, "eval_loss": 1.0774667263031006, "eval_runtime": 214.0808, "eval_samples_per_second": 212.509, "eval_steps_per_second": 3.321, "step": 5000 }, { "epoch": 1.4804063860667633, "grad_norm": 1.2450999021530151, "learning_rate": 8.519593613933237e-05, "loss": 2.1104, "step": 5100 }, { "epoch": 1.4804063860667633, "eval_loss": 1.0789214372634888, "eval_runtime": 214.4581, "eval_samples_per_second": 212.135, "eval_steps_per_second": 3.315, "step": 5100 }, { "epoch": 1.509433962264151, "grad_norm": 1.1406731605529785, "learning_rate": 8.49056603773585e-05, "loss": 2.1076, "step": 5200 }, { "epoch": 1.509433962264151, "eval_loss": 1.0758976936340332, "eval_runtime": 214.3317, "eval_samples_per_second": 212.26, "eval_steps_per_second": 3.317, "step": 5200 }, { "epoch": 1.5384615384615383, "grad_norm": 1.2358899116516113, "learning_rate": 8.461538461538461e-05, "loss": 2.1149, "step": 5300 }, { "epoch": 1.5384615384615383, "eval_loss": 1.0744006633758545, "eval_runtime": 214.6001, "eval_samples_per_second": 211.994, "eval_steps_per_second": 3.313, "step": 5300 }, { "epoch": 1.567489114658926, "grad_norm": 1.3809137344360352, "learning_rate": 8.432510885341074e-05, "loss": 2.1052, "step": 5400 }, { "epoch": 1.567489114658926, "eval_loss": 1.0817296504974365, "eval_runtime": 214.5107, "eval_samples_per_second": 212.083, "eval_steps_per_second": 3.315, "step": 5400 }, { "epoch": 1.5965166908563135, "grad_norm": 1.1924511194229126, "learning_rate": 8.403483309143688e-05, "loss": 2.093, "step": 5500 }, { "epoch": 1.5965166908563135, "eval_loss": 1.072150707244873, "eval_runtime": 214.3047, "eval_samples_per_second": 212.286, "eval_steps_per_second": 3.318, "step": 5500 }, { "epoch": 1.625544267053701, "grad_norm": 1.1757687330245972, "learning_rate": 8.374455732946299e-05, "loss": 2.0911, "step": 5600 }, { "epoch": 1.625544267053701, "eval_loss": 1.0715994834899902, "eval_runtime": 214.1638, "eval_samples_per_second": 212.426, "eval_steps_per_second": 3.32, "step": 5600 }, { "epoch": 1.6545718432510885, "grad_norm": 1.263269066810608, "learning_rate": 8.345428156748912e-05, "loss": 2.0912, "step": 5700 }, { "epoch": 1.6545718432510885, "eval_loss": 1.0753726959228516, "eval_runtime": 214.272, "eval_samples_per_second": 212.319, "eval_steps_per_second": 3.318, "step": 5700 }, { "epoch": 1.683599419448476, "grad_norm": 1.1598068475723267, "learning_rate": 8.316400580551524e-05, "loss": 2.085, "step": 5800 }, { "epoch": 1.683599419448476, "eval_loss": 1.0767669677734375, "eval_runtime": 214.1942, "eval_samples_per_second": 212.396, "eval_steps_per_second": 3.319, "step": 5800 }, { "epoch": 1.7126269956458637, "grad_norm": 1.2049143314361572, "learning_rate": 8.287373004354137e-05, "loss": 2.091, "step": 5900 }, { "epoch": 1.7126269956458637, "eval_loss": 1.0710922479629517, "eval_runtime": 214.6737, "eval_samples_per_second": 211.922, "eval_steps_per_second": 3.312, "step": 5900 }, { "epoch": 1.741654571843251, "grad_norm": 1.2534894943237305, "learning_rate": 8.25834542815675e-05, "loss": 2.0744, "step": 6000 }, { "epoch": 1.741654571843251, "eval_loss": 1.07111656665802, "eval_runtime": 214.3274, "eval_samples_per_second": 212.264, "eval_steps_per_second": 3.317, "step": 6000 }, { "epoch": 1.7706821480406387, "grad_norm": 1.260311245918274, "learning_rate": 8.229317851959362e-05, "loss": 2.082, "step": 6100 }, { "epoch": 1.7706821480406387, "eval_loss": 1.0758848190307617, "eval_runtime": 214.3528, "eval_samples_per_second": 212.239, "eval_steps_per_second": 3.317, "step": 6100 }, { "epoch": 1.799709724238026, "grad_norm": 1.192262887954712, "learning_rate": 8.200290275761974e-05, "loss": 2.0647, "step": 6200 }, { "epoch": 1.799709724238026, "eval_loss": 1.0665150880813599, "eval_runtime": 214.1927, "eval_samples_per_second": 212.398, "eval_steps_per_second": 3.319, "step": 6200 }, { "epoch": 1.8287373004354137, "grad_norm": 1.2158530950546265, "learning_rate": 8.171262699564587e-05, "loss": 2.0524, "step": 6300 }, { "epoch": 1.8287373004354137, "eval_loss": 1.0663120746612549, "eval_runtime": 214.6232, "eval_samples_per_second": 211.971, "eval_steps_per_second": 3.313, "step": 6300 }, { "epoch": 1.8577648766328012, "grad_norm": 1.1896952390670776, "learning_rate": 8.142235123367198e-05, "loss": 2.0654, "step": 6400 }, { "epoch": 1.8577648766328012, "eval_loss": 1.064207911491394, "eval_runtime": 213.7354, "eval_samples_per_second": 212.852, "eval_steps_per_second": 3.327, "step": 6400 }, { "epoch": 1.8867924528301887, "grad_norm": 1.1889102458953857, "learning_rate": 8.113207547169813e-05, "loss": 2.0549, "step": 6500 }, { "epoch": 1.8867924528301887, "eval_loss": 1.0605015754699707, "eval_runtime": 214.211, "eval_samples_per_second": 212.379, "eval_steps_per_second": 3.319, "step": 6500 }, { "epoch": 1.9158200290275762, "grad_norm": 1.2628164291381836, "learning_rate": 8.084179970972424e-05, "loss": 2.056, "step": 6600 }, { "epoch": 1.9158200290275762, "eval_loss": 1.060520052909851, "eval_runtime": 214.2715, "eval_samples_per_second": 212.319, "eval_steps_per_second": 3.318, "step": 6600 }, { "epoch": 1.9448476052249637, "grad_norm": 1.203740119934082, "learning_rate": 8.055152394775036e-05, "loss": 2.0645, "step": 6700 }, { "epoch": 1.9448476052249637, "eval_loss": 1.0628570318222046, "eval_runtime": 213.9992, "eval_samples_per_second": 212.59, "eval_steps_per_second": 3.322, "step": 6700 }, { "epoch": 1.9738751814223512, "grad_norm": 1.228109359741211, "learning_rate": 8.026124818577649e-05, "loss": 2.0528, "step": 6800 }, { "epoch": 1.9738751814223512, "eval_loss": 1.0618752241134644, "eval_runtime": 214.3127, "eval_samples_per_second": 212.279, "eval_steps_per_second": 3.318, "step": 6800 }, { "epoch": 2.0029027576197387, "grad_norm": 1.2511652708053589, "learning_rate": 7.997097242380261e-05, "loss": 2.0643, "step": 6900 }, { "epoch": 2.0029027576197387, "eval_loss": 1.0580365657806396, "eval_runtime": 214.5925, "eval_samples_per_second": 212.002, "eval_steps_per_second": 3.313, "step": 6900 }, { "epoch": 2.0319303338171264, "grad_norm": 1.2169101238250732, "learning_rate": 7.968069666182875e-05, "loss": 2.0342, "step": 7000 }, { "epoch": 2.0319303338171264, "eval_loss": 1.0617866516113281, "eval_runtime": 214.3766, "eval_samples_per_second": 212.215, "eval_steps_per_second": 3.317, "step": 7000 }, { "epoch": 2.0609579100145137, "grad_norm": 1.1878671646118164, "learning_rate": 7.939042089985487e-05, "loss": 2.0391, "step": 7100 }, { "epoch": 2.0609579100145137, "eval_loss": 1.0561269521713257, "eval_runtime": 213.6515, "eval_samples_per_second": 212.936, "eval_steps_per_second": 3.328, "step": 7100 }, { "epoch": 2.0899854862119014, "grad_norm": 1.2561451196670532, "learning_rate": 7.910014513788099e-05, "loss": 2.0368, "step": 7200 }, { "epoch": 2.0899854862119014, "eval_loss": 1.0582093000411987, "eval_runtime": 214.1893, "eval_samples_per_second": 212.401, "eval_steps_per_second": 3.319, "step": 7200 }, { "epoch": 2.1190130624092887, "grad_norm": 1.3752440214157104, "learning_rate": 7.880986937590712e-05, "loss": 2.0223, "step": 7300 }, { "epoch": 2.1190130624092887, "eval_loss": 1.0552905797958374, "eval_runtime": 214.1025, "eval_samples_per_second": 212.487, "eval_steps_per_second": 3.321, "step": 7300 }, { "epoch": 2.1480406386066764, "grad_norm": 1.2082586288452148, "learning_rate": 7.851959361393323e-05, "loss": 2.0219, "step": 7400 }, { "epoch": 2.1480406386066764, "eval_loss": 1.056668996810913, "eval_runtime": 214.2573, "eval_samples_per_second": 212.333, "eval_steps_per_second": 3.318, "step": 7400 }, { "epoch": 2.1770682148040637, "grad_norm": 1.335627555847168, "learning_rate": 7.822931785195937e-05, "loss": 2.0191, "step": 7500 }, { "epoch": 2.1770682148040637, "eval_loss": 1.0617352724075317, "eval_runtime": 214.1612, "eval_samples_per_second": 212.429, "eval_steps_per_second": 3.32, "step": 7500 }, { "epoch": 2.2060957910014514, "grad_norm": 1.3789772987365723, "learning_rate": 7.79390420899855e-05, "loss": 2.0163, "step": 7600 }, { "epoch": 2.2060957910014514, "eval_loss": 1.0667177438735962, "eval_runtime": 214.4271, "eval_samples_per_second": 212.165, "eval_steps_per_second": 3.316, "step": 7600 }, { "epoch": 2.235123367198839, "grad_norm": 1.2630983591079712, "learning_rate": 7.764876632801161e-05, "loss": 2.0075, "step": 7700 }, { "epoch": 2.235123367198839, "eval_loss": 1.053751826286316, "eval_runtime": 214.4933, "eval_samples_per_second": 212.1, "eval_steps_per_second": 3.315, "step": 7700 }, { "epoch": 2.2641509433962264, "grad_norm": 1.3576209545135498, "learning_rate": 7.735849056603774e-05, "loss": 2.018, "step": 7800 }, { "epoch": 2.2641509433962264, "eval_loss": 1.0545238256454468, "eval_runtime": 214.4112, "eval_samples_per_second": 212.181, "eval_steps_per_second": 3.316, "step": 7800 }, { "epoch": 2.293178519593614, "grad_norm": 1.2727316617965698, "learning_rate": 7.706821480406386e-05, "loss": 2.0123, "step": 7900 }, { "epoch": 2.293178519593614, "eval_loss": 1.0540556907653809, "eval_runtime": 214.501, "eval_samples_per_second": 212.092, "eval_steps_per_second": 3.315, "step": 7900 }, { "epoch": 2.3222060957910013, "grad_norm": 1.2817336320877075, "learning_rate": 7.677793904208999e-05, "loss": 2.0129, "step": 8000 }, { "epoch": 2.3222060957910013, "eval_loss": 1.053106427192688, "eval_runtime": 214.7491, "eval_samples_per_second": 211.847, "eval_steps_per_second": 3.311, "step": 8000 }, { "epoch": 2.351233671988389, "grad_norm": 1.1629624366760254, "learning_rate": 7.648766328011612e-05, "loss": 1.9998, "step": 8100 }, { "epoch": 2.351233671988389, "eval_loss": 1.0525621175765991, "eval_runtime": 214.4605, "eval_samples_per_second": 212.132, "eval_steps_per_second": 3.315, "step": 8100 }, { "epoch": 2.3802612481857763, "grad_norm": 1.225195050239563, "learning_rate": 7.619738751814224e-05, "loss": 1.9998, "step": 8200 }, { "epoch": 2.3802612481857763, "eval_loss": 1.0501279830932617, "eval_runtime": 214.3635, "eval_samples_per_second": 212.228, "eval_steps_per_second": 3.317, "step": 8200 }, { "epoch": 2.409288824383164, "grad_norm": 1.167968988418579, "learning_rate": 7.590711175616836e-05, "loss": 2.0127, "step": 8300 }, { "epoch": 2.409288824383164, "eval_loss": 1.0495474338531494, "eval_runtime": 214.3385, "eval_samples_per_second": 212.253, "eval_steps_per_second": 3.317, "step": 8300 }, { "epoch": 2.4383164005805513, "grad_norm": 1.2802715301513672, "learning_rate": 7.561683599419449e-05, "loss": 2.0046, "step": 8400 }, { "epoch": 2.4383164005805513, "eval_loss": 1.0517114400863647, "eval_runtime": 213.9947, "eval_samples_per_second": 212.594, "eval_steps_per_second": 3.323, "step": 8400 }, { "epoch": 2.467343976777939, "grad_norm": 1.2801434993743896, "learning_rate": 7.532656023222062e-05, "loss": 1.9913, "step": 8500 }, { "epoch": 2.467343976777939, "eval_loss": 1.0506008863449097, "eval_runtime": 214.0451, "eval_samples_per_second": 212.544, "eval_steps_per_second": 3.322, "step": 8500 }, { "epoch": 2.4963715529753268, "grad_norm": 1.3369925022125244, "learning_rate": 7.503628447024675e-05, "loss": 1.9895, "step": 8600 }, { "epoch": 2.4963715529753268, "eval_loss": 1.0505975484848022, "eval_runtime": 214.2989, "eval_samples_per_second": 212.292, "eval_steps_per_second": 3.318, "step": 8600 }, { "epoch": 2.525399129172714, "grad_norm": 1.2676314115524292, "learning_rate": 7.474600870827286e-05, "loss": 1.9963, "step": 8700 }, { "epoch": 2.525399129172714, "eval_loss": 1.0470978021621704, "eval_runtime": 214.1975, "eval_samples_per_second": 212.393, "eval_steps_per_second": 3.319, "step": 8700 }, { "epoch": 2.5544267053701017, "grad_norm": 1.2529655694961548, "learning_rate": 7.445573294629898e-05, "loss": 1.9858, "step": 8800 }, { "epoch": 2.5544267053701017, "eval_loss": 1.045462965965271, "eval_runtime": 213.8996, "eval_samples_per_second": 212.689, "eval_steps_per_second": 3.324, "step": 8800 }, { "epoch": 2.583454281567489, "grad_norm": 1.227094054222107, "learning_rate": 7.416545718432511e-05, "loss": 1.9877, "step": 8900 }, { "epoch": 2.583454281567489, "eval_loss": 1.0446746349334717, "eval_runtime": 214.2422, "eval_samples_per_second": 212.348, "eval_steps_per_second": 3.319, "step": 8900 }, { "epoch": 2.6124818577648767, "grad_norm": 1.22869074344635, "learning_rate": 7.387518142235124e-05, "loss": 1.9914, "step": 9000 }, { "epoch": 2.6124818577648767, "eval_loss": 1.045985460281372, "eval_runtime": 213.626, "eval_samples_per_second": 212.961, "eval_steps_per_second": 3.328, "step": 9000 }, { "epoch": 2.641509433962264, "grad_norm": 1.3192973136901855, "learning_rate": 7.358490566037736e-05, "loss": 1.9686, "step": 9100 }, { "epoch": 2.641509433962264, "eval_loss": 1.0464129447937012, "eval_runtime": 214.0751, "eval_samples_per_second": 212.514, "eval_steps_per_second": 3.321, "step": 9100 }, { "epoch": 2.6705370101596517, "grad_norm": 1.3081276416778564, "learning_rate": 7.329462989840349e-05, "loss": 1.9731, "step": 9200 }, { "epoch": 2.6705370101596517, "eval_loss": 1.047652006149292, "eval_runtime": 214.1138, "eval_samples_per_second": 212.476, "eval_steps_per_second": 3.321, "step": 9200 }, { "epoch": 2.699564586357039, "grad_norm": 1.309837818145752, "learning_rate": 7.300435413642961e-05, "loss": 1.9722, "step": 9300 }, { "epoch": 2.699564586357039, "eval_loss": 1.0429437160491943, "eval_runtime": 213.1703, "eval_samples_per_second": 213.416, "eval_steps_per_second": 3.335, "step": 9300 }, { "epoch": 2.7285921625544267, "grad_norm": 1.3633908033370972, "learning_rate": 7.271407837445574e-05, "loss": 1.9837, "step": 9400 }, { "epoch": 2.7285921625544267, "eval_loss": 1.041870355606079, "eval_runtime": 214.2854, "eval_samples_per_second": 212.306, "eval_steps_per_second": 3.318, "step": 9400 }, { "epoch": 2.7576197387518144, "grad_norm": 1.195707082748413, "learning_rate": 7.242380261248185e-05, "loss": 1.9657, "step": 9500 }, { "epoch": 2.7576197387518144, "eval_loss": 1.0397106409072876, "eval_runtime": 214.0497, "eval_samples_per_second": 212.539, "eval_steps_per_second": 3.322, "step": 9500 }, { "epoch": 2.7866473149492017, "grad_norm": 1.2074401378631592, "learning_rate": 7.213352685050799e-05, "loss": 1.9782, "step": 9600 }, { "epoch": 2.7866473149492017, "eval_loss": 1.0388689041137695, "eval_runtime": 213.9256, "eval_samples_per_second": 212.663, "eval_steps_per_second": 3.324, "step": 9600 }, { "epoch": 2.8156748911465894, "grad_norm": 1.42034113407135, "learning_rate": 7.184325108853412e-05, "loss": 1.9678, "step": 9700 }, { "epoch": 2.8156748911465894, "eval_loss": 1.0477054119110107, "eval_runtime": 214.1129, "eval_samples_per_second": 212.477, "eval_steps_per_second": 3.321, "step": 9700 }, { "epoch": 2.8447024673439767, "grad_norm": 1.2497634887695312, "learning_rate": 7.155297532656023e-05, "loss": 1.9499, "step": 9800 }, { "epoch": 2.8447024673439767, "eval_loss": 1.0382879972457886, "eval_runtime": 214.4692, "eval_samples_per_second": 212.124, "eval_steps_per_second": 3.315, "step": 9800 }, { "epoch": 2.8737300435413644, "grad_norm": 1.2587764263153076, "learning_rate": 7.126269956458636e-05, "loss": 1.9596, "step": 9900 }, { "epoch": 2.8737300435413644, "eval_loss": 1.0374723672866821, "eval_runtime": 214.3582, "eval_samples_per_second": 212.234, "eval_steps_per_second": 3.317, "step": 9900 }, { "epoch": 2.9027576197387517, "grad_norm": 1.2650773525238037, "learning_rate": 7.097242380261248e-05, "loss": 1.9632, "step": 10000 }, { "epoch": 2.9027576197387517, "eval_loss": 1.0395891666412354, "eval_runtime": 214.2184, "eval_samples_per_second": 212.372, "eval_steps_per_second": 3.319, "step": 10000 }, { "epoch": 2.9317851959361394, "grad_norm": 1.237382411956787, "learning_rate": 7.068214804063861e-05, "loss": 1.9448, "step": 10100 }, { "epoch": 2.9317851959361394, "eval_loss": 1.0347273349761963, "eval_runtime": 214.6802, "eval_samples_per_second": 211.915, "eval_steps_per_second": 3.312, "step": 10100 }, { "epoch": 2.9608127721335267, "grad_norm": 1.2535216808319092, "learning_rate": 7.039187227866474e-05, "loss": 1.9633, "step": 10200 }, { "epoch": 2.9608127721335267, "eval_loss": 1.0382635593414307, "eval_runtime": 214.2729, "eval_samples_per_second": 212.318, "eval_steps_per_second": 3.318, "step": 10200 }, { "epoch": 2.9898403483309144, "grad_norm": 1.2122920751571655, "learning_rate": 7.010159651669086e-05, "loss": 1.9531, "step": 10300 }, { "epoch": 2.9898403483309144, "eval_loss": 1.0362297296524048, "eval_runtime": 214.3174, "eval_samples_per_second": 212.274, "eval_steps_per_second": 3.318, "step": 10300 }, { "epoch": 3.018867924528302, "grad_norm": 1.207924723625183, "learning_rate": 6.981132075471698e-05, "loss": 1.9597, "step": 10400 }, { "epoch": 3.018867924528302, "eval_loss": 1.0346544981002808, "eval_runtime": 214.0838, "eval_samples_per_second": 212.506, "eval_steps_per_second": 3.321, "step": 10400 }, { "epoch": 3.0478955007256894, "grad_norm": 1.3156700134277344, "learning_rate": 6.95210449927431e-05, "loss": 1.9284, "step": 10500 }, { "epoch": 3.0478955007256894, "eval_loss": 1.0392136573791504, "eval_runtime": 214.2728, "eval_samples_per_second": 212.318, "eval_steps_per_second": 3.318, "step": 10500 }, { "epoch": 3.076923076923077, "grad_norm": 1.2844287157058716, "learning_rate": 6.923076923076924e-05, "loss": 1.9524, "step": 10600 }, { "epoch": 3.076923076923077, "eval_loss": 1.0422698259353638, "eval_runtime": 214.2459, "eval_samples_per_second": 212.345, "eval_steps_per_second": 3.319, "step": 10600 }, { "epoch": 3.1059506531204644, "grad_norm": 1.3154046535491943, "learning_rate": 6.894049346879537e-05, "loss": 1.9321, "step": 10700 }, { "epoch": 3.1059506531204644, "eval_loss": 1.0372092723846436, "eval_runtime": 214.4246, "eval_samples_per_second": 212.168, "eval_steps_per_second": 3.316, "step": 10700 }, { "epoch": 3.134978229317852, "grad_norm": 1.30637788772583, "learning_rate": 6.865021770682148e-05, "loss": 1.9414, "step": 10800 }, { "epoch": 3.134978229317852, "eval_loss": 1.0316834449768066, "eval_runtime": 214.2895, "eval_samples_per_second": 212.302, "eval_steps_per_second": 3.318, "step": 10800 }, { "epoch": 3.1640058055152394, "grad_norm": 1.375622272491455, "learning_rate": 6.83599419448476e-05, "loss": 1.9255, "step": 10900 }, { "epoch": 3.1640058055152394, "eval_loss": 1.0339484214782715, "eval_runtime": 214.0141, "eval_samples_per_second": 212.575, "eval_steps_per_second": 3.322, "step": 10900 }, { "epoch": 3.193033381712627, "grad_norm": 1.2978899478912354, "learning_rate": 6.806966618287373e-05, "loss": 1.9384, "step": 11000 }, { "epoch": 3.193033381712627, "eval_loss": 1.033180832862854, "eval_runtime": 214.382, "eval_samples_per_second": 212.21, "eval_steps_per_second": 3.317, "step": 11000 }, { "epoch": 3.2220609579100143, "grad_norm": 1.233608603477478, "learning_rate": 6.777939042089986e-05, "loss": 1.9297, "step": 11100 }, { "epoch": 3.2220609579100143, "eval_loss": 1.0305285453796387, "eval_runtime": 214.0802, "eval_samples_per_second": 212.509, "eval_steps_per_second": 3.321, "step": 11100 }, { "epoch": 3.251088534107402, "grad_norm": 1.2634618282318115, "learning_rate": 6.748911465892598e-05, "loss": 1.9315, "step": 11200 }, { "epoch": 3.251088534107402, "eval_loss": 1.0329853296279907, "eval_runtime": 214.8185, "eval_samples_per_second": 211.779, "eval_steps_per_second": 3.31, "step": 11200 }, { "epoch": 3.28011611030479, "grad_norm": 1.3260959386825562, "learning_rate": 6.719883889695211e-05, "loss": 1.9331, "step": 11300 }, { "epoch": 3.28011611030479, "eval_loss": 1.0363577604293823, "eval_runtime": 214.1897, "eval_samples_per_second": 212.401, "eval_steps_per_second": 3.319, "step": 11300 }, { "epoch": 3.309143686502177, "grad_norm": 1.330241322517395, "learning_rate": 6.690856313497823e-05, "loss": 1.9355, "step": 11400 }, { "epoch": 3.309143686502177, "eval_loss": 1.0366979837417603, "eval_runtime": 214.2627, "eval_samples_per_second": 212.328, "eval_steps_per_second": 3.318, "step": 11400 }, { "epoch": 3.3381712626995648, "grad_norm": 1.3124949932098389, "learning_rate": 6.661828737300436e-05, "loss": 1.9141, "step": 11500 }, { "epoch": 3.3381712626995648, "eval_loss": 1.0314677953720093, "eval_runtime": 214.3893, "eval_samples_per_second": 212.203, "eval_steps_per_second": 3.316, "step": 11500 }, { "epoch": 3.367198838896952, "grad_norm": 1.2886366844177246, "learning_rate": 6.632801161103049e-05, "loss": 1.918, "step": 11600 }, { "epoch": 3.367198838896952, "eval_loss": 1.029552698135376, "eval_runtime": 214.4197, "eval_samples_per_second": 212.173, "eval_steps_per_second": 3.316, "step": 11600 }, { "epoch": 3.3962264150943398, "grad_norm": 1.4406765699386597, "learning_rate": 6.60377358490566e-05, "loss": 1.9192, "step": 11700 }, { "epoch": 3.3962264150943398, "eval_loss": 1.0297138690948486, "eval_runtime": 214.4728, "eval_samples_per_second": 212.12, "eval_steps_per_second": 3.315, "step": 11700 }, { "epoch": 3.425253991291727, "grad_norm": 1.3517920970916748, "learning_rate": 6.574746008708274e-05, "loss": 1.9146, "step": 11800 }, { "epoch": 3.425253991291727, "eval_loss": 1.0310994386672974, "eval_runtime": 214.5121, "eval_samples_per_second": 212.081, "eval_steps_per_second": 3.314, "step": 11800 }, { "epoch": 3.4542815674891147, "grad_norm": 1.31048583984375, "learning_rate": 6.545718432510885e-05, "loss": 1.9235, "step": 11900 }, { "epoch": 3.4542815674891147, "eval_loss": 1.029317021369934, "eval_runtime": 214.205, "eval_samples_per_second": 212.385, "eval_steps_per_second": 3.319, "step": 11900 }, { "epoch": 3.483309143686502, "grad_norm": 1.2714518308639526, "learning_rate": 6.516690856313497e-05, "loss": 1.9161, "step": 12000 }, { "epoch": 3.483309143686502, "eval_loss": 1.0265744924545288, "eval_runtime": 214.3435, "eval_samples_per_second": 212.248, "eval_steps_per_second": 3.317, "step": 12000 }, { "epoch": 3.5123367198838897, "grad_norm": 1.274511456489563, "learning_rate": 6.487663280116111e-05, "loss": 1.9295, "step": 12100 }, { "epoch": 3.5123367198838897, "eval_loss": 1.026885747909546, "eval_runtime": 214.622, "eval_samples_per_second": 211.973, "eval_steps_per_second": 3.313, "step": 12100 }, { "epoch": 3.5413642960812775, "grad_norm": 1.4020469188690186, "learning_rate": 6.458635703918723e-05, "loss": 1.9214, "step": 12200 }, { "epoch": 3.5413642960812775, "eval_loss": 1.0313502550125122, "eval_runtime": 214.5336, "eval_samples_per_second": 212.06, "eval_steps_per_second": 3.314, "step": 12200 }, { "epoch": 3.5703918722786647, "grad_norm": 1.329451322555542, "learning_rate": 6.429608127721336e-05, "loss": 1.8986, "step": 12300 }, { "epoch": 3.5703918722786647, "eval_loss": 1.027103304862976, "eval_runtime": 215.1871, "eval_samples_per_second": 211.416, "eval_steps_per_second": 3.304, "step": 12300 }, { "epoch": 3.599419448476052, "grad_norm": 1.2377736568450928, "learning_rate": 6.400580551523948e-05, "loss": 1.8982, "step": 12400 }, { "epoch": 3.599419448476052, "eval_loss": 1.0257542133331299, "eval_runtime": 214.3661, "eval_samples_per_second": 212.226, "eval_steps_per_second": 3.317, "step": 12400 }, { "epoch": 3.6284470246734397, "grad_norm": 1.2443993091583252, "learning_rate": 6.37155297532656e-05, "loss": 1.909, "step": 12500 }, { "epoch": 3.6284470246734397, "eval_loss": 1.0211207866668701, "eval_runtime": 214.1724, "eval_samples_per_second": 212.418, "eval_steps_per_second": 3.32, "step": 12500 }, { "epoch": 3.6574746008708274, "grad_norm": 1.3550719022750854, "learning_rate": 6.342525399129173e-05, "loss": 1.8973, "step": 12600 }, { "epoch": 3.6574746008708274, "eval_loss": 1.0253050327301025, "eval_runtime": 214.3603, "eval_samples_per_second": 212.231, "eval_steps_per_second": 3.317, "step": 12600 }, { "epoch": 3.6865021770682147, "grad_norm": 1.2715822458267212, "learning_rate": 6.313497822931786e-05, "loss": 1.8928, "step": 12700 }, { "epoch": 3.6865021770682147, "eval_loss": 1.0215857028961182, "eval_runtime": 214.2543, "eval_samples_per_second": 212.336, "eval_steps_per_second": 3.318, "step": 12700 }, { "epoch": 3.7155297532656024, "grad_norm": 1.230591893196106, "learning_rate": 6.284470246734397e-05, "loss": 1.8998, "step": 12800 }, { "epoch": 3.7155297532656024, "eval_loss": 1.0226044654846191, "eval_runtime": 214.8109, "eval_samples_per_second": 211.786, "eval_steps_per_second": 3.31, "step": 12800 }, { "epoch": 3.7445573294629897, "grad_norm": 1.2558367252349854, "learning_rate": 6.25544267053701e-05, "loss": 1.9083, "step": 12900 }, { "epoch": 3.7445573294629897, "eval_loss": 1.0279453992843628, "eval_runtime": 214.5403, "eval_samples_per_second": 212.053, "eval_steps_per_second": 3.314, "step": 12900 }, { "epoch": 3.7735849056603774, "grad_norm": 1.3605984449386597, "learning_rate": 6.226415094339622e-05, "loss": 1.8947, "step": 13000 }, { "epoch": 3.7735849056603774, "eval_loss": 1.0254005193710327, "eval_runtime": 214.5443, "eval_samples_per_second": 212.049, "eval_steps_per_second": 3.314, "step": 13000 } ], "logging_steps": 100, "max_steps": 34450, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 5 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3003632111570125e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }