{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.997792494481236, "eval_steps": 50, "global_step": 2037, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014716703458425313, "grad_norm": 3.226644655877531, "learning_rate": 4.901960784313725e-07, "loss": 0.4182, "step": 10 }, { "epoch": 0.029433406916850625, "grad_norm": 2.129692195859408, "learning_rate": 9.80392156862745e-07, "loss": 0.389, "step": 20 }, { "epoch": 0.04415011037527594, "grad_norm": 1.5861033073146842, "learning_rate": 1.4705882352941177e-06, "loss": 0.2876, "step": 30 }, { "epoch": 0.05886681383370125, "grad_norm": 0.9864226661653924, "learning_rate": 1.96078431372549e-06, "loss": 0.1933, "step": 40 }, { "epoch": 0.07358351729212656, "grad_norm": 0.8851816239940652, "learning_rate": 2.450980392156863e-06, "loss": 0.166, "step": 50 }, { "epoch": 0.07358351729212656, "eval_loss": 0.1525491625070572, "eval_runtime": 216.1194, "eval_samples_per_second": 5.59, "eval_steps_per_second": 0.699, "step": 50 }, { "epoch": 0.08830022075055188, "grad_norm": 0.8806004863473016, "learning_rate": 2.9411764705882355e-06, "loss": 0.1491, "step": 60 }, { "epoch": 0.10301692420897719, "grad_norm": 1.0095005155732772, "learning_rate": 3.431372549019608e-06, "loss": 0.1444, "step": 70 }, { "epoch": 0.1177336276674025, "grad_norm": 0.8222552861447616, "learning_rate": 3.92156862745098e-06, "loss": 0.1325, "step": 80 }, { "epoch": 0.13245033112582782, "grad_norm": 0.8482175166475515, "learning_rate": 4.411764705882353e-06, "loss": 0.1249, "step": 90 }, { "epoch": 0.14716703458425312, "grad_norm": 0.8916707135250133, "learning_rate": 4.901960784313726e-06, "loss": 0.1267, "step": 100 }, { "epoch": 0.14716703458425312, "eval_loss": 0.12268291413784027, "eval_runtime": 206.0326, "eval_samples_per_second": 5.863, "eval_steps_per_second": 0.733, "step": 100 }, { "epoch": 0.16188373804267844, "grad_norm": 0.8391480398866726, "learning_rate": 5.392156862745098e-06, "loss": 0.1261, "step": 110 }, { "epoch": 0.17660044150110377, "grad_norm": 0.8543855316305797, "learning_rate": 5.882352941176471e-06, "loss": 0.1239, "step": 120 }, { "epoch": 0.19131714495952906, "grad_norm": 1.0426618599860231, "learning_rate": 6.372549019607843e-06, "loss": 0.1249, "step": 130 }, { "epoch": 0.20603384841795438, "grad_norm": 0.7381326766253737, "learning_rate": 6.862745098039216e-06, "loss": 0.1161, "step": 140 }, { "epoch": 0.22075055187637968, "grad_norm": 0.7710809135546592, "learning_rate": 7.352941176470589e-06, "loss": 0.1171, "step": 150 }, { "epoch": 0.22075055187637968, "eval_loss": 0.11395128816366196, "eval_runtime": 173.8286, "eval_samples_per_second": 6.949, "eval_steps_per_second": 0.869, "step": 150 }, { "epoch": 0.235467255334805, "grad_norm": 0.6891100266664143, "learning_rate": 7.84313725490196e-06, "loss": 0.1156, "step": 160 }, { "epoch": 0.2501839587932303, "grad_norm": 0.8566371646933698, "learning_rate": 8.333333333333334e-06, "loss": 0.1123, "step": 170 }, { "epoch": 0.26490066225165565, "grad_norm": 0.678987575471473, "learning_rate": 8.823529411764707e-06, "loss": 0.114, "step": 180 }, { "epoch": 0.27961736571008095, "grad_norm": 0.7177541472393981, "learning_rate": 9.31372549019608e-06, "loss": 0.1144, "step": 190 }, { "epoch": 0.29433406916850624, "grad_norm": 0.6069002401700933, "learning_rate": 9.803921568627451e-06, "loss": 0.1117, "step": 200 }, { "epoch": 0.29433406916850624, "eval_loss": 0.1121131181716919, "eval_runtime": 204.0232, "eval_samples_per_second": 5.921, "eval_steps_per_second": 0.74, "step": 200 }, { "epoch": 0.3090507726269316, "grad_norm": 0.6658587363100609, "learning_rate": 9.999735629192408e-06, "loss": 0.1207, "step": 210 }, { "epoch": 0.3237674760853569, "grad_norm": 0.6216355033039211, "learning_rate": 9.99812013105419e-06, "loss": 0.1099, "step": 220 }, { "epoch": 0.3384841795437822, "grad_norm": 0.6191227561051886, "learning_rate": 9.995036481411005e-06, "loss": 0.1099, "step": 230 }, { "epoch": 0.35320088300220753, "grad_norm": 1.8888088694270877, "learning_rate": 9.990485586056381e-06, "loss": 0.1091, "step": 240 }, { "epoch": 0.36791758646063283, "grad_norm": 0.6030722360970995, "learning_rate": 9.984468781773688e-06, "loss": 0.1089, "step": 250 }, { "epoch": 0.36791758646063283, "eval_loss": 0.10794272273778915, "eval_runtime": 192.1395, "eval_samples_per_second": 6.287, "eval_steps_per_second": 0.786, "step": 250 }, { "epoch": 0.3826342899190581, "grad_norm": 0.595437994630761, "learning_rate": 9.976987835943465e-06, "loss": 0.1059, "step": 260 }, { "epoch": 0.3973509933774834, "grad_norm": 0.6619589566630248, "learning_rate": 9.968044946024277e-06, "loss": 0.113, "step": 270 }, { "epoch": 0.41206769683590877, "grad_norm": 0.5476231049438186, "learning_rate": 9.957642738907226e-06, "loss": 0.1143, "step": 280 }, { "epoch": 0.42678440029433407, "grad_norm": 0.5802953355038116, "learning_rate": 9.945784270144321e-06, "loss": 0.11, "step": 290 }, { "epoch": 0.44150110375275936, "grad_norm": 0.5847953307046128, "learning_rate": 9.932473023050954e-06, "loss": 0.1048, "step": 300 }, { "epoch": 0.44150110375275936, "eval_loss": 0.10326112061738968, "eval_runtime": 179.9326, "eval_samples_per_second": 6.714, "eval_steps_per_second": 0.839, "step": 300 }, { "epoch": 0.4562178072111847, "grad_norm": 0.6275753190574224, "learning_rate": 9.917712907682694e-06, "loss": 0.1013, "step": 310 }, { "epoch": 0.47093451066961, "grad_norm": 0.6431980899061217, "learning_rate": 9.901508259686746e-06, "loss": 0.1017, "step": 320 }, { "epoch": 0.4856512141280353, "grad_norm": 0.5721037703631747, "learning_rate": 9.883863839028402e-06, "loss": 0.1099, "step": 330 }, { "epoch": 0.5003679175864606, "grad_norm": 0.5487439214439007, "learning_rate": 9.864784828592842e-06, "loss": 0.0969, "step": 340 }, { "epoch": 0.515084621044886, "grad_norm": 0.5817005922601163, "learning_rate": 9.844276832662704e-06, "loss": 0.0976, "step": 350 }, { "epoch": 0.515084621044886, "eval_loss": 0.09972475469112396, "eval_runtime": 193.3575, "eval_samples_per_second": 6.247, "eval_steps_per_second": 0.781, "step": 350 }, { "epoch": 0.5298013245033113, "grad_norm": 0.7409277848217514, "learning_rate": 9.822345875271884e-06, "loss": 0.1053, "step": 360 }, { "epoch": 0.5445180279617365, "grad_norm": 0.6141304848014978, "learning_rate": 9.798998398436031e-06, "loss": 0.1028, "step": 370 }, { "epoch": 0.5592347314201619, "grad_norm": 0.5607954946605025, "learning_rate": 9.774241260260266e-06, "loss": 0.1033, "step": 380 }, { "epoch": 0.5739514348785872, "grad_norm": 0.5542868417397482, "learning_rate": 9.74808173292467e-06, "loss": 0.1037, "step": 390 }, { "epoch": 0.5886681383370125, "grad_norm": 0.6069603969724401, "learning_rate": 9.720527500548155e-06, "loss": 0.0972, "step": 400 }, { "epoch": 0.5886681383370125, "eval_loss": 0.09853184223175049, "eval_runtime": 199.2969, "eval_samples_per_second": 6.061, "eval_steps_per_second": 0.758, "step": 400 }, { "epoch": 0.6033848417954378, "grad_norm": 0.5133322169553051, "learning_rate": 9.691586656931326e-06, "loss": 0.1024, "step": 410 }, { "epoch": 0.6181015452538632, "grad_norm": 0.5357388008964457, "learning_rate": 9.661267703178999e-06, "loss": 0.1033, "step": 420 }, { "epoch": 0.6328182487122884, "grad_norm": 0.5215856861597291, "learning_rate": 9.629579545203076e-06, "loss": 0.0994, "step": 430 }, { "epoch": 0.6475349521707138, "grad_norm": 0.5969857087876467, "learning_rate": 9.596531491106528e-06, "loss": 0.1019, "step": 440 }, { "epoch": 0.6622516556291391, "grad_norm": 0.5231050382062306, "learning_rate": 9.56213324844921e-06, "loss": 0.0968, "step": 450 }, { "epoch": 0.6622516556291391, "eval_loss": 0.09699959307909012, "eval_runtime": 190.5546, "eval_samples_per_second": 6.339, "eval_steps_per_second": 0.792, "step": 450 }, { "epoch": 0.6769683590875644, "grad_norm": 0.5546434863895826, "learning_rate": 9.526394921396373e-06, "loss": 0.1026, "step": 460 }, { "epoch": 0.6916850625459897, "grad_norm": 0.5168914632751676, "learning_rate": 9.489327007750644e-06, "loss": 0.1012, "step": 470 }, { "epoch": 0.7064017660044151, "grad_norm": 0.5152122638926383, "learning_rate": 9.450940395868397e-06, "loss": 0.1013, "step": 480 }, { "epoch": 0.7211184694628403, "grad_norm": 0.5104516201207467, "learning_rate": 9.41124636146141e-06, "loss": 0.0945, "step": 490 }, { "epoch": 0.7358351729212657, "grad_norm": 0.5202984287068582, "learning_rate": 9.370256564284713e-06, "loss": 0.0967, "step": 500 }, { "epoch": 0.7358351729212657, "eval_loss": 0.09431542456150055, "eval_runtime": 184.4167, "eval_samples_per_second": 6.55, "eval_steps_per_second": 0.819, "step": 500 }, { "epoch": 0.7505518763796909, "grad_norm": 0.5032141555673829, "learning_rate": 9.327983044711655e-06, "loss": 0.0935, "step": 510 }, { "epoch": 0.7652685798381162, "grad_norm": 0.5167615309062046, "learning_rate": 9.28443822019715e-06, "loss": 0.0981, "step": 520 }, { "epoch": 0.7799852832965416, "grad_norm": 0.48598303739277543, "learning_rate": 9.239634881630162e-06, "loss": 0.0897, "step": 530 }, { "epoch": 0.7947019867549668, "grad_norm": 0.53876383666863, "learning_rate": 9.19358618957651e-06, "loss": 0.0986, "step": 540 }, { "epoch": 0.8094186902133922, "grad_norm": 0.49526243406348325, "learning_rate": 9.146305670413069e-06, "loss": 0.0879, "step": 550 }, { "epoch": 0.8094186902133922, "eval_loss": 0.09373725950717926, "eval_runtime": 203.9941, "eval_samples_per_second": 5.922, "eval_steps_per_second": 0.74, "step": 550 }, { "epoch": 0.8241353936718175, "grad_norm": 0.5723604640533689, "learning_rate": 9.097807212354513e-06, "loss": 0.0915, "step": 560 }, { "epoch": 0.8388520971302428, "grad_norm": 0.567543105501399, "learning_rate": 9.048105061373793e-06, "loss": 0.0947, "step": 570 }, { "epoch": 0.8535688005886681, "grad_norm": 0.5181347389812981, "learning_rate": 8.997213817017508e-06, "loss": 0.095, "step": 580 }, { "epoch": 0.8682855040470935, "grad_norm": 0.4258862103531478, "learning_rate": 8.945148428117423e-06, "loss": 0.0917, "step": 590 }, { "epoch": 0.8830022075055187, "grad_norm": 0.5739504951081847, "learning_rate": 8.891924188399395e-06, "loss": 0.1014, "step": 600 }, { "epoch": 0.8830022075055187, "eval_loss": 0.09279368817806244, "eval_runtime": 174.7309, "eval_samples_per_second": 6.913, "eval_steps_per_second": 0.864, "step": 600 }, { "epoch": 0.8977189109639441, "grad_norm": 0.5023401278687947, "learning_rate": 8.837556731990973e-06, "loss": 0.0977, "step": 610 }, { "epoch": 0.9124356144223694, "grad_norm": 0.4472157776860558, "learning_rate": 8.782062028829028e-06, "loss": 0.0944, "step": 620 }, { "epoch": 0.9271523178807947, "grad_norm": 0.5229751477277164, "learning_rate": 8.725456379968717e-06, "loss": 0.0894, "step": 630 }, { "epoch": 0.94186902133922, "grad_norm": 0.540335952099867, "learning_rate": 8.667756412795217e-06, "loss": 0.0914, "step": 640 }, { "epoch": 0.9565857247976454, "grad_norm": 0.5214096611567617, "learning_rate": 8.608979076139572e-06, "loss": 0.1026, "step": 650 }, { "epoch": 0.9565857247976454, "eval_loss": 0.09049851447343826, "eval_runtime": 191.9453, "eval_samples_per_second": 6.293, "eval_steps_per_second": 0.787, "step": 650 }, { "epoch": 0.9713024282560706, "grad_norm": 0.47553610942736374, "learning_rate": 8.549141635300135e-06, "loss": 0.0906, "step": 660 }, { "epoch": 0.986019131714496, "grad_norm": 0.5432074308037707, "learning_rate": 8.488261666971047e-06, "loss": 0.0854, "step": 670 }, { "epoch": 1.0007358351729212, "grad_norm": 0.5579816589630594, "learning_rate": 8.426357054079244e-06, "loss": 0.0923, "step": 680 }, { "epoch": 1.0154525386313467, "grad_norm": 0.5140159523753607, "learning_rate": 8.363445980531515e-06, "loss": 0.0683, "step": 690 }, { "epoch": 1.030169242089772, "grad_norm": 0.49111266471989273, "learning_rate": 8.299546925873148e-06, "loss": 0.0635, "step": 700 }, { "epoch": 1.030169242089772, "eval_loss": 0.09157832711935043, "eval_runtime": 185.5584, "eval_samples_per_second": 6.51, "eval_steps_per_second": 0.814, "step": 700 }, { "epoch": 1.0448859455481971, "grad_norm": 0.4650423339954392, "learning_rate": 8.234678659859729e-06, "loss": 0.0667, "step": 710 }, { "epoch": 1.0596026490066226, "grad_norm": 0.5350038624215137, "learning_rate": 8.168860236943709e-06, "loss": 0.0692, "step": 720 }, { "epoch": 1.0743193524650478, "grad_norm": 0.4137475767583062, "learning_rate": 8.102110990677328e-06, "loss": 0.0723, "step": 730 }, { "epoch": 1.089036055923473, "grad_norm": 0.42028700866957225, "learning_rate": 8.034450528033565e-06, "loss": 0.066, "step": 740 }, { "epoch": 1.1037527593818985, "grad_norm": 0.5321405562977654, "learning_rate": 7.965898723646777e-06, "loss": 0.0703, "step": 750 }, { "epoch": 1.1037527593818985, "eval_loss": 0.08948411047458649, "eval_runtime": 184.8668, "eval_samples_per_second": 6.534, "eval_steps_per_second": 0.817, "step": 750 }, { "epoch": 1.1184694628403238, "grad_norm": 0.4973824096134147, "learning_rate": 7.896475713974696e-06, "loss": 0.0667, "step": 760 }, { "epoch": 1.133186166298749, "grad_norm": 0.5184687953265169, "learning_rate": 7.826201891383542e-06, "loss": 0.0721, "step": 770 }, { "epoch": 1.1479028697571745, "grad_norm": 0.4182786077759931, "learning_rate": 7.755097898157957e-06, "loss": 0.0652, "step": 780 }, { "epoch": 1.1626195732155997, "grad_norm": 0.5162298391916976, "learning_rate": 7.683184620437511e-06, "loss": 0.0715, "step": 790 }, { "epoch": 1.177336276674025, "grad_norm": 0.41958696094652936, "learning_rate": 7.610483182081607e-06, "loss": 0.0699, "step": 800 }, { "epoch": 1.177336276674025, "eval_loss": 0.08885398507118225, "eval_runtime": 198.9152, "eval_samples_per_second": 6.073, "eval_steps_per_second": 0.759, "step": 800 }, { "epoch": 1.1920529801324504, "grad_norm": 0.4131639402362476, "learning_rate": 7.537014938464529e-06, "loss": 0.0679, "step": 810 }, { "epoch": 1.2067696835908757, "grad_norm": 0.48371552023497083, "learning_rate": 7.462801470202513e-06, "loss": 0.0724, "step": 820 }, { "epoch": 1.221486387049301, "grad_norm": 0.5028126635648151, "learning_rate": 7.387864576814628e-06, "loss": 0.065, "step": 830 }, { "epoch": 1.2362030905077264, "grad_norm": 0.46008897965297035, "learning_rate": 7.31222627031938e-06, "loss": 0.0672, "step": 840 }, { "epoch": 1.2509197939661516, "grad_norm": 0.3995351586970657, "learning_rate": 7.235908768768875e-06, "loss": 0.0655, "step": 850 }, { "epoch": 1.2509197939661516, "eval_loss": 0.0898497924208641, "eval_runtime": 190.9254, "eval_samples_per_second": 6.327, "eval_steps_per_second": 0.791, "step": 850 }, { "epoch": 1.2656364974245768, "grad_norm": 0.37529528925372135, "learning_rate": 7.1589344897224795e-06, "loss": 0.0696, "step": 860 }, { "epoch": 1.280353200883002, "grad_norm": 0.5211153879452506, "learning_rate": 7.081326043661867e-06, "loss": 0.0671, "step": 870 }, { "epoch": 1.2950699043414275, "grad_norm": 0.46585166367095826, "learning_rate": 7.003106227349399e-06, "loss": 0.0673, "step": 880 }, { "epoch": 1.3097866077998528, "grad_norm": 0.49300557145854806, "learning_rate": 6.924298017131786e-06, "loss": 0.0664, "step": 890 }, { "epoch": 1.3245033112582782, "grad_norm": 0.480260675255211, "learning_rate": 6.844924562191003e-06, "loss": 0.065, "step": 900 }, { "epoch": 1.3245033112582782, "eval_loss": 0.08873660862445831, "eval_runtime": 206.5717, "eval_samples_per_second": 5.848, "eval_steps_per_second": 0.731, "step": 900 }, { "epoch": 1.3392200147167035, "grad_norm": 0.4824688537300334, "learning_rate": 6.765009177744425e-06, "loss": 0.0704, "step": 910 }, { "epoch": 1.3539367181751287, "grad_norm": 0.4415786568127757, "learning_rate": 6.6845753381961995e-06, "loss": 0.0654, "step": 920 }, { "epoch": 1.368653421633554, "grad_norm": 0.5631526023299833, "learning_rate": 6.603646670241863e-06, "loss": 0.0663, "step": 930 }, { "epoch": 1.3833701250919794, "grad_norm": 0.46084364060561317, "learning_rate": 6.522246945928214e-06, "loss": 0.0692, "step": 940 }, { "epoch": 1.3980868285504047, "grad_norm": 0.5348577097898968, "learning_rate": 6.440400075670491e-06, "loss": 0.069, "step": 950 }, { "epoch": 1.3980868285504047, "eval_loss": 0.08685711026191711, "eval_runtime": 177.7464, "eval_samples_per_second": 6.796, "eval_steps_per_second": 0.85, "step": 950 }, { "epoch": 1.4128035320088301, "grad_norm": 0.5250790642687054, "learning_rate": 6.358130101228914e-06, "loss": 0.0702, "step": 960 }, { "epoch": 1.4275202354672554, "grad_norm": 0.5047393202253249, "learning_rate": 6.275461188646641e-06, "loss": 0.0699, "step": 970 }, { "epoch": 1.4422369389256806, "grad_norm": 0.48776704190164294, "learning_rate": 6.1924176211512145e-06, "loss": 0.0634, "step": 980 }, { "epoch": 1.4569536423841059, "grad_norm": 0.49529594396564186, "learning_rate": 6.109023792021586e-06, "loss": 0.0667, "step": 990 }, { "epoch": 1.4716703458425313, "grad_norm": 0.47438683295737333, "learning_rate": 6.025304197422819e-06, "loss": 0.0693, "step": 1000 }, { "epoch": 1.4716703458425313, "eval_loss": 0.08619654178619385, "eval_runtime": 181.9786, "eval_samples_per_second": 6.638, "eval_steps_per_second": 0.83, "step": 1000 }, { "epoch": 1.4863870493009566, "grad_norm": 0.47483465689550636, "learning_rate": 5.941283429210568e-06, "loss": 0.0659, "step": 1010 }, { "epoch": 1.501103752759382, "grad_norm": 0.4446944338196383, "learning_rate": 5.856986167707448e-06, "loss": 0.0638, "step": 1020 }, { "epoch": 1.5158204562178073, "grad_norm": 0.47714369154377795, "learning_rate": 5.772437174453418e-06, "loss": 0.0646, "step": 1030 }, { "epoch": 1.5305371596762325, "grad_norm": 0.4489337679674589, "learning_rate": 5.687661284932306e-06, "loss": 0.0644, "step": 1040 }, { "epoch": 1.5452538631346577, "grad_norm": 0.5494239982767725, "learning_rate": 5.6026834012766155e-06, "loss": 0.0648, "step": 1050 }, { "epoch": 1.5452538631346577, "eval_loss": 0.08584881573915482, "eval_runtime": 196.0939, "eval_samples_per_second": 6.16, "eval_steps_per_second": 0.77, "step": 1050 }, { "epoch": 1.5599705665930832, "grad_norm": 0.4324026964232888, "learning_rate": 5.5175284849527635e-06, "loss": 0.0662, "step": 1060 }, { "epoch": 1.5746872700515084, "grad_norm": 0.43771048938211576, "learning_rate": 5.432221549428867e-06, "loss": 0.0646, "step": 1070 }, { "epoch": 1.589403973509934, "grad_norm": 0.40653033653295745, "learning_rate": 5.346787652827279e-06, "loss": 0.0673, "step": 1080 }, { "epoch": 1.6041206769683591, "grad_norm": 0.4218995885501481, "learning_rate": 5.26125189056399e-06, "loss": 0.0652, "step": 1090 }, { "epoch": 1.6188373804267844, "grad_norm": 0.42589283927464555, "learning_rate": 5.175639387977091e-06, "loss": 0.067, "step": 1100 }, { "epoch": 1.6188373804267844, "eval_loss": 0.08547249436378479, "eval_runtime": 188.3934, "eval_samples_per_second": 6.412, "eval_steps_per_second": 0.802, "step": 1100 }, { "epoch": 1.6335540838852096, "grad_norm": 0.4607522386339002, "learning_rate": 5.089975292946427e-06, "loss": 0.0677, "step": 1110 }, { "epoch": 1.648270787343635, "grad_norm": 0.41120752213023654, "learning_rate": 5.00428476850665e-06, "loss": 0.0633, "step": 1120 }, { "epoch": 1.6629874908020603, "grad_norm": 0.5477912053365783, "learning_rate": 4.918592985455799e-06, "loss": 0.0648, "step": 1130 }, { "epoch": 1.6777041942604858, "grad_norm": 0.47503483012059583, "learning_rate": 4.832925114961629e-06, "loss": 0.0618, "step": 1140 }, { "epoch": 1.692420897718911, "grad_norm": 0.45774600350002437, "learning_rate": 4.747306321167791e-06, "loss": 0.0617, "step": 1150 }, { "epoch": 1.692420897718911, "eval_loss": 0.08534925431013107, "eval_runtime": 204.7242, "eval_samples_per_second": 5.901, "eval_steps_per_second": 0.738, "step": 1150 }, { "epoch": 1.7071376011773363, "grad_norm": 0.45847738919073283, "learning_rate": 4.66176175380212e-06, "loss": 0.0658, "step": 1160 }, { "epoch": 1.7218543046357615, "grad_norm": 0.44501034067234635, "learning_rate": 4.576316540789122e-06, "loss": 0.0649, "step": 1170 }, { "epoch": 1.7365710080941867, "grad_norm": 0.4832020901371425, "learning_rate": 4.4909957808688765e-06, "loss": 0.0663, "step": 1180 }, { "epoch": 1.7512877115526122, "grad_norm": 0.5231088503027554, "learning_rate": 4.4058245362245276e-06, "loss": 0.0617, "step": 1190 }, { "epoch": 1.7660044150110377, "grad_norm": 0.5011172484501668, "learning_rate": 4.320827825120485e-06, "loss": 0.0639, "step": 1200 }, { "epoch": 1.7660044150110377, "eval_loss": 0.08313070237636566, "eval_runtime": 199.3984, "eval_samples_per_second": 6.058, "eval_steps_per_second": 0.757, "step": 1200 }, { "epoch": 1.780721118469463, "grad_norm": 0.5345442409242496, "learning_rate": 4.236030614553552e-06, "loss": 0.0606, "step": 1210 }, { "epoch": 1.7954378219278881, "grad_norm": 0.512563715796756, "learning_rate": 4.151457812919094e-06, "loss": 0.0603, "step": 1220 }, { "epoch": 1.8101545253863134, "grad_norm": 0.44851621254213614, "learning_rate": 4.067134262694431e-06, "loss": 0.0645, "step": 1230 }, { "epoch": 1.8248712288447386, "grad_norm": 0.5248672860684085, "learning_rate": 3.983084733141588e-06, "loss": 0.0623, "step": 1240 }, { "epoch": 1.839587932303164, "grad_norm": 0.5498054945628633, "learning_rate": 3.899333913031561e-06, "loss": 0.0668, "step": 1250 }, { "epoch": 1.839587932303164, "eval_loss": 0.0824863463640213, "eval_runtime": 186.253, "eval_samples_per_second": 6.486, "eval_steps_per_second": 0.811, "step": 1250 }, { "epoch": 1.8543046357615895, "grad_norm": 0.42853375775393104, "learning_rate": 3.815906403392203e-06, "loss": 0.0593, "step": 1260 }, { "epoch": 1.8690213392200148, "grad_norm": 0.45809760814838824, "learning_rate": 3.732826710281923e-06, "loss": 0.0635, "step": 1270 }, { "epoch": 1.88373804267844, "grad_norm": 0.41621812440438655, "learning_rate": 3.650119237591232e-06, "loss": 0.0585, "step": 1280 }, { "epoch": 1.8984547461368653, "grad_norm": 0.47534317303862195, "learning_rate": 3.5678082798743498e-06, "loss": 0.0595, "step": 1290 }, { "epoch": 1.9131714495952905, "grad_norm": 0.41752392992965454, "learning_rate": 3.485918015212891e-06, "loss": 0.0643, "step": 1300 }, { "epoch": 1.9131714495952905, "eval_loss": 0.08134686201810837, "eval_runtime": 180.7959, "eval_samples_per_second": 6.682, "eval_steps_per_second": 0.835, "step": 1300 }, { "epoch": 1.927888153053716, "grad_norm": 0.4388803040345972, "learning_rate": 3.4044724981137787e-06, "loss": 0.0609, "step": 1310 }, { "epoch": 1.9426048565121414, "grad_norm": 0.4342058670787917, "learning_rate": 3.3234956524434615e-06, "loss": 0.062, "step": 1320 }, { "epoch": 1.9573215599705667, "grad_norm": 0.40894625830036435, "learning_rate": 3.243011264400494e-06, "loss": 0.0606, "step": 1330 }, { "epoch": 1.972038263428992, "grad_norm": 0.4587254776423067, "learning_rate": 3.1630429755285623e-06, "loss": 0.0639, "step": 1340 }, { "epoch": 1.9867549668874172, "grad_norm": 0.5863720947155439, "learning_rate": 3.0836142757720034e-06, "loss": 0.0601, "step": 1350 }, { "epoch": 1.9867549668874172, "eval_loss": 0.08116251230239868, "eval_runtime": 214.6117, "eval_samples_per_second": 5.629, "eval_steps_per_second": 0.704, "step": 1350 }, { "epoch": 2.0014716703458424, "grad_norm": 0.3180683055717724, "learning_rate": 3.004748496575842e-06, "loss": 0.0571, "step": 1360 }, { "epoch": 2.0161883738042676, "grad_norm": 0.37920317857819413, "learning_rate": 2.9264688040324098e-06, "loss": 0.0418, "step": 1370 }, { "epoch": 2.0309050772626933, "grad_norm": 0.43496386857367136, "learning_rate": 2.8487981920765044e-06, "loss": 0.0412, "step": 1380 }, { "epoch": 2.0456217807211186, "grad_norm": 0.454994148288807, "learning_rate": 2.7717594757311435e-06, "loss": 0.0386, "step": 1390 }, { "epoch": 2.060338484179544, "grad_norm": 0.4879169888697804, "learning_rate": 2.69537528440586e-06, "loss": 0.0391, "step": 1400 }, { "epoch": 2.060338484179544, "eval_loss": 0.08909143507480621, "eval_runtime": 197.7898, "eval_samples_per_second": 6.107, "eval_steps_per_second": 0.763, "step": 1400 }, { "epoch": 2.075055187637969, "grad_norm": 0.44447001392962837, "learning_rate": 2.619668055249527e-06, "loss": 0.0381, "step": 1410 }, { "epoch": 2.0897718910963943, "grad_norm": 0.40740917793748654, "learning_rate": 2.544660026559639e-06, "loss": 0.0367, "step": 1420 }, { "epoch": 2.1044885945548195, "grad_norm": 0.399633409974892, "learning_rate": 2.4703732312500438e-06, "loss": 0.0382, "step": 1430 }, { "epoch": 2.119205298013245, "grad_norm": 0.43107632751069047, "learning_rate": 2.3968294903789474e-06, "loss": 0.0398, "step": 1440 }, { "epoch": 2.1339220014716704, "grad_norm": 0.43610535435590353, "learning_rate": 2.324050406739205e-06, "loss": 0.0411, "step": 1450 }, { "epoch": 2.1339220014716704, "eval_loss": 0.08864710479974747, "eval_runtime": 185.9081, "eval_samples_per_second": 6.498, "eval_steps_per_second": 0.812, "step": 1450 }, { "epoch": 2.1486387049300957, "grad_norm": 0.3969874821725999, "learning_rate": 2.2520573585126863e-06, "loss": 0.0407, "step": 1460 }, { "epoch": 2.163355408388521, "grad_norm": 0.44469487818286946, "learning_rate": 2.1808714929906394e-06, "loss": 0.037, "step": 1470 }, { "epoch": 2.178072111846946, "grad_norm": 0.4933403170140201, "learning_rate": 2.110513720361869e-06, "loss": 0.0385, "step": 1480 }, { "epoch": 2.1927888153053714, "grad_norm": 0.40970411491367764, "learning_rate": 2.041004707570555e-06, "loss": 0.0362, "step": 1490 }, { "epoch": 2.207505518763797, "grad_norm": 0.47294108634743565, "learning_rate": 1.972364872245539e-06, "loss": 0.0376, "step": 1500 }, { "epoch": 2.207505518763797, "eval_loss": 0.09001829475164413, "eval_runtime": 203.7053, "eval_samples_per_second": 5.93, "eval_steps_per_second": 0.741, "step": 1500 }, { "epoch": 2.2222222222222223, "grad_norm": 0.4013296778951186, "learning_rate": 1.9046143767028309e-06, "loss": 0.0359, "step": 1510 }, { "epoch": 2.2369389256806476, "grad_norm": 0.41616621605630383, "learning_rate": 1.8377731220231144e-06, "loss": 0.0373, "step": 1520 }, { "epoch": 2.251655629139073, "grad_norm": 0.4858320948580327, "learning_rate": 1.771860742205988e-06, "loss": 0.0355, "step": 1530 }, { "epoch": 2.266372332597498, "grad_norm": 0.4284960397863766, "learning_rate": 1.706896598402663e-06, "loss": 0.0379, "step": 1540 }, { "epoch": 2.2810890360559233, "grad_norm": 0.41264671002453457, "learning_rate": 1.642899773228801e-06, "loss": 0.0372, "step": 1550 }, { "epoch": 2.2810890360559233, "eval_loss": 0.08930070698261261, "eval_runtime": 216.4439, "eval_samples_per_second": 5.581, "eval_steps_per_second": 0.698, "step": 1550 }, { "epoch": 2.295805739514349, "grad_norm": 0.4348744731420184, "learning_rate": 1.5798890651591759e-06, "loss": 0.0375, "step": 1560 }, { "epoch": 2.310522442972774, "grad_norm": 0.4350319794815005, "learning_rate": 1.5178829830057883e-06, "loss": 0.0353, "step": 1570 }, { "epoch": 2.3252391464311994, "grad_norm": 0.397696827791832, "learning_rate": 1.4568997404810858e-06, "loss": 0.0369, "step": 1580 }, { "epoch": 2.3399558498896247, "grad_norm": 0.44249359787198733, "learning_rate": 1.3969572508478424e-06, "loss": 0.0365, "step": 1590 }, { "epoch": 2.35467255334805, "grad_norm": 0.3999504032855848, "learning_rate": 1.33807312165731e-06, "loss": 0.0391, "step": 1600 }, { "epoch": 2.35467255334805, "eval_loss": 0.08941526710987091, "eval_runtime": 201.599, "eval_samples_per_second": 5.992, "eval_steps_per_second": 0.749, "step": 1600 }, { "epoch": 2.369389256806475, "grad_norm": 0.47235025180203943, "learning_rate": 1.2802646495771592e-06, "loss": 0.0374, "step": 1610 }, { "epoch": 2.384105960264901, "grad_norm": 0.4505178794969632, "learning_rate": 1.2235488153107488e-06, "loss": 0.0386, "step": 1620 }, { "epoch": 2.398822663723326, "grad_norm": 0.4515169168194488, "learning_rate": 1.1679422786091909e-06, "loss": 0.0355, "step": 1630 }, { "epoch": 2.4135393671817513, "grad_norm": 0.4486232416834487, "learning_rate": 1.1134613733777195e-06, "loss": 0.0353, "step": 1640 }, { "epoch": 2.4282560706401766, "grad_norm": 0.45969446958453936, "learning_rate": 1.060122102877739e-06, "loss": 0.0369, "step": 1650 }, { "epoch": 2.4282560706401766, "eval_loss": 0.08896949887275696, "eval_runtime": 190.8926, "eval_samples_per_second": 6.328, "eval_steps_per_second": 0.791, "step": 1650 }, { "epoch": 2.442972774098602, "grad_norm": 0.4795593227430335, "learning_rate": 1.0079401350260288e-06, "loss": 0.0365, "step": 1660 }, { "epoch": 2.457689477557027, "grad_norm": 0.4364131921904563, "learning_rate": 9.569307977924304e-07, "loss": 0.0374, "step": 1670 }, { "epoch": 2.4724061810154527, "grad_norm": 0.39082384348290283, "learning_rate": 9.071090746973999e-07, "loss": 0.0367, "step": 1680 }, { "epoch": 2.487122884473878, "grad_norm": 0.4316116220500935, "learning_rate": 8.584896004107379e-07, "loss": 0.0357, "step": 1690 }, { "epoch": 2.501839587932303, "grad_norm": 0.4639023437586311, "learning_rate": 8.110866564527925e-07, "loss": 0.0362, "step": 1700 }, { "epoch": 2.501839587932303, "eval_loss": 0.08904436975717545, "eval_runtime": 192.3246, "eval_samples_per_second": 6.281, "eval_steps_per_second": 0.785, "step": 1700 }, { "epoch": 2.5165562913907285, "grad_norm": 0.44077589190339306, "learning_rate": 7.649141669993881e-07, "loss": 0.0342, "step": 1710 }, { "epoch": 2.5312729948491537, "grad_norm": 0.4866710092763864, "learning_rate": 7.199856947917372e-07, "loss": 0.0355, "step": 1720 }, { "epoch": 2.5459896983075794, "grad_norm": 0.5558412036138655, "learning_rate": 6.763144371525048e-07, "loss": 0.0362, "step": 1730 }, { "epoch": 2.560706401766004, "grad_norm": 0.5242729609693463, "learning_rate": 6.339132221092181e-07, "loss": 0.0346, "step": 1740 }, { "epoch": 2.57542310522443, "grad_norm": 0.43612087623478013, "learning_rate": 5.927945046261541e-07, "loss": 0.0351, "step": 1750 }, { "epoch": 2.57542310522443, "eval_loss": 0.08865496516227722, "eval_runtime": 189.4933, "eval_samples_per_second": 6.375, "eval_steps_per_second": 0.797, "step": 1750 }, { "epoch": 2.590139808682855, "grad_norm": 0.4402357233053372, "learning_rate": 5.529703629458027e-07, "loss": 0.0351, "step": 1760 }, { "epoch": 2.6048565121412803, "grad_norm": 0.4547936707636127, "learning_rate": 5.144524950410074e-07, "loss": 0.0353, "step": 1770 }, { "epoch": 2.6195732155997056, "grad_norm": 0.46968163264663654, "learning_rate": 4.772522151787822e-07, "loss": 0.0335, "step": 1780 }, { "epoch": 2.634289919058131, "grad_norm": 0.5323493186585175, "learning_rate": 4.413804505968533e-07, "loss": 0.0381, "step": 1790 }, { "epoch": 2.6490066225165565, "grad_norm": 0.44646022512750955, "learning_rate": 4.0684773829388737e-07, "loss": 0.0365, "step": 1800 }, { "epoch": 2.6490066225165565, "eval_loss": 0.08848826587200165, "eval_runtime": 144.7247, "eval_samples_per_second": 8.347, "eval_steps_per_second": 1.043, "step": 1800 }, { "epoch": 2.6637233259749817, "grad_norm": 0.48313861824298177, "learning_rate": 3.736642219343456e-07, "loss": 0.0341, "step": 1810 }, { "epoch": 2.678440029433407, "grad_norm": 0.46254464308741905, "learning_rate": 3.4183964886887135e-07, "loss": 0.035, "step": 1820 }, { "epoch": 2.693156732891832, "grad_norm": 0.44252007786800557, "learning_rate": 3.1138336727110307e-07, "loss": 0.0349, "step": 1830 }, { "epoch": 2.7078734363502575, "grad_norm": 0.4843414570638625, "learning_rate": 2.823043233917272e-07, "loss": 0.0315, "step": 1840 }, { "epoch": 2.7225901398086827, "grad_norm": 0.4233437476360991, "learning_rate": 2.5461105893060667e-07, "loss": 0.0336, "step": 1850 }, { "epoch": 2.7225901398086827, "eval_loss": 0.0889279693365097, "eval_runtime": 148.3, "eval_samples_per_second": 8.146, "eval_steps_per_second": 1.018, "step": 1850 }, { "epoch": 2.737306843267108, "grad_norm": 0.43298766819944895, "learning_rate": 2.2831170852773198e-07, "loss": 0.0327, "step": 1860 }, { "epoch": 2.7520235467255336, "grad_norm": 0.5107092795769058, "learning_rate": 2.03413997373747e-07, "loss": 0.035, "step": 1870 }, { "epoch": 2.766740250183959, "grad_norm": 0.42425673512298995, "learning_rate": 1.7992523894074688e-07, "loss": 0.0356, "step": 1880 }, { "epoch": 2.781456953642384, "grad_norm": 0.4354877107084126, "learning_rate": 1.578523328340087e-07, "loss": 0.0351, "step": 1890 }, { "epoch": 2.7961736571008093, "grad_norm": 0.409569927662352, "learning_rate": 1.372017627653044e-07, "loss": 0.0328, "step": 1900 }, { "epoch": 2.7961736571008093, "eval_loss": 0.08891716599464417, "eval_runtime": 150.1495, "eval_samples_per_second": 8.045, "eval_steps_per_second": 1.006, "step": 1900 }, { "epoch": 2.8108903605592346, "grad_norm": 0.505372980725658, "learning_rate": 1.179795946483625e-07, "loss": 0.0359, "step": 1910 }, { "epoch": 2.8256070640176603, "grad_norm": 0.4789426274321432, "learning_rate": 1.0019147481706626e-07, "loss": 0.034, "step": 1920 }, { "epoch": 2.8403237674760855, "grad_norm": 0.4326698452169212, "learning_rate": 8.384262836689472e-08, "loss": 0.0359, "step": 1930 }, { "epoch": 2.8550404709345107, "grad_norm": 0.4504134801165135, "learning_rate": 6.893785762009942e-08, "loss": 0.033, "step": 1940 }, { "epoch": 2.869757174392936, "grad_norm": 0.4418357481535817, "learning_rate": 5.5481540715066616e-08, "loss": 0.031, "step": 1950 }, { "epoch": 2.869757174392936, "eval_loss": 0.08881029486656189, "eval_runtime": 160.2824, "eval_samples_per_second": 7.537, "eval_steps_per_second": 0.942, "step": 1950 }, { "epoch": 2.8844738778513612, "grad_norm": 0.39177398397892965, "learning_rate": 4.3477630320279405e-08, "loss": 0.0341, "step": 1960 }, { "epoch": 2.8991905813097865, "grad_norm": 0.4264281839143634, "learning_rate": 3.292965247325641e-08, "loss": 0.0327, "step": 1970 }, { "epoch": 2.9139072847682117, "grad_norm": 0.4458194572989954, "learning_rate": 2.3840705544815324e-08, "loss": 0.037, "step": 1980 }, { "epoch": 2.9286239882266374, "grad_norm": 0.4161959002875069, "learning_rate": 1.6213459328950355e-08, "loss": 0.0336, "step": 1990 }, { "epoch": 2.9433406916850626, "grad_norm": 0.4581647348930819, "learning_rate": 1.0050154258607336e-08, "loss": 0.0361, "step": 2000 }, { "epoch": 2.9433406916850626, "eval_loss": 0.08885689079761505, "eval_runtime": 115.9736, "eval_samples_per_second": 10.416, "eval_steps_per_second": 1.302, "step": 2000 }, { "epoch": 2.958057395143488, "grad_norm": 0.4098910931260614, "learning_rate": 5.352600747577929e-09, "loss": 0.0323, "step": 2010 }, { "epoch": 2.972774098601913, "grad_norm": 0.4203250122459563, "learning_rate": 2.12217865870612e-09, "loss": 0.0337, "step": 2020 }, { "epoch": 2.9874908020603383, "grad_norm": 0.4851341052865305, "learning_rate": 3.5983689856522453e-10, "loss": 0.0343, "step": 2030 } ], "logging_steps": 10, "max_steps": 2037, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 31897094414336.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }