{ "best_metric": 0.007015639916062355, "best_model_checkpoint": "/home/paperspace/Data/models/dbischof_premise_aea/llm3br256/checkpoint-5450", "epoch": 4.997319034852547, "eval_steps": 5, "global_step": 6990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007149240393208221, "grad_norm": 0.3275327980518341, "learning_rate": 1.430615164520744e-07, "loss": 0.0829, "step": 1 }, { "epoch": 0.0014298480786416443, "grad_norm": 0.3180127441883087, "learning_rate": 2.861230329041488e-07, "loss": 0.0958, "step": 2 }, { "epoch": 0.0021447721179624667, "grad_norm": 0.32196545600891113, "learning_rate": 4.291845493562232e-07, "loss": 0.0898, "step": 3 }, { "epoch": 0.0028596961572832885, "grad_norm": 0.3337332308292389, "learning_rate": 5.722460658082976e-07, "loss": 0.0962, "step": 4 }, { "epoch": 0.0035746201966041107, "grad_norm": 0.3307628035545349, "learning_rate": 7.15307582260372e-07, "loss": 0.087, "step": 5 }, { "epoch": 0.0035746201966041107, "eval_loss": 0.09055491536855698, "eval_runtime": 11.5183, "eval_samples_per_second": 4.341, "eval_steps_per_second": 1.129, "step": 5 }, { "epoch": 0.004289544235924933, "grad_norm": 0.3446979522705078, "learning_rate": 8.583690987124464e-07, "loss": 0.0957, "step": 6 }, { "epoch": 0.005004468275245755, "grad_norm": 0.3131182789802551, "learning_rate": 1.0014306151645207e-06, "loss": 0.0875, "step": 7 }, { "epoch": 0.005719392314566577, "grad_norm": 0.326107919216156, "learning_rate": 1.1444921316165953e-06, "loss": 0.1071, "step": 8 }, { "epoch": 0.006434316353887399, "grad_norm": 0.3342755138874054, "learning_rate": 1.2875536480686696e-06, "loss": 0.1025, "step": 9 }, { "epoch": 0.0071492403932082215, "grad_norm": 0.2851223051548004, "learning_rate": 1.430615164520744e-06, "loss": 0.0863, "step": 10 }, { "epoch": 0.0071492403932082215, "eval_loss": 0.08814650028944016, "eval_runtime": 9.7027, "eval_samples_per_second": 5.153, "eval_steps_per_second": 1.34, "step": 10 }, { "epoch": 0.007864164432529044, "grad_norm": 0.30768558382987976, "learning_rate": 1.5736766809728184e-06, "loss": 0.0925, "step": 11 }, { "epoch": 0.008579088471849867, "grad_norm": 0.2860584855079651, "learning_rate": 1.7167381974248929e-06, "loss": 0.0893, "step": 12 }, { "epoch": 0.009294012511170688, "grad_norm": 0.29086631536483765, "learning_rate": 1.8597997138769672e-06, "loss": 0.084, "step": 13 }, { "epoch": 0.01000893655049151, "grad_norm": 0.2922893762588501, "learning_rate": 2.0028612303290415e-06, "loss": 0.0942, "step": 14 }, { "epoch": 0.010723860589812333, "grad_norm": 0.28947821259498596, "learning_rate": 2.1459227467811158e-06, "loss": 0.0931, "step": 15 }, { "epoch": 0.010723860589812333, "eval_loss": 0.08019693195819855, "eval_runtime": 9.7268, "eval_samples_per_second": 5.14, "eval_steps_per_second": 1.337, "step": 15 }, { "epoch": 0.011438784629133154, "grad_norm": 0.2612931430339813, "learning_rate": 2.2889842632331905e-06, "loss": 0.0788, "step": 16 }, { "epoch": 0.012153708668453977, "grad_norm": 0.27568405866622925, "learning_rate": 2.432045779685265e-06, "loss": 0.0844, "step": 17 }, { "epoch": 0.012868632707774798, "grad_norm": 0.245876282453537, "learning_rate": 2.575107296137339e-06, "loss": 0.08, "step": 18 }, { "epoch": 0.013583556747095622, "grad_norm": 0.23232878744602203, "learning_rate": 2.7181688125894134e-06, "loss": 0.0862, "step": 19 }, { "epoch": 0.014298480786416443, "grad_norm": 0.25545862317085266, "learning_rate": 2.861230329041488e-06, "loss": 0.082, "step": 20 
}, { "epoch": 0.014298480786416443, "eval_loss": 0.06646817922592163, "eval_runtime": 9.6948, "eval_samples_per_second": 5.157, "eval_steps_per_second": 1.341, "step": 20 }, { "epoch": 0.015013404825737266, "grad_norm": 0.19042891263961792, "learning_rate": 3.0042918454935624e-06, "loss": 0.0646, "step": 21 }, { "epoch": 0.015728328865058087, "grad_norm": 0.20493075251579285, "learning_rate": 3.1473533619456367e-06, "loss": 0.0648, "step": 22 }, { "epoch": 0.01644325290437891, "grad_norm": 0.1941135823726654, "learning_rate": 3.2904148783977115e-06, "loss": 0.0634, "step": 23 }, { "epoch": 0.017158176943699734, "grad_norm": 0.17212936282157898, "learning_rate": 3.4334763948497858e-06, "loss": 0.0607, "step": 24 }, { "epoch": 0.017873100983020553, "grad_norm": 0.1575615257024765, "learning_rate": 3.5765379113018605e-06, "loss": 0.0534, "step": 25 }, { "epoch": 0.017873100983020553, "eval_loss": 0.05097019672393799, "eval_runtime": 9.6667, "eval_samples_per_second": 5.172, "eval_steps_per_second": 1.345, "step": 25 }, { "epoch": 0.018588025022341376, "grad_norm": 0.154325470328331, "learning_rate": 3.7195994277539344e-06, "loss": 0.0484, "step": 26 }, { "epoch": 0.0193029490616622, "grad_norm": 0.15124262869358063, "learning_rate": 3.8626609442060095e-06, "loss": 0.0555, "step": 27 }, { "epoch": 0.02001787310098302, "grad_norm": 0.12750396132469177, "learning_rate": 4.005722460658083e-06, "loss": 0.0474, "step": 28 }, { "epoch": 0.020732797140303842, "grad_norm": 0.12056029587984085, "learning_rate": 4.148783977110158e-06, "loss": 0.0494, "step": 29 }, { "epoch": 0.021447721179624665, "grad_norm": 0.10755834728479385, "learning_rate": 4.2918454935622316e-06, "loss": 0.041, "step": 30 }, { "epoch": 0.021447721179624665, "eval_loss": 0.0401221364736557, "eval_runtime": 9.7183, "eval_samples_per_second": 5.145, "eval_steps_per_second": 1.338, "step": 30 }, { "epoch": 0.02216264521894549, "grad_norm": 0.10011135786771774, "learning_rate": 4.434907010014307e-06, "loss": 0.0382, "step": 31 }, { "epoch": 0.022877569258266308, "grad_norm": 0.09062106162309647, "learning_rate": 4.577968526466381e-06, "loss": 0.0383, "step": 32 }, { "epoch": 0.02359249329758713, "grad_norm": 0.0870615765452385, "learning_rate": 4.721030042918455e-06, "loss": 0.052, "step": 33 }, { "epoch": 0.024307417336907954, "grad_norm": 0.09297549724578857, "learning_rate": 4.86409155937053e-06, "loss": 0.0426, "step": 34 }, { "epoch": 0.025022341376228777, "grad_norm": 0.09612537920475006, "learning_rate": 5.007153075822604e-06, "loss": 0.045, "step": 35 }, { "epoch": 0.025022341376228777, "eval_loss": 0.03599178418517113, "eval_runtime": 9.6828, "eval_samples_per_second": 5.164, "eval_steps_per_second": 1.343, "step": 35 }, { "epoch": 0.025737265415549597, "grad_norm": 0.0899239182472229, "learning_rate": 5.150214592274678e-06, "loss": 0.0332, "step": 36 }, { "epoch": 0.02645218945487042, "grad_norm": 0.1013641282916069, "learning_rate": 5.293276108726753e-06, "loss": 0.0429, "step": 37 }, { "epoch": 0.027167113494191243, "grad_norm": 0.09340202808380127, "learning_rate": 5.436337625178827e-06, "loss": 0.0452, "step": 38 }, { "epoch": 0.027882037533512063, "grad_norm": 0.08030243963003159, "learning_rate": 5.579399141630902e-06, "loss": 0.0414, "step": 39 }, { "epoch": 0.028596961572832886, "grad_norm": 0.08294231444597244, "learning_rate": 5.722460658082976e-06, "loss": 0.0347, "step": 40 }, { "epoch": 0.028596961572832886, "eval_loss": 0.03281966969370842, "eval_runtime": 9.7155, "eval_samples_per_second": 5.146, 
"eval_steps_per_second": 1.338, "step": 40 }, { "epoch": 0.02931188561215371, "grad_norm": 0.08208201080560684, "learning_rate": 5.8655221745350506e-06, "loss": 0.0358, "step": 41 }, { "epoch": 0.030026809651474532, "grad_norm": 0.06436379253864288, "learning_rate": 6.008583690987125e-06, "loss": 0.028, "step": 42 }, { "epoch": 0.03074173369079535, "grad_norm": 0.06476058065891266, "learning_rate": 6.151645207439199e-06, "loss": 0.0406, "step": 43 }, { "epoch": 0.031456657730116175, "grad_norm": 0.05703894421458244, "learning_rate": 6.2947067238912735e-06, "loss": 0.0401, "step": 44 }, { "epoch": 0.032171581769437, "grad_norm": 0.05328533053398132, "learning_rate": 6.437768240343348e-06, "loss": 0.0377, "step": 45 }, { "epoch": 0.032171581769437, "eval_loss": 0.030925849452614784, "eval_runtime": 9.7272, "eval_samples_per_second": 5.14, "eval_steps_per_second": 1.336, "step": 45 }, { "epoch": 0.03288650580875782, "grad_norm": 0.06106099113821983, "learning_rate": 6.580829756795423e-06, "loss": 0.0284, "step": 46 }, { "epoch": 0.033601429848078644, "grad_norm": 0.057272788137197495, "learning_rate": 6.723891273247497e-06, "loss": 0.0314, "step": 47 }, { "epoch": 0.03431635388739947, "grad_norm": 0.061637260019779205, "learning_rate": 6.8669527896995715e-06, "loss": 0.0363, "step": 48 }, { "epoch": 0.03503127792672028, "grad_norm": 0.06651395559310913, "learning_rate": 7.010014306151645e-06, "loss": 0.031, "step": 49 }, { "epoch": 0.035746201966041107, "grad_norm": 0.05674408748745918, "learning_rate": 7.153075822603721e-06, "loss": 0.0314, "step": 50 }, { "epoch": 0.035746201966041107, "eval_loss": 0.029729090631008148, "eval_runtime": 9.723, "eval_samples_per_second": 5.142, "eval_steps_per_second": 1.337, "step": 50 }, { "epoch": 0.03646112600536193, "grad_norm": 0.05944279953837395, "learning_rate": 7.296137339055794e-06, "loss": 0.0342, "step": 51 }, { "epoch": 0.03717605004468275, "grad_norm": 0.05442027375102043, "learning_rate": 7.439198855507869e-06, "loss": 0.0294, "step": 52 }, { "epoch": 0.037890974084003576, "grad_norm": 0.04743171110749245, "learning_rate": 7.582260371959943e-06, "loss": 0.0257, "step": 53 }, { "epoch": 0.0386058981233244, "grad_norm": 0.07554090023040771, "learning_rate": 7.725321888412019e-06, "loss": 0.0305, "step": 54 }, { "epoch": 0.03932082216264522, "grad_norm": 0.05178272724151611, "learning_rate": 7.868383404864092e-06, "loss": 0.0287, "step": 55 }, { "epoch": 0.03932082216264522, "eval_loss": 0.028114639222621918, "eval_runtime": 9.8616, "eval_samples_per_second": 5.07, "eval_steps_per_second": 1.318, "step": 55 }, { "epoch": 0.04003574620196604, "grad_norm": 0.04709479957818985, "learning_rate": 8.011444921316166e-06, "loss": 0.025, "step": 56 }, { "epoch": 0.04075067024128686, "grad_norm": 0.043908603489398956, "learning_rate": 8.154506437768241e-06, "loss": 0.0426, "step": 57 }, { "epoch": 0.041465594280607684, "grad_norm": 0.04461320862174034, "learning_rate": 8.297567954220316e-06, "loss": 0.0323, "step": 58 }, { "epoch": 0.04218051831992851, "grad_norm": 0.05134046822786331, "learning_rate": 8.44062947067239e-06, "loss": 0.0419, "step": 59 }, { "epoch": 0.04289544235924933, "grad_norm": 0.04562467709183693, "learning_rate": 8.583690987124463e-06, "loss": 0.0288, "step": 60 }, { "epoch": 0.04289544235924933, "eval_loss": 0.02699941024184227, "eval_runtime": 9.4801, "eval_samples_per_second": 5.274, "eval_steps_per_second": 1.371, "step": 60 }, { "epoch": 0.043610366398570154, "grad_norm": 0.04455569386482239, "learning_rate": 
8.726752503576538e-06, "loss": 0.0345, "step": 61 }, { "epoch": 0.04432529043789098, "grad_norm": 0.04751044511795044, "learning_rate": 8.869814020028613e-06, "loss": 0.0341, "step": 62 }, { "epoch": 0.04504021447721179, "grad_norm": 0.04918158799409866, "learning_rate": 9.012875536480687e-06, "loss": 0.0308, "step": 63 }, { "epoch": 0.045755138516532616, "grad_norm": 0.043550483882427216, "learning_rate": 9.155937052932762e-06, "loss": 0.0264, "step": 64 }, { "epoch": 0.04647006255585344, "grad_norm": 0.04692668095231056, "learning_rate": 9.298998569384835e-06, "loss": 0.0341, "step": 65 }, { "epoch": 0.04647006255585344, "eval_loss": 0.026099223643541336, "eval_runtime": 9.9913, "eval_samples_per_second": 5.004, "eval_steps_per_second": 1.301, "step": 65 }, { "epoch": 0.04718498659517426, "grad_norm": 0.043388139456510544, "learning_rate": 9.44206008583691e-06, "loss": 0.0241, "step": 66 }, { "epoch": 0.047899910634495085, "grad_norm": 0.03678254783153534, "learning_rate": 9.585121602288986e-06, "loss": 0.0324, "step": 67 }, { "epoch": 0.04861483467381591, "grad_norm": 0.03629340976476669, "learning_rate": 9.72818311874106e-06, "loss": 0.0288, "step": 68 }, { "epoch": 0.04932975871313673, "grad_norm": 0.040581271052360535, "learning_rate": 9.871244635193133e-06, "loss": 0.03, "step": 69 }, { "epoch": 0.050044682752457555, "grad_norm": 0.04365529865026474, "learning_rate": 1.0014306151645208e-05, "loss": 0.0302, "step": 70 }, { "epoch": 0.050044682752457555, "eval_loss": 0.02531510591506958, "eval_runtime": 9.6392, "eval_samples_per_second": 5.187, "eval_steps_per_second": 1.349, "step": 70 }, { "epoch": 0.05075960679177837, "grad_norm": 0.05037260055541992, "learning_rate": 1.0157367668097283e-05, "loss": 0.0373, "step": 71 }, { "epoch": 0.051474530831099194, "grad_norm": 0.039430588483810425, "learning_rate": 1.0300429184549356e-05, "loss": 0.0291, "step": 72 }, { "epoch": 0.05218945487042002, "grad_norm": 0.04580322653055191, "learning_rate": 1.0443490701001432e-05, "loss": 0.0264, "step": 73 }, { "epoch": 0.05290437890974084, "grad_norm": 0.03900410607457161, "learning_rate": 1.0586552217453507e-05, "loss": 0.0239, "step": 74 }, { "epoch": 0.05361930294906166, "grad_norm": 0.042930781841278076, "learning_rate": 1.072961373390558e-05, "loss": 0.0328, "step": 75 }, { "epoch": 0.05361930294906166, "eval_loss": 0.02443748340010643, "eval_runtime": 9.7874, "eval_samples_per_second": 5.109, "eval_steps_per_second": 1.328, "step": 75 }, { "epoch": 0.054334226988382486, "grad_norm": 0.038007065653800964, "learning_rate": 1.0872675250357654e-05, "loss": 0.0258, "step": 76 }, { "epoch": 0.05504915102770331, "grad_norm": 0.04679590091109276, "learning_rate": 1.1015736766809729e-05, "loss": 0.0239, "step": 77 }, { "epoch": 0.055764075067024126, "grad_norm": 0.03922339528799057, "learning_rate": 1.1158798283261804e-05, "loss": 0.0242, "step": 78 }, { "epoch": 0.05647899910634495, "grad_norm": 0.042693473398685455, "learning_rate": 1.1301859799713877e-05, "loss": 0.0243, "step": 79 }, { "epoch": 0.05719392314566577, "grad_norm": 0.04190599545836449, "learning_rate": 1.1444921316165953e-05, "loss": 0.0341, "step": 80 }, { "epoch": 0.05719392314566577, "eval_loss": 0.02363392524421215, "eval_runtime": 9.6763, "eval_samples_per_second": 5.167, "eval_steps_per_second": 1.343, "step": 80 }, { "epoch": 0.057908847184986595, "grad_norm": 0.03888407349586487, "learning_rate": 1.1587982832618026e-05, "loss": 0.0281, "step": 81 }, { "epoch": 0.05862377122430742, "grad_norm": 0.039355259388685226, 
"learning_rate": 1.1731044349070101e-05, "loss": 0.0286, "step": 82 }, { "epoch": 0.05933869526362824, "grad_norm": 0.040316179394721985, "learning_rate": 1.1874105865522175e-05, "loss": 0.024, "step": 83 }, { "epoch": 0.060053619302949064, "grad_norm": 0.034487709403038025, "learning_rate": 1.201716738197425e-05, "loss": 0.0253, "step": 84 }, { "epoch": 0.06076854334226988, "grad_norm": 0.040082525461912155, "learning_rate": 1.2160228898426323e-05, "loss": 0.0322, "step": 85 }, { "epoch": 0.06076854334226988, "eval_loss": 0.02287757769227028, "eval_runtime": 9.7469, "eval_samples_per_second": 5.13, "eval_steps_per_second": 1.334, "step": 85 }, { "epoch": 0.0614834673815907, "grad_norm": 0.041836339980363846, "learning_rate": 1.2303290414878398e-05, "loss": 0.0301, "step": 86 }, { "epoch": 0.06219839142091153, "grad_norm": 0.038112565875053406, "learning_rate": 1.2446351931330473e-05, "loss": 0.0283, "step": 87 }, { "epoch": 0.06291331546023235, "grad_norm": 0.03906940296292305, "learning_rate": 1.2589413447782547e-05, "loss": 0.027, "step": 88 }, { "epoch": 0.06362823949955317, "grad_norm": 0.04331241548061371, "learning_rate": 1.2732474964234622e-05, "loss": 0.0308, "step": 89 }, { "epoch": 0.064343163538874, "grad_norm": 0.04183993861079216, "learning_rate": 1.2875536480686696e-05, "loss": 0.0251, "step": 90 }, { "epoch": 0.064343163538874, "eval_loss": 0.02227693423628807, "eval_runtime": 9.6729, "eval_samples_per_second": 5.169, "eval_steps_per_second": 1.344, "step": 90 }, { "epoch": 0.06505808757819481, "grad_norm": 0.03443620726466179, "learning_rate": 1.301859799713877e-05, "loss": 0.0303, "step": 91 }, { "epoch": 0.06577301161751564, "grad_norm": 0.034096866846084595, "learning_rate": 1.3161659513590846e-05, "loss": 0.0306, "step": 92 }, { "epoch": 0.06648793565683646, "grad_norm": 0.03327767550945282, "learning_rate": 1.330472103004292e-05, "loss": 0.0185, "step": 93 }, { "epoch": 0.06720285969615729, "grad_norm": 0.03854461759328842, "learning_rate": 1.3447782546494994e-05, "loss": 0.0283, "step": 94 }, { "epoch": 0.0679177837354781, "grad_norm": 0.03775477036833763, "learning_rate": 1.3590844062947066e-05, "loss": 0.0342, "step": 95 }, { "epoch": 0.0679177837354781, "eval_loss": 0.02169269695878029, "eval_runtime": 6.4386, "eval_samples_per_second": 7.766, "eval_steps_per_second": 2.019, "step": 95 }, { "epoch": 0.06863270777479893, "grad_norm": 0.0387720912694931, "learning_rate": 1.3733905579399143e-05, "loss": 0.0212, "step": 96 }, { "epoch": 0.06934763181411975, "grad_norm": 0.0374082513153553, "learning_rate": 1.3876967095851218e-05, "loss": 0.0238, "step": 97 }, { "epoch": 0.07006255585344057, "grad_norm": 0.03322593867778778, "learning_rate": 1.402002861230329e-05, "loss": 0.0263, "step": 98 }, { "epoch": 0.0707774798927614, "grad_norm": 0.03851554915308952, "learning_rate": 1.4163090128755365e-05, "loss": 0.0266, "step": 99 }, { "epoch": 0.07149240393208221, "grad_norm": 0.038667093962430954, "learning_rate": 1.4306151645207442e-05, "loss": 0.0234, "step": 100 }, { "epoch": 0.07149240393208221, "eval_loss": 0.021222971379756927, "eval_runtime": 9.5344, "eval_samples_per_second": 5.244, "eval_steps_per_second": 1.363, "step": 100 }, { "epoch": 0.07220732797140304, "grad_norm": 0.03627876937389374, "learning_rate": 1.4449213161659514e-05, "loss": 0.0197, "step": 101 }, { "epoch": 0.07292225201072386, "grad_norm": 0.03812776878476143, "learning_rate": 1.4592274678111589e-05, "loss": 0.0254, "step": 102 }, { "epoch": 0.07363717605004469, "grad_norm": 0.041352637112140656, 
"learning_rate": 1.4735336194563662e-05, "loss": 0.0293, "step": 103 }, { "epoch": 0.0743521000893655, "grad_norm": 0.03348153084516525, "learning_rate": 1.4878397711015737e-05, "loss": 0.0243, "step": 104 }, { "epoch": 0.07506702412868632, "grad_norm": 0.048818014562129974, "learning_rate": 1.5021459227467813e-05, "loss": 0.0242, "step": 105 }, { "epoch": 0.07506702412868632, "eval_loss": 0.020723415538668633, "eval_runtime": 9.708, "eval_samples_per_second": 5.15, "eval_steps_per_second": 1.339, "step": 105 }, { "epoch": 0.07578194816800715, "grad_norm": 0.03160472214221954, "learning_rate": 1.5164520743919886e-05, "loss": 0.0183, "step": 106 }, { "epoch": 0.07649687220732797, "grad_norm": 0.038375623524188995, "learning_rate": 1.530758226037196e-05, "loss": 0.0243, "step": 107 }, { "epoch": 0.0772117962466488, "grad_norm": 0.05067029967904091, "learning_rate": 1.5450643776824038e-05, "loss": 0.0268, "step": 108 }, { "epoch": 0.07792672028596961, "grad_norm": 0.04879362881183624, "learning_rate": 1.5593705293276108e-05, "loss": 0.0261, "step": 109 }, { "epoch": 0.07864164432529044, "grad_norm": 0.03321097791194916, "learning_rate": 1.5736766809728185e-05, "loss": 0.0195, "step": 110 }, { "epoch": 0.07864164432529044, "eval_loss": 0.020412543788552284, "eval_runtime": 9.6278, "eval_samples_per_second": 5.193, "eval_steps_per_second": 1.35, "step": 110 }, { "epoch": 0.07935656836461126, "grad_norm": 0.03311232477426529, "learning_rate": 1.587982832618026e-05, "loss": 0.0202, "step": 111 }, { "epoch": 0.08007149240393208, "grad_norm": 0.03284924849867821, "learning_rate": 1.6022889842632332e-05, "loss": 0.0271, "step": 112 }, { "epoch": 0.0807864164432529, "grad_norm": 0.04706140235066414, "learning_rate": 1.616595135908441e-05, "loss": 0.0263, "step": 113 }, { "epoch": 0.08150134048257372, "grad_norm": 0.04280773550271988, "learning_rate": 1.6309012875536482e-05, "loss": 0.0365, "step": 114 }, { "epoch": 0.08221626452189455, "grad_norm": 0.04438168928027153, "learning_rate": 1.6452074391988556e-05, "loss": 0.0287, "step": 115 }, { "epoch": 0.08221626452189455, "eval_loss": 0.020317204296588898, "eval_runtime": 9.6065, "eval_samples_per_second": 5.205, "eval_steps_per_second": 1.353, "step": 115 }, { "epoch": 0.08293118856121537, "grad_norm": 0.03584986925125122, "learning_rate": 1.6595135908440632e-05, "loss": 0.0259, "step": 116 }, { "epoch": 0.0836461126005362, "grad_norm": 0.03458195924758911, "learning_rate": 1.6738197424892706e-05, "loss": 0.0182, "step": 117 }, { "epoch": 0.08436103663985701, "grad_norm": 0.041829150170087814, "learning_rate": 1.688125894134478e-05, "loss": 0.0238, "step": 118 }, { "epoch": 0.08507596067917783, "grad_norm": 0.04319094493985176, "learning_rate": 1.7024320457796853e-05, "loss": 0.0249, "step": 119 }, { "epoch": 0.08579088471849866, "grad_norm": 0.04748925939202309, "learning_rate": 1.7167381974248926e-05, "loss": 0.0259, "step": 120 }, { "epoch": 0.08579088471849866, "eval_loss": 0.019910821691155434, "eval_runtime": 9.7126, "eval_samples_per_second": 5.148, "eval_steps_per_second": 1.338, "step": 120 }, { "epoch": 0.08650580875781948, "grad_norm": 0.037860143929719925, "learning_rate": 1.7310443490701003e-05, "loss": 0.0228, "step": 121 }, { "epoch": 0.08722073279714031, "grad_norm": 0.05506071448326111, "learning_rate": 1.7453505007153077e-05, "loss": 0.0266, "step": 122 }, { "epoch": 0.08793565683646112, "grad_norm": 0.03442307561635971, "learning_rate": 1.759656652360515e-05, "loss": 0.0191, "step": 123 }, { "epoch": 0.08865058087578195, 
"grad_norm": 0.043473776429891586, "learning_rate": 1.7739628040057227e-05, "loss": 0.0204, "step": 124 }, { "epoch": 0.08936550491510277, "grad_norm": 0.03508966788649559, "learning_rate": 1.78826895565093e-05, "loss": 0.0267, "step": 125 }, { "epoch": 0.08936550491510277, "eval_loss": 0.019728153944015503, "eval_runtime": 9.6888, "eval_samples_per_second": 5.161, "eval_steps_per_second": 1.342, "step": 125 }, { "epoch": 0.09008042895442359, "grad_norm": 0.03464685007929802, "learning_rate": 1.8025751072961374e-05, "loss": 0.0176, "step": 126 }, { "epoch": 0.09079535299374442, "grad_norm": 0.038567788898944855, "learning_rate": 1.8168812589413447e-05, "loss": 0.0167, "step": 127 }, { "epoch": 0.09151027703306523, "grad_norm": 0.032014086842536926, "learning_rate": 1.8311874105865524e-05, "loss": 0.0235, "step": 128 }, { "epoch": 0.09222520107238606, "grad_norm": 0.035636402666568756, "learning_rate": 1.8454935622317597e-05, "loss": 0.0195, "step": 129 }, { "epoch": 0.09294012511170688, "grad_norm": 0.038021791726350784, "learning_rate": 1.859799713876967e-05, "loss": 0.0286, "step": 130 }, { "epoch": 0.09294012511170688, "eval_loss": 0.01935666799545288, "eval_runtime": 9.5191, "eval_samples_per_second": 5.253, "eval_steps_per_second": 1.366, "step": 130 }, { "epoch": 0.09365504915102771, "grad_norm": 0.0396159291267395, "learning_rate": 1.8741058655221748e-05, "loss": 0.0204, "step": 131 }, { "epoch": 0.09436997319034852, "grad_norm": 0.04586606100201607, "learning_rate": 1.888412017167382e-05, "loss": 0.0149, "step": 132 }, { "epoch": 0.09508489722966935, "grad_norm": 0.03820332884788513, "learning_rate": 1.9027181688125895e-05, "loss": 0.0259, "step": 133 }, { "epoch": 0.09579982126899017, "grad_norm": 0.036863140761852264, "learning_rate": 1.917024320457797e-05, "loss": 0.0229, "step": 134 }, { "epoch": 0.09651474530831099, "grad_norm": 0.04298887401819229, "learning_rate": 1.931330472103004e-05, "loss": 0.0204, "step": 135 }, { "epoch": 0.09651474530831099, "eval_loss": 0.019136320799589157, "eval_runtime": 9.6408, "eval_samples_per_second": 5.186, "eval_steps_per_second": 1.348, "step": 135 }, { "epoch": 0.09722966934763182, "grad_norm": 0.042081069201231, "learning_rate": 1.945636623748212e-05, "loss": 0.0225, "step": 136 }, { "epoch": 0.09794459338695263, "grad_norm": 0.040681999176740646, "learning_rate": 1.9599427753934195e-05, "loss": 0.0162, "step": 137 }, { "epoch": 0.09865951742627346, "grad_norm": 0.042670901864767075, "learning_rate": 1.9742489270386265e-05, "loss": 0.0243, "step": 138 }, { "epoch": 0.09937444146559428, "grad_norm": 0.045691631734371185, "learning_rate": 1.9885550786838342e-05, "loss": 0.0258, "step": 139 }, { "epoch": 0.10008936550491511, "grad_norm": 0.03945830091834068, "learning_rate": 2.0028612303290416e-05, "loss": 0.0242, "step": 140 }, { "epoch": 0.10008936550491511, "eval_loss": 0.019022315740585327, "eval_runtime": 9.55, "eval_samples_per_second": 5.236, "eval_steps_per_second": 1.361, "step": 140 }, { "epoch": 0.10080428954423593, "grad_norm": 0.04007583484053612, "learning_rate": 2.017167381974249e-05, "loss": 0.0171, "step": 141 }, { "epoch": 0.10151921358355674, "grad_norm": 0.04206077381968498, "learning_rate": 2.0314735336194566e-05, "loss": 0.0237, "step": 142 }, { "epoch": 0.10223413762287757, "grad_norm": 0.037879329174757004, "learning_rate": 2.045779685264664e-05, "loss": 0.0247, "step": 143 }, { "epoch": 0.10294906166219839, "grad_norm": 0.03997822850942612, "learning_rate": 2.0600858369098713e-05, "loss": 0.0218, "step": 144 }, { 
"epoch": 0.10366398570151922, "grad_norm": 0.03970559313893318, "learning_rate": 2.074391988555079e-05, "loss": 0.0228, "step": 145 }, { "epoch": 0.10366398570151922, "eval_loss": 0.018724817782640457, "eval_runtime": 9.7168, "eval_samples_per_second": 5.146, "eval_steps_per_second": 1.338, "step": 145 }, { "epoch": 0.10437890974084003, "grad_norm": 0.04645358398556709, "learning_rate": 2.0886981402002863e-05, "loss": 0.0179, "step": 146 }, { "epoch": 0.10509383378016086, "grad_norm": 0.03722456842660904, "learning_rate": 2.1030042918454937e-05, "loss": 0.0284, "step": 147 }, { "epoch": 0.10580875781948168, "grad_norm": 0.0320768803358078, "learning_rate": 2.1173104434907013e-05, "loss": 0.0256, "step": 148 }, { "epoch": 0.1065236818588025, "grad_norm": 0.031241778284311295, "learning_rate": 2.1316165951359084e-05, "loss": 0.0157, "step": 149 }, { "epoch": 0.10723860589812333, "grad_norm": 0.03884207829833031, "learning_rate": 2.145922746781116e-05, "loss": 0.0248, "step": 150 }, { "epoch": 0.10723860589812333, "eval_loss": 0.018527572974562645, "eval_runtime": 9.694, "eval_samples_per_second": 5.158, "eval_steps_per_second": 1.341, "step": 150 }, { "epoch": 0.10795352993744414, "grad_norm": 0.042844079434871674, "learning_rate": 2.1602288984263234e-05, "loss": 0.0184, "step": 151 }, { "epoch": 0.10866845397676497, "grad_norm": 0.03895885497331619, "learning_rate": 2.1745350500715307e-05, "loss": 0.0294, "step": 152 }, { "epoch": 0.10938337801608579, "grad_norm": 0.03502436354756355, "learning_rate": 2.1888412017167384e-05, "loss": 0.0194, "step": 153 }, { "epoch": 0.11009830205540662, "grad_norm": 0.06940220296382904, "learning_rate": 2.2031473533619458e-05, "loss": 0.0186, "step": 154 }, { "epoch": 0.11081322609472744, "grad_norm": 0.045711129903793335, "learning_rate": 2.217453505007153e-05, "loss": 0.0263, "step": 155 }, { "epoch": 0.11081322609472744, "eval_loss": 0.018343131989240646, "eval_runtime": 9.7378, "eval_samples_per_second": 5.135, "eval_steps_per_second": 1.335, "step": 155 }, { "epoch": 0.11152815013404825, "grad_norm": 0.036760155111551285, "learning_rate": 2.2317596566523608e-05, "loss": 0.0201, "step": 156 }, { "epoch": 0.11224307417336908, "grad_norm": 0.04823607951402664, "learning_rate": 2.246065808297568e-05, "loss": 0.0235, "step": 157 }, { "epoch": 0.1129579982126899, "grad_norm": 0.06112448126077652, "learning_rate": 2.2603719599427755e-05, "loss": 0.0217, "step": 158 }, { "epoch": 0.11367292225201073, "grad_norm": 0.03302643075585365, "learning_rate": 2.2746781115879828e-05, "loss": 0.0252, "step": 159 }, { "epoch": 0.11438784629133154, "grad_norm": 0.04496656730771065, "learning_rate": 2.2889842632331905e-05, "loss": 0.0222, "step": 160 }, { "epoch": 0.11438784629133154, "eval_loss": 0.018444441258907318, "eval_runtime": 9.727, "eval_samples_per_second": 5.14, "eval_steps_per_second": 1.336, "step": 160 }, { "epoch": 0.11510277033065237, "grad_norm": 0.04440809041261673, "learning_rate": 2.303290414878398e-05, "loss": 0.0233, "step": 161 }, { "epoch": 0.11581769436997319, "grad_norm": 0.041358571499586105, "learning_rate": 2.3175965665236052e-05, "loss": 0.0234, "step": 162 }, { "epoch": 0.116532618409294, "grad_norm": 0.04183349758386612, "learning_rate": 2.331902718168813e-05, "loss": 0.0198, "step": 163 }, { "epoch": 0.11724754244861484, "grad_norm": 0.0447751022875309, "learning_rate": 2.3462088698140202e-05, "loss": 0.0186, "step": 164 }, { "epoch": 0.11796246648793565, "grad_norm": 0.04264550283551216, "learning_rate": 2.3605150214592276e-05, "loss": 
0.0185, "step": 165 }, { "epoch": 0.11796246648793565, "eval_loss": 0.01825426146388054, "eval_runtime": 9.7022, "eval_samples_per_second": 5.153, "eval_steps_per_second": 1.34, "step": 165 }, { "epoch": 0.11867739052725648, "grad_norm": 0.03637390583753586, "learning_rate": 2.374821173104435e-05, "loss": 0.0208, "step": 166 }, { "epoch": 0.1193923145665773, "grad_norm": 0.050255224108695984, "learning_rate": 2.3891273247496423e-05, "loss": 0.0224, "step": 167 }, { "epoch": 0.12010723860589813, "grad_norm": 0.04119570925831795, "learning_rate": 2.40343347639485e-05, "loss": 0.0191, "step": 168 }, { "epoch": 0.12082216264521894, "grad_norm": 0.04005882143974304, "learning_rate": 2.4177396280400573e-05, "loss": 0.0229, "step": 169 }, { "epoch": 0.12153708668453976, "grad_norm": 0.0436907596886158, "learning_rate": 2.4320457796852646e-05, "loss": 0.0197, "step": 170 }, { "epoch": 0.12153708668453976, "eval_loss": 0.018159380182623863, "eval_runtime": 9.7234, "eval_samples_per_second": 5.142, "eval_steps_per_second": 1.337, "step": 170 }, { "epoch": 0.12225201072386059, "grad_norm": 0.04689564183354378, "learning_rate": 2.4463519313304723e-05, "loss": 0.019, "step": 171 }, { "epoch": 0.1229669347631814, "grad_norm": 0.03970903530716896, "learning_rate": 2.4606580829756797e-05, "loss": 0.0221, "step": 172 }, { "epoch": 0.12368185880250224, "grad_norm": 0.03555913642048836, "learning_rate": 2.474964234620887e-05, "loss": 0.0274, "step": 173 }, { "epoch": 0.12439678284182305, "grad_norm": 0.0457729734480381, "learning_rate": 2.4892703862660947e-05, "loss": 0.0219, "step": 174 }, { "epoch": 0.12511170688114387, "grad_norm": 0.046179238706827164, "learning_rate": 2.5035765379113017e-05, "loss": 0.0297, "step": 175 }, { "epoch": 0.12511170688114387, "eval_loss": 0.01788422465324402, "eval_runtime": 9.5246, "eval_samples_per_second": 5.25, "eval_steps_per_second": 1.365, "step": 175 }, { "epoch": 0.1258266309204647, "grad_norm": 0.041476182639598846, "learning_rate": 2.5178826895565094e-05, "loss": 0.0265, "step": 176 }, { "epoch": 0.12654155495978553, "grad_norm": 0.034255798906087875, "learning_rate": 2.532188841201717e-05, "loss": 0.0226, "step": 177 }, { "epoch": 0.12725647899910633, "grad_norm": 0.04008769243955612, "learning_rate": 2.5464949928469244e-05, "loss": 0.0183, "step": 178 }, { "epoch": 0.12797140303842716, "grad_norm": 0.042479488998651505, "learning_rate": 2.5608011444921314e-05, "loss": 0.0278, "step": 179 }, { "epoch": 0.128686327077748, "grad_norm": 0.03340573236346245, "learning_rate": 2.575107296137339e-05, "loss": 0.0164, "step": 180 }, { "epoch": 0.128686327077748, "eval_loss": 0.01777827739715576, "eval_runtime": 9.7143, "eval_samples_per_second": 5.147, "eval_steps_per_second": 1.338, "step": 180 }, { "epoch": 0.12940125111706882, "grad_norm": 0.046375010162591934, "learning_rate": 2.5894134477825465e-05, "loss": 0.0237, "step": 181 }, { "epoch": 0.13011617515638962, "grad_norm": 0.036998260766267776, "learning_rate": 2.603719599427754e-05, "loss": 0.0185, "step": 182 }, { "epoch": 0.13083109919571045, "grad_norm": 0.03544262796640396, "learning_rate": 2.6180257510729618e-05, "loss": 0.0246, "step": 183 }, { "epoch": 0.13154602323503128, "grad_norm": 0.04755708575248718, "learning_rate": 2.632331902718169e-05, "loss": 0.0213, "step": 184 }, { "epoch": 0.1322609472743521, "grad_norm": 0.041505903005599976, "learning_rate": 2.6466380543633762e-05, "loss": 0.0234, "step": 185 }, { "epoch": 0.1322609472743521, "eval_loss": 0.017496012151241302, "eval_runtime": 9.6436, 
"eval_samples_per_second": 5.185, "eval_steps_per_second": 1.348, "step": 185 }, { "epoch": 0.13297587131367292, "grad_norm": 0.037543050944805145, "learning_rate": 2.660944206008584e-05, "loss": 0.024, "step": 186 }, { "epoch": 0.13369079535299375, "grad_norm": 0.04594096168875694, "learning_rate": 2.6752503576537912e-05, "loss": 0.0217, "step": 187 }, { "epoch": 0.13440571939231458, "grad_norm": 0.04478172957897186, "learning_rate": 2.689556509298999e-05, "loss": 0.0205, "step": 188 }, { "epoch": 0.13512064343163538, "grad_norm": 0.03514755144715309, "learning_rate": 2.7038626609442062e-05, "loss": 0.0202, "step": 189 }, { "epoch": 0.1358355674709562, "grad_norm": 0.03703011944890022, "learning_rate": 2.7181688125894132e-05, "loss": 0.0298, "step": 190 }, { "epoch": 0.1358355674709562, "eval_loss": 0.017410937696695328, "eval_runtime": 9.673, "eval_samples_per_second": 5.169, "eval_steps_per_second": 1.344, "step": 190 }, { "epoch": 0.13655049151027704, "grad_norm": 0.03919641301035881, "learning_rate": 2.732474964234621e-05, "loss": 0.0202, "step": 191 }, { "epoch": 0.13726541554959787, "grad_norm": 0.0402408093214035, "learning_rate": 2.7467811158798286e-05, "loss": 0.0252, "step": 192 }, { "epoch": 0.13798033958891867, "grad_norm": 0.03372427821159363, "learning_rate": 2.761087267525036e-05, "loss": 0.0267, "step": 193 }, { "epoch": 0.1386952636282395, "grad_norm": 0.044197794049978256, "learning_rate": 2.7753934191702436e-05, "loss": 0.0146, "step": 194 }, { "epoch": 0.13941018766756033, "grad_norm": 0.032698146998882294, "learning_rate": 2.7896995708154506e-05, "loss": 0.0231, "step": 195 }, { "epoch": 0.13941018766756033, "eval_loss": 0.017060289159417152, "eval_runtime": 10.0382, "eval_samples_per_second": 4.981, "eval_steps_per_second": 1.295, "step": 195 }, { "epoch": 0.14012511170688113, "grad_norm": 0.03240122273564339, "learning_rate": 2.804005722460658e-05, "loss": 0.0173, "step": 196 }, { "epoch": 0.14084003574620196, "grad_norm": 0.03941943123936653, "learning_rate": 2.8183118741058657e-05, "loss": 0.0287, "step": 197 }, { "epoch": 0.1415549597855228, "grad_norm": 0.04499538615345955, "learning_rate": 2.832618025751073e-05, "loss": 0.022, "step": 198 }, { "epoch": 0.14226988382484362, "grad_norm": 0.03967476263642311, "learning_rate": 2.8469241773962807e-05, "loss": 0.0142, "step": 199 }, { "epoch": 0.14298480786416443, "grad_norm": 0.04585069417953491, "learning_rate": 2.8612303290414884e-05, "loss": 0.0179, "step": 200 }, { "epoch": 0.14298480786416443, "eval_loss": 0.017181234434247017, "eval_runtime": 4.611, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.819, "step": 200 }, { "epoch": 0.14369973190348526, "grad_norm": 0.04243015497922897, "learning_rate": 2.8755364806866954e-05, "loss": 0.0276, "step": 201 }, { "epoch": 0.14441465594280609, "grad_norm": 0.04968704283237457, "learning_rate": 2.8898426323319027e-05, "loss": 0.021, "step": 202 }, { "epoch": 0.1451295799821269, "grad_norm": 0.043205294758081436, "learning_rate": 2.9041487839771104e-05, "loss": 0.0235, "step": 203 }, { "epoch": 0.14584450402144772, "grad_norm": 0.0406709723174572, "learning_rate": 2.9184549356223178e-05, "loss": 0.0164, "step": 204 }, { "epoch": 0.14655942806076855, "grad_norm": 0.041751131415367126, "learning_rate": 2.9327610872675255e-05, "loss": 0.0207, "step": 205 }, { "epoch": 0.14655942806076855, "eval_loss": 0.017275827005505562, "eval_runtime": 4.6065, "eval_samples_per_second": 10.854, "eval_steps_per_second": 2.822, "step": 205 }, { "epoch": 0.14727435210008938, 
"grad_norm": 0.03581860661506653, "learning_rate": 2.9470672389127325e-05, "loss": 0.0215, "step": 206 }, { "epoch": 0.14798927613941018, "grad_norm": 0.04252663254737854, "learning_rate": 2.9613733905579398e-05, "loss": 0.0239, "step": 207 }, { "epoch": 0.148704200178731, "grad_norm": 0.03467275947332382, "learning_rate": 2.9756795422031475e-05, "loss": 0.0274, "step": 208 }, { "epoch": 0.14941912421805184, "grad_norm": 0.036709822714328766, "learning_rate": 2.9899856938483552e-05, "loss": 0.0202, "step": 209 }, { "epoch": 0.15013404825737264, "grad_norm": 0.03977267071604729, "learning_rate": 3.0042918454935625e-05, "loss": 0.0263, "step": 210 }, { "epoch": 0.15013404825737264, "eval_loss": 0.017209889367222786, "eval_runtime": 4.6018, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 210 }, { "epoch": 0.15084897229669347, "grad_norm": 0.043679334223270416, "learning_rate": 3.0185979971387695e-05, "loss": 0.0208, "step": 211 }, { "epoch": 0.1515638963360143, "grad_norm": 0.037616752088069916, "learning_rate": 3.0329041487839772e-05, "loss": 0.0216, "step": 212 }, { "epoch": 0.15227882037533513, "grad_norm": 0.044638264924287796, "learning_rate": 3.0472103004291846e-05, "loss": 0.0156, "step": 213 }, { "epoch": 0.15299374441465594, "grad_norm": 0.036564696580171585, "learning_rate": 3.061516452074392e-05, "loss": 0.0198, "step": 214 }, { "epoch": 0.15370866845397677, "grad_norm": 0.04377096891403198, "learning_rate": 3.0758226037196e-05, "loss": 0.0204, "step": 215 }, { "epoch": 0.15370866845397677, "eval_loss": 0.017455143854022026, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 215 }, { "epoch": 0.1544235924932976, "grad_norm": 0.04742162674665451, "learning_rate": 3.0901287553648076e-05, "loss": 0.0275, "step": 216 }, { "epoch": 0.1551385165326184, "grad_norm": 0.06221901997923851, "learning_rate": 3.104434907010014e-05, "loss": 0.0338, "step": 217 }, { "epoch": 0.15585344057193923, "grad_norm": 0.04140050709247589, "learning_rate": 3.1187410586552216e-05, "loss": 0.0194, "step": 218 }, { "epoch": 0.15656836461126006, "grad_norm": 0.044630493968725204, "learning_rate": 3.133047210300429e-05, "loss": 0.021, "step": 219 }, { "epoch": 0.1572832886505809, "grad_norm": 0.0451500229537487, "learning_rate": 3.147353361945637e-05, "loss": 0.0222, "step": 220 }, { "epoch": 0.1572832886505809, "eval_loss": 0.017219696193933487, "eval_runtime": 4.6022, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 220 }, { "epoch": 0.1579982126899017, "grad_norm": 0.04439837858080864, "learning_rate": 3.161659513590845e-05, "loss": 0.0206, "step": 221 }, { "epoch": 0.15871313672922252, "grad_norm": 0.032404255121946335, "learning_rate": 3.175965665236052e-05, "loss": 0.0169, "step": 222 }, { "epoch": 0.15942806076854335, "grad_norm": 0.043374765664339066, "learning_rate": 3.190271816881259e-05, "loss": 0.0243, "step": 223 }, { "epoch": 0.16014298480786415, "grad_norm": 0.05310199409723282, "learning_rate": 3.2045779685264664e-05, "loss": 0.0216, "step": 224 }, { "epoch": 0.16085790884718498, "grad_norm": 0.04065870866179466, "learning_rate": 3.218884120171674e-05, "loss": 0.024, "step": 225 }, { "epoch": 0.16085790884718498, "eval_loss": 0.017296236008405685, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 225 }, { "epoch": 0.1615728328865058, "grad_norm": 0.035469427704811096, "learning_rate": 3.233190271816882e-05, "loss": 0.02, "step": 226 }, { "epoch": 
0.16228775692582664, "grad_norm": 0.04220125824213028, "learning_rate": 3.247496423462089e-05, "loss": 0.0279, "step": 227 }, { "epoch": 0.16300268096514745, "grad_norm": 0.03961128741502762, "learning_rate": 3.2618025751072964e-05, "loss": 0.0197, "step": 228 }, { "epoch": 0.16371760500446828, "grad_norm": 0.03918292745947838, "learning_rate": 3.2761087267525034e-05, "loss": 0.0196, "step": 229 }, { "epoch": 0.1644325290437891, "grad_norm": 0.0357474759221077, "learning_rate": 3.290414878397711e-05, "loss": 0.0258, "step": 230 }, { "epoch": 0.1644325290437891, "eval_loss": 0.016940994188189507, "eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 230 }, { "epoch": 0.1651474530831099, "grad_norm": 0.041401706635951996, "learning_rate": 3.304721030042919e-05, "loss": 0.0266, "step": 231 }, { "epoch": 0.16586237712243074, "grad_norm": 0.043397869914770126, "learning_rate": 3.3190271816881265e-05, "loss": 0.0185, "step": 232 }, { "epoch": 0.16657730116175157, "grad_norm": 0.04559559375047684, "learning_rate": 3.3333333333333335e-05, "loss": 0.0177, "step": 233 }, { "epoch": 0.1672922252010724, "grad_norm": 0.031015632674098015, "learning_rate": 3.347639484978541e-05, "loss": 0.0186, "step": 234 }, { "epoch": 0.1680071492403932, "grad_norm": 0.037535570561885834, "learning_rate": 3.361945636623748e-05, "loss": 0.0171, "step": 235 }, { "epoch": 0.1680071492403932, "eval_loss": 0.01675962470471859, "eval_runtime": 4.5926, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 235 }, { "epoch": 0.16872207327971403, "grad_norm": 0.03374025225639343, "learning_rate": 3.376251788268956e-05, "loss": 0.0239, "step": 236 }, { "epoch": 0.16943699731903486, "grad_norm": 0.048096250742673874, "learning_rate": 3.3905579399141636e-05, "loss": 0.0222, "step": 237 }, { "epoch": 0.17015192135835566, "grad_norm": 0.03289438784122467, "learning_rate": 3.4048640915593706e-05, "loss": 0.0167, "step": 238 }, { "epoch": 0.1708668453976765, "grad_norm": 0.03813895955681801, "learning_rate": 3.419170243204578e-05, "loss": 0.0334, "step": 239 }, { "epoch": 0.17158176943699732, "grad_norm": 0.04790525510907173, "learning_rate": 3.433476394849785e-05, "loss": 0.0223, "step": 240 }, { "epoch": 0.17158176943699732, "eval_loss": 0.01669895276427269, "eval_runtime": 4.5933, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 240 }, { "epoch": 0.17229669347631815, "grad_norm": 0.05035453662276268, "learning_rate": 3.447782546494993e-05, "loss": 0.0315, "step": 241 }, { "epoch": 0.17301161751563895, "grad_norm": 0.03603880852460861, "learning_rate": 3.4620886981402006e-05, "loss": 0.0166, "step": 242 }, { "epoch": 0.17372654155495978, "grad_norm": 0.03881281241774559, "learning_rate": 3.4763948497854076e-05, "loss": 0.0258, "step": 243 }, { "epoch": 0.17444146559428061, "grad_norm": 0.046464867889881134, "learning_rate": 3.490701001430615e-05, "loss": 0.026, "step": 244 }, { "epoch": 0.17515638963360142, "grad_norm": 0.04398578777909279, "learning_rate": 3.505007153075823e-05, "loss": 0.0291, "step": 245 }, { "epoch": 0.17515638963360142, "eval_loss": 0.01682448759675026, "eval_runtime": 4.6112, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 245 }, { "epoch": 0.17587131367292225, "grad_norm": 0.038608159869909286, "learning_rate": 3.51931330472103e-05, "loss": 0.0182, "step": 246 }, { "epoch": 0.17658623771224308, "grad_norm": 0.04350871965289116, "learning_rate": 3.533619456366238e-05, "loss": 0.0261, "step": 
247 }, { "epoch": 0.1773011617515639, "grad_norm": 0.044403012841939926, "learning_rate": 3.5479256080114454e-05, "loss": 0.0196, "step": 248 }, { "epoch": 0.1780160857908847, "grad_norm": 0.031681399792432785, "learning_rate": 3.5622317596566524e-05, "loss": 0.0169, "step": 249 }, { "epoch": 0.17873100983020554, "grad_norm": 0.034804731607437134, "learning_rate": 3.57653791130186e-05, "loss": 0.0138, "step": 250 }, { "epoch": 0.17873100983020554, "eval_loss": 0.016402781009674072, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 250 }, { "epoch": 0.17944593386952637, "grad_norm": 0.04489169269800186, "learning_rate": 3.590844062947068e-05, "loss": 0.0263, "step": 251 }, { "epoch": 0.18016085790884717, "grad_norm": 0.04136953130364418, "learning_rate": 3.605150214592275e-05, "loss": 0.0268, "step": 252 }, { "epoch": 0.180875781948168, "grad_norm": 0.0351882129907608, "learning_rate": 3.6194563662374824e-05, "loss": 0.0183, "step": 253 }, { "epoch": 0.18159070598748883, "grad_norm": 0.03557763621211052, "learning_rate": 3.6337625178826894e-05, "loss": 0.0204, "step": 254 }, { "epoch": 0.18230563002680966, "grad_norm": 0.03994687274098396, "learning_rate": 3.648068669527897e-05, "loss": 0.0251, "step": 255 }, { "epoch": 0.18230563002680966, "eval_loss": 0.016236405819654465, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 255 }, { "epoch": 0.18302055406613046, "grad_norm": 0.041298747062683105, "learning_rate": 3.662374821173105e-05, "loss": 0.0254, "step": 256 }, { "epoch": 0.1837354781054513, "grad_norm": 0.03407914191484451, "learning_rate": 3.676680972818312e-05, "loss": 0.0262, "step": 257 }, { "epoch": 0.18445040214477212, "grad_norm": 0.036786869168281555, "learning_rate": 3.6909871244635195e-05, "loss": 0.0155, "step": 258 }, { "epoch": 0.18516532618409293, "grad_norm": 0.04546934738755226, "learning_rate": 3.7052932761087265e-05, "loss": 0.016, "step": 259 }, { "epoch": 0.18588025022341376, "grad_norm": 0.043993718922138214, "learning_rate": 3.719599427753934e-05, "loss": 0.0145, "step": 260 }, { "epoch": 0.18588025022341376, "eval_loss": 0.01629067212343216, "eval_runtime": 4.6072, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 260 }, { "epoch": 0.1865951742627346, "grad_norm": 0.036715976893901825, "learning_rate": 3.733905579399142e-05, "loss": 0.0184, "step": 261 }, { "epoch": 0.18731009830205542, "grad_norm": 0.038279205560684204, "learning_rate": 3.7482117310443496e-05, "loss": 0.0216, "step": 262 }, { "epoch": 0.18802502234137622, "grad_norm": 0.03692355379462242, "learning_rate": 3.7625178826895566e-05, "loss": 0.0315, "step": 263 }, { "epoch": 0.18873994638069705, "grad_norm": 0.033918507397174835, "learning_rate": 3.776824034334764e-05, "loss": 0.0251, "step": 264 }, { "epoch": 0.18945487042001788, "grad_norm": 0.03748100623488426, "learning_rate": 3.791130185979971e-05, "loss": 0.0232, "step": 265 }, { "epoch": 0.18945487042001788, "eval_loss": 0.016455834731459618, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 265 }, { "epoch": 0.1901697944593387, "grad_norm": 0.03911001980304718, "learning_rate": 3.805436337625179e-05, "loss": 0.0175, "step": 266 }, { "epoch": 0.1908847184986595, "grad_norm": 0.03587862849235535, "learning_rate": 3.8197424892703866e-05, "loss": 0.0171, "step": 267 }, { "epoch": 0.19159964253798034, "grad_norm": 0.030233412981033325, "learning_rate": 3.834048640915594e-05, 
"loss": 0.023, "step": 268 }, { "epoch": 0.19231456657730117, "grad_norm": 0.03616298362612724, "learning_rate": 3.848354792560801e-05, "loss": 0.0179, "step": 269 }, { "epoch": 0.19302949061662197, "grad_norm": 0.03582935407757759, "learning_rate": 3.862660944206008e-05, "loss": 0.0207, "step": 270 }, { "epoch": 0.19302949061662197, "eval_loss": 0.016115685924887657, "eval_runtime": 4.5919, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 270 }, { "epoch": 0.1937444146559428, "grad_norm": 0.03806351497769356, "learning_rate": 3.876967095851216e-05, "loss": 0.025, "step": 271 }, { "epoch": 0.19445933869526363, "grad_norm": 0.04092242196202278, "learning_rate": 3.891273247496424e-05, "loss": 0.0189, "step": 272 }, { "epoch": 0.19517426273458446, "grad_norm": 0.035148922353982925, "learning_rate": 3.9055793991416314e-05, "loss": 0.0218, "step": 273 }, { "epoch": 0.19588918677390527, "grad_norm": 0.03595191240310669, "learning_rate": 3.919885550786839e-05, "loss": 0.0236, "step": 274 }, { "epoch": 0.1966041108132261, "grad_norm": 0.03373287618160248, "learning_rate": 3.9341917024320454e-05, "loss": 0.0227, "step": 275 }, { "epoch": 0.1966041108132261, "eval_loss": 0.01615576259791851, "eval_runtime": 4.5894, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 275 }, { "epoch": 0.19731903485254693, "grad_norm": 0.03490866348147392, "learning_rate": 3.948497854077253e-05, "loss": 0.0298, "step": 276 }, { "epoch": 0.19803395889186773, "grad_norm": 0.03343784809112549, "learning_rate": 3.962804005722461e-05, "loss": 0.0193, "step": 277 }, { "epoch": 0.19874888293118856, "grad_norm": 0.02724776230752468, "learning_rate": 3.9771101573676684e-05, "loss": 0.026, "step": 278 }, { "epoch": 0.1994638069705094, "grad_norm": 0.031032856553792953, "learning_rate": 3.991416309012876e-05, "loss": 0.0175, "step": 279 }, { "epoch": 0.20017873100983022, "grad_norm": 0.03315596655011177, "learning_rate": 4.005722460658083e-05, "loss": 0.0199, "step": 280 }, { "epoch": 0.20017873100983022, "eval_loss": 0.01615453138947487, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 280 }, { "epoch": 0.20089365504915102, "grad_norm": 0.0352381095290184, "learning_rate": 4.02002861230329e-05, "loss": 0.0218, "step": 281 }, { "epoch": 0.20160857908847185, "grad_norm": 0.03223356977105141, "learning_rate": 4.034334763948498e-05, "loss": 0.0206, "step": 282 }, { "epoch": 0.20232350312779268, "grad_norm": 0.034352339804172516, "learning_rate": 4.0486409155937055e-05, "loss": 0.0169, "step": 283 }, { "epoch": 0.20303842716711348, "grad_norm": 0.035642318427562714, "learning_rate": 4.062947067238913e-05, "loss": 0.019, "step": 284 }, { "epoch": 0.2037533512064343, "grad_norm": 0.0390208438038826, "learning_rate": 4.077253218884121e-05, "loss": 0.0216, "step": 285 }, { "epoch": 0.2037533512064343, "eval_loss": 0.015760380774736404, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 285 }, { "epoch": 0.20446827524575514, "grad_norm": 0.035041261464357376, "learning_rate": 4.091559370529328e-05, "loss": 0.0196, "step": 286 }, { "epoch": 0.20518319928507597, "grad_norm": 0.03334460407495499, "learning_rate": 4.105865522174535e-05, "loss": 0.0261, "step": 287 }, { "epoch": 0.20589812332439678, "grad_norm": 0.03778688982129097, "learning_rate": 4.1201716738197426e-05, "loss": 0.016, "step": 288 }, { "epoch": 0.2066130473637176, "grad_norm": 0.03744954988360405, "learning_rate": 
4.13447782546495e-05, "loss": 0.0208, "step": 289 }, { "epoch": 0.20732797140303844, "grad_norm": 0.037781305611133575, "learning_rate": 4.148783977110158e-05, "loss": 0.0235, "step": 290 }, { "epoch": 0.20732797140303844, "eval_loss": 0.015767080709338188, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 290 }, { "epoch": 0.20804289544235924, "grad_norm": 0.04693910479545593, "learning_rate": 4.163090128755365e-05, "loss": 0.0254, "step": 291 }, { "epoch": 0.20875781948168007, "grad_norm": 0.03790517523884773, "learning_rate": 4.1773962804005726e-05, "loss": 0.0306, "step": 292 }, { "epoch": 0.2094727435210009, "grad_norm": 0.03439391404390335, "learning_rate": 4.1917024320457796e-05, "loss": 0.0256, "step": 293 }, { "epoch": 0.21018766756032173, "grad_norm": 0.03223295509815216, "learning_rate": 4.206008583690987e-05, "loss": 0.017, "step": 294 }, { "epoch": 0.21090259159964253, "grad_norm": 0.035657111555337906, "learning_rate": 4.220314735336195e-05, "loss": 0.0178, "step": 295 }, { "epoch": 0.21090259159964253, "eval_loss": 0.016034070402383804, "eval_runtime": 4.5918, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 295 }, { "epoch": 0.21161751563896336, "grad_norm": 0.03423287719488144, "learning_rate": 4.234620886981403e-05, "loss": 0.017, "step": 296 }, { "epoch": 0.2123324396782842, "grad_norm": 0.042552974075078964, "learning_rate": 4.24892703862661e-05, "loss": 0.0199, "step": 297 }, { "epoch": 0.213047363717605, "grad_norm": 0.03131406009197235, "learning_rate": 4.263233190271817e-05, "loss": 0.0211, "step": 298 }, { "epoch": 0.21376228775692582, "grad_norm": 0.04598421975970268, "learning_rate": 4.2775393419170244e-05, "loss": 0.026, "step": 299 }, { "epoch": 0.21447721179624665, "grad_norm": 0.03696037083864212, "learning_rate": 4.291845493562232e-05, "loss": 0.0168, "step": 300 }, { "epoch": 0.21447721179624665, "eval_loss": 0.015587572939693928, "eval_runtime": 4.6195, "eval_samples_per_second": 10.824, "eval_steps_per_second": 2.814, "step": 300 }, { "epoch": 0.21519213583556748, "grad_norm": 0.033249326050281525, "learning_rate": 4.30615164520744e-05, "loss": 0.0163, "step": 301 }, { "epoch": 0.21590705987488829, "grad_norm": 0.0377253033220768, "learning_rate": 4.320457796852647e-05, "loss": 0.0209, "step": 302 }, { "epoch": 0.21662198391420912, "grad_norm": 0.03423323482275009, "learning_rate": 4.3347639484978544e-05, "loss": 0.0316, "step": 303 }, { "epoch": 0.21733690795352995, "grad_norm": 0.039379242807626724, "learning_rate": 4.3490701001430615e-05, "loss": 0.0172, "step": 304 }, { "epoch": 0.21805183199285075, "grad_norm": 0.04135379195213318, "learning_rate": 4.363376251788269e-05, "loss": 0.0235, "step": 305 }, { "epoch": 0.21805183199285075, "eval_loss": 0.0158777367323637, "eval_runtime": 4.6011, "eval_samples_per_second": 10.867, "eval_steps_per_second": 2.825, "step": 305 }, { "epoch": 0.21876675603217158, "grad_norm": 0.03823694959282875, "learning_rate": 4.377682403433477e-05, "loss": 0.0224, "step": 306 }, { "epoch": 0.2194816800714924, "grad_norm": 0.03432565927505493, "learning_rate": 4.391988555078684e-05, "loss": 0.0196, "step": 307 }, { "epoch": 0.22019660411081324, "grad_norm": 0.03366350382566452, "learning_rate": 4.4062947067238915e-05, "loss": 0.015, "step": 308 }, { "epoch": 0.22091152815013404, "grad_norm": 0.03745386004447937, "learning_rate": 4.420600858369099e-05, "loss": 0.0207, "step": 309 }, { "epoch": 0.22162645218945487, "grad_norm": 0.04112129658460617, 
"learning_rate": 4.434907010014306e-05, "loss": 0.0159, "step": 310 }, { "epoch": 0.22162645218945487, "eval_loss": 0.0156700499355793, "eval_runtime": 4.5836, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 310 }, { "epoch": 0.2223413762287757, "grad_norm": 0.03107276000082493, "learning_rate": 4.449213161659514e-05, "loss": 0.0154, "step": 311 }, { "epoch": 0.2230563002680965, "grad_norm": 0.03719180077314377, "learning_rate": 4.4635193133047216e-05, "loss": 0.0176, "step": 312 }, { "epoch": 0.22377122430741733, "grad_norm": 0.04199746996164322, "learning_rate": 4.4778254649499286e-05, "loss": 0.0175, "step": 313 }, { "epoch": 0.22448614834673816, "grad_norm": 0.036725662648677826, "learning_rate": 4.492131616595136e-05, "loss": 0.0205, "step": 314 }, { "epoch": 0.225201072386059, "grad_norm": 0.032895248383283615, "learning_rate": 4.506437768240343e-05, "loss": 0.0193, "step": 315 }, { "epoch": 0.225201072386059, "eval_loss": 0.015615073963999748, "eval_runtime": 4.5935, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 315 }, { "epoch": 0.2259159964253798, "grad_norm": 0.024261536076664925, "learning_rate": 4.520743919885551e-05, "loss": 0.0151, "step": 316 }, { "epoch": 0.22663092046470062, "grad_norm": 0.03155319020152092, "learning_rate": 4.5350500715307586e-05, "loss": 0.0245, "step": 317 }, { "epoch": 0.22734584450402145, "grad_norm": 0.031224185600876808, "learning_rate": 4.5493562231759656e-05, "loss": 0.0225, "step": 318 }, { "epoch": 0.22806076854334226, "grad_norm": 0.03898489102721214, "learning_rate": 4.563662374821173e-05, "loss": 0.0211, "step": 319 }, { "epoch": 0.2287756925826631, "grad_norm": 0.03373716026544571, "learning_rate": 4.577968526466381e-05, "loss": 0.0241, "step": 320 }, { "epoch": 0.2287756925826631, "eval_loss": 0.015392209403216839, "eval_runtime": 4.5899, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 320 }, { "epoch": 0.22949061662198392, "grad_norm": 0.04045966640114784, "learning_rate": 4.592274678111588e-05, "loss": 0.0237, "step": 321 }, { "epoch": 0.23020554066130475, "grad_norm": 0.038675274699926376, "learning_rate": 4.606580829756796e-05, "loss": 0.022, "step": 322 }, { "epoch": 0.23092046470062555, "grad_norm": 0.032209500670433044, "learning_rate": 4.620886981402003e-05, "loss": 0.0199, "step": 323 }, { "epoch": 0.23163538873994638, "grad_norm": 0.03816978260874748, "learning_rate": 4.6351931330472104e-05, "loss": 0.0167, "step": 324 }, { "epoch": 0.2323503127792672, "grad_norm": 0.03688159957528114, "learning_rate": 4.649499284692418e-05, "loss": 0.0204, "step": 325 }, { "epoch": 0.2323503127792672, "eval_loss": 0.015513686463236809, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 325 }, { "epoch": 0.233065236818588, "grad_norm": 0.033467963337898254, "learning_rate": 4.663805436337626e-05, "loss": 0.0206, "step": 326 }, { "epoch": 0.23378016085790884, "grad_norm": 0.045416511595249176, "learning_rate": 4.678111587982833e-05, "loss": 0.0194, "step": 327 }, { "epoch": 0.23449508489722967, "grad_norm": 0.030489258468151093, "learning_rate": 4.6924177396280405e-05, "loss": 0.0196, "step": 328 }, { "epoch": 0.2352100089365505, "grad_norm": 0.05016322806477547, "learning_rate": 4.7067238912732475e-05, "loss": 0.025, "step": 329 }, { "epoch": 0.2359249329758713, "grad_norm": 0.03419274091720581, "learning_rate": 4.721030042918455e-05, "loss": 0.0225, "step": 330 }, { "epoch": 0.2359249329758713, "eval_loss": 
0.015677740797400475, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 330 }, { "epoch": 0.23663985701519213, "grad_norm": 0.051440414041280746, "learning_rate": 4.735336194563663e-05, "loss": 0.0233, "step": 331 }, { "epoch": 0.23735478105451296, "grad_norm": 0.03153642639517784, "learning_rate": 4.74964234620887e-05, "loss": 0.0172, "step": 332 }, { "epoch": 0.23806970509383377, "grad_norm": 0.037176452577114105, "learning_rate": 4.7639484978540775e-05, "loss": 0.0165, "step": 333 }, { "epoch": 0.2387846291331546, "grad_norm": 0.0447237491607666, "learning_rate": 4.7782546494992845e-05, "loss": 0.0164, "step": 334 }, { "epoch": 0.23949955317247543, "grad_norm": 0.03729422390460968, "learning_rate": 4.792560801144492e-05, "loss": 0.0176, "step": 335 }, { "epoch": 0.23949955317247543, "eval_loss": 0.015609651803970337, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 335 }, { "epoch": 0.24021447721179626, "grad_norm": 0.031058330088853836, "learning_rate": 4.8068669527897e-05, "loss": 0.0164, "step": 336 }, { "epoch": 0.24092940125111706, "grad_norm": 0.04572097584605217, "learning_rate": 4.8211731044349076e-05, "loss": 0.0249, "step": 337 }, { "epoch": 0.2416443252904379, "grad_norm": 0.03700479865074158, "learning_rate": 4.8354792560801146e-05, "loss": 0.0218, "step": 338 }, { "epoch": 0.24235924932975872, "grad_norm": 0.03446533530950546, "learning_rate": 4.8497854077253216e-05, "loss": 0.0135, "step": 339 }, { "epoch": 0.24307417336907952, "grad_norm": 0.030244816094636917, "learning_rate": 4.864091559370529e-05, "loss": 0.0122, "step": 340 }, { "epoch": 0.24307417336907952, "eval_loss": 0.01618138886988163, "eval_runtime": 4.5943, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.83, "step": 340 }, { "epoch": 0.24378909740840035, "grad_norm": 0.04792407900094986, "learning_rate": 4.878397711015737e-05, "loss": 0.0193, "step": 341 }, { "epoch": 0.24450402144772118, "grad_norm": 0.039650481194257736, "learning_rate": 4.8927038626609446e-05, "loss": 0.0209, "step": 342 }, { "epoch": 0.245218945487042, "grad_norm": 0.03580614924430847, "learning_rate": 4.907010014306152e-05, "loss": 0.0269, "step": 343 }, { "epoch": 0.2459338695263628, "grad_norm": 0.0274807121604681, "learning_rate": 4.921316165951359e-05, "loss": 0.0229, "step": 344 }, { "epoch": 0.24664879356568364, "grad_norm": 0.03292469307780266, "learning_rate": 4.935622317596566e-05, "loss": 0.0238, "step": 345 }, { "epoch": 0.24664879356568364, "eval_loss": 0.01575453020632267, "eval_runtime": 4.5908, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 345 }, { "epoch": 0.24736371760500447, "grad_norm": 0.03163152560591698, "learning_rate": 4.949928469241774e-05, "loss": 0.015, "step": 346 }, { "epoch": 0.2480786416443253, "grad_norm": 0.03378206118941307, "learning_rate": 4.964234620886982e-05, "loss": 0.0163, "step": 347 }, { "epoch": 0.2487935656836461, "grad_norm": 0.03625296428799629, "learning_rate": 4.9785407725321894e-05, "loss": 0.02, "step": 348 }, { "epoch": 0.24950848972296694, "grad_norm": 0.03936266899108887, "learning_rate": 4.992846924177397e-05, "loss": 0.0173, "step": 349 }, { "epoch": 0.25022341376228774, "grad_norm": 0.0350416861474514, "learning_rate": 5.0071530758226034e-05, "loss": 0.021, "step": 350 }, { "epoch": 0.25022341376228774, "eval_loss": 0.015687230974435806, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 350 }, { 
"epoch": 0.25093833780160857, "grad_norm": 0.035547997802495956, "learning_rate": 5.021459227467812e-05, "loss": 0.0187, "step": 351 }, { "epoch": 0.2516532618409294, "grad_norm": 0.05379074066877365, "learning_rate": 5.035765379113019e-05, "loss": 0.0228, "step": 352 }, { "epoch": 0.25236818588025023, "grad_norm": 0.033798184245824814, "learning_rate": 5.050071530758226e-05, "loss": 0.0143, "step": 353 }, { "epoch": 0.25308310991957106, "grad_norm": 0.038714420050382614, "learning_rate": 5.064377682403434e-05, "loss": 0.0177, "step": 354 }, { "epoch": 0.2537980339588919, "grad_norm": 0.037953607738018036, "learning_rate": 5.078683834048641e-05, "loss": 0.0185, "step": 355 }, { "epoch": 0.2537980339588919, "eval_loss": 0.015223164111375809, "eval_runtime": 4.5865, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 355 }, { "epoch": 0.25451295799821266, "grad_norm": 0.03536302596330643, "learning_rate": 5.092989985693849e-05, "loss": 0.0153, "step": 356 }, { "epoch": 0.2552278820375335, "grad_norm": 0.031815771013498306, "learning_rate": 5.107296137339056e-05, "loss": 0.0198, "step": 357 }, { "epoch": 0.2559428060768543, "grad_norm": 0.03302175924181938, "learning_rate": 5.121602288984263e-05, "loss": 0.0208, "step": 358 }, { "epoch": 0.25665773011617515, "grad_norm": 0.03797147050499916, "learning_rate": 5.135908440629471e-05, "loss": 0.0202, "step": 359 }, { "epoch": 0.257372654155496, "grad_norm": 0.033040788024663925, "learning_rate": 5.150214592274678e-05, "loss": 0.0278, "step": 360 }, { "epoch": 0.257372654155496, "eval_loss": 0.01522147748619318, "eval_runtime": 4.6118, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 360 }, { "epoch": 0.2580875781948168, "grad_norm": 0.03122352994978428, "learning_rate": 5.164520743919886e-05, "loss": 0.03, "step": 361 }, { "epoch": 0.25880250223413764, "grad_norm": 0.037834662944078445, "learning_rate": 5.178826895565093e-05, "loss": 0.0161, "step": 362 }, { "epoch": 0.2595174262734584, "grad_norm": 0.034495241940021515, "learning_rate": 5.1931330472103e-05, "loss": 0.0207, "step": 363 }, { "epoch": 0.26023235031277925, "grad_norm": 0.03398492559790611, "learning_rate": 5.207439198855508e-05, "loss": 0.0225, "step": 364 }, { "epoch": 0.2609472743521001, "grad_norm": 0.034835755825042725, "learning_rate": 5.221745350500715e-05, "loss": 0.0319, "step": 365 }, { "epoch": 0.2609472743521001, "eval_loss": 0.015344860032200813, "eval_runtime": 4.6059, "eval_samples_per_second": 10.856, "eval_steps_per_second": 2.822, "step": 365 }, { "epoch": 0.2616621983914209, "grad_norm": 0.03244407847523689, "learning_rate": 5.2360515021459236e-05, "loss": 0.0201, "step": 366 }, { "epoch": 0.26237712243074174, "grad_norm": 0.030593225732445717, "learning_rate": 5.2503576537911306e-05, "loss": 0.0207, "step": 367 }, { "epoch": 0.26309204647006257, "grad_norm": 0.02883586287498474, "learning_rate": 5.264663805436338e-05, "loss": 0.0216, "step": 368 }, { "epoch": 0.2638069705093834, "grad_norm": 0.031928520649671555, "learning_rate": 5.278969957081545e-05, "loss": 0.0155, "step": 369 }, { "epoch": 0.2645218945487042, "grad_norm": 0.029555220156908035, "learning_rate": 5.2932761087267523e-05, "loss": 0.0224, "step": 370 }, { "epoch": 0.2645218945487042, "eval_loss": 0.015093752183020115, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 370 }, { "epoch": 0.265236818588025, "grad_norm": 0.031050892546772957, "learning_rate": 5.307582260371961e-05, "loss": 0.0237, "step": 
371 }, { "epoch": 0.26595174262734583, "grad_norm": 0.03734516724944115, "learning_rate": 5.321888412017168e-05, "loss": 0.016, "step": 372 }, { "epoch": 0.26666666666666666, "grad_norm": 0.03385945409536362, "learning_rate": 5.3361945636623754e-05, "loss": 0.0216, "step": 373 }, { "epoch": 0.2673815907059875, "grad_norm": 0.03023538738489151, "learning_rate": 5.3505007153075824e-05, "loss": 0.0188, "step": 374 }, { "epoch": 0.2680965147453083, "grad_norm": 0.03082319162786007, "learning_rate": 5.3648068669527894e-05, "loss": 0.0175, "step": 375 }, { "epoch": 0.2680965147453083, "eval_loss": 0.015100045129656792, "eval_runtime": 4.6, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 375 }, { "epoch": 0.26881143878462915, "grad_norm": 0.031494855880737305, "learning_rate": 5.379113018597998e-05, "loss": 0.0272, "step": 376 }, { "epoch": 0.2695263628239499, "grad_norm": 0.035347990691661835, "learning_rate": 5.393419170243205e-05, "loss": 0.0219, "step": 377 }, { "epoch": 0.27024128686327076, "grad_norm": 0.028593935072422028, "learning_rate": 5.4077253218884125e-05, "loss": 0.0222, "step": 378 }, { "epoch": 0.2709562109025916, "grad_norm": 0.02419453114271164, "learning_rate": 5.4220314735336195e-05, "loss": 0.0132, "step": 379 }, { "epoch": 0.2716711349419124, "grad_norm": 0.0313359871506691, "learning_rate": 5.4363376251788265e-05, "loss": 0.0212, "step": 380 }, { "epoch": 0.2716711349419124, "eval_loss": 0.015232650563120842, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 380 }, { "epoch": 0.27238605898123325, "grad_norm": 0.031670309603214264, "learning_rate": 5.450643776824035e-05, "loss": 0.0215, "step": 381 }, { "epoch": 0.2731009830205541, "grad_norm": 0.03041732870042324, "learning_rate": 5.464949928469242e-05, "loss": 0.0213, "step": 382 }, { "epoch": 0.2738159070598749, "grad_norm": 0.02918359637260437, "learning_rate": 5.47925608011445e-05, "loss": 0.0204, "step": 383 }, { "epoch": 0.27453083109919574, "grad_norm": 0.03162863105535507, "learning_rate": 5.493562231759657e-05, "loss": 0.0164, "step": 384 }, { "epoch": 0.2752457551385165, "grad_norm": 0.03356940299272537, "learning_rate": 5.507868383404864e-05, "loss": 0.0261, "step": 385 }, { "epoch": 0.2752457551385165, "eval_loss": 0.015180889517068863, "eval_runtime": 4.5857, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 385 }, { "epoch": 0.27596067917783734, "grad_norm": 0.03340638056397438, "learning_rate": 5.522174535050072e-05, "loss": 0.0254, "step": 386 }, { "epoch": 0.2766756032171582, "grad_norm": 0.029027249664068222, "learning_rate": 5.536480686695279e-05, "loss": 0.0219, "step": 387 }, { "epoch": 0.277390527256479, "grad_norm": 0.033809028565883636, "learning_rate": 5.550786838340487e-05, "loss": 0.023, "step": 388 }, { "epoch": 0.27810545129579983, "grad_norm": 0.029779301956295967, "learning_rate": 5.565092989985694e-05, "loss": 0.023, "step": 389 }, { "epoch": 0.27882037533512066, "grad_norm": 0.029455268755555153, "learning_rate": 5.579399141630901e-05, "loss": 0.0278, "step": 390 }, { "epoch": 0.27882037533512066, "eval_loss": 0.015145715326070786, "eval_runtime": 4.6442, "eval_samples_per_second": 10.766, "eval_steps_per_second": 2.799, "step": 390 }, { "epoch": 0.2795352993744415, "grad_norm": 0.023406632244586945, "learning_rate": 5.593705293276109e-05, "loss": 0.0162, "step": 391 }, { "epoch": 0.28025022341376227, "grad_norm": 0.03577807545661926, "learning_rate": 5.608011444921316e-05, "loss": 0.0238, 
"step": 392 }, { "epoch": 0.2809651474530831, "grad_norm": 0.03208368644118309, "learning_rate": 5.622317596566524e-05, "loss": 0.016, "step": 393 }, { "epoch": 0.2816800714924039, "grad_norm": 0.03808039054274559, "learning_rate": 5.6366237482117313e-05, "loss": 0.02, "step": 394 }, { "epoch": 0.28239499553172476, "grad_norm": 0.04231199622154236, "learning_rate": 5.6509298998569384e-05, "loss": 0.0163, "step": 395 }, { "epoch": 0.28239499553172476, "eval_loss": 0.015092173591256142, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 395 }, { "epoch": 0.2831099195710456, "grad_norm": 0.023967541754245758, "learning_rate": 5.665236051502146e-05, "loss": 0.0156, "step": 396 }, { "epoch": 0.2838248436103664, "grad_norm": 0.03869270905852318, "learning_rate": 5.679542203147353e-05, "loss": 0.0168, "step": 397 }, { "epoch": 0.28453976764968725, "grad_norm": 0.03230039030313492, "learning_rate": 5.6938483547925614e-05, "loss": 0.0206, "step": 398 }, { "epoch": 0.285254691689008, "grad_norm": 0.029226789250969887, "learning_rate": 5.7081545064377684e-05, "loss": 0.0239, "step": 399 }, { "epoch": 0.28596961572832885, "grad_norm": 0.0276876799762249, "learning_rate": 5.722460658082977e-05, "loss": 0.0154, "step": 400 }, { "epoch": 0.28596961572832885, "eval_loss": 0.014901846647262573, "eval_runtime": 4.5926, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 400 }, { "epoch": 0.2866845397676497, "grad_norm": 0.03752603754401207, "learning_rate": 5.736766809728184e-05, "loss": 0.0166, "step": 401 }, { "epoch": 0.2873994638069705, "grad_norm": 0.024154113605618477, "learning_rate": 5.751072961373391e-05, "loss": 0.015, "step": 402 }, { "epoch": 0.28811438784629134, "grad_norm": 0.026974158361554146, "learning_rate": 5.7653791130185985e-05, "loss": 0.015, "step": 403 }, { "epoch": 0.28882931188561217, "grad_norm": 0.03305017203092575, "learning_rate": 5.7796852646638055e-05, "loss": 0.0224, "step": 404 }, { "epoch": 0.289544235924933, "grad_norm": 0.03352276608347893, "learning_rate": 5.793991416309014e-05, "loss": 0.0231, "step": 405 }, { "epoch": 0.289544235924933, "eval_loss": 0.014785420149564743, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 405 }, { "epoch": 0.2902591599642538, "grad_norm": 0.025250667706131935, "learning_rate": 5.808297567954221e-05, "loss": 0.019, "step": 406 }, { "epoch": 0.2909740840035746, "grad_norm": 0.03176187723875046, "learning_rate": 5.822603719599428e-05, "loss": 0.0245, "step": 407 }, { "epoch": 0.29168900804289544, "grad_norm": 0.035841699689626694, "learning_rate": 5.8369098712446355e-05, "loss": 0.0218, "step": 408 }, { "epoch": 0.29240393208221627, "grad_norm": 0.028857413679361343, "learning_rate": 5.8512160228898425e-05, "loss": 0.0184, "step": 409 }, { "epoch": 0.2931188561215371, "grad_norm": 0.027913937345147133, "learning_rate": 5.865522174535051e-05, "loss": 0.0211, "step": 410 }, { "epoch": 0.2931188561215371, "eval_loss": 0.014987439848482609, "eval_runtime": 4.5956, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 410 }, { "epoch": 0.2938337801608579, "grad_norm": 0.03788643330335617, "learning_rate": 5.879828326180258e-05, "loss": 0.0167, "step": 411 }, { "epoch": 0.29454870420017876, "grad_norm": 0.02558186836540699, "learning_rate": 5.894134477825465e-05, "loss": 0.0213, "step": 412 }, { "epoch": 0.29526362823949953, "grad_norm": 0.026676848530769348, "learning_rate": 5.9084406294706726e-05, "loss": 
0.0156, "step": 413 }, { "epoch": 0.29597855227882036, "grad_norm": 0.03475512936711311, "learning_rate": 5.9227467811158796e-05, "loss": 0.0261, "step": 414 }, { "epoch": 0.2966934763181412, "grad_norm": 0.023404987528920174, "learning_rate": 5.937052932761088e-05, "loss": 0.0175, "step": 415 }, { "epoch": 0.2966934763181412, "eval_loss": 0.014811079949140549, "eval_runtime": 4.5837, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 415 }, { "epoch": 0.297408400357462, "grad_norm": 0.029419662430882454, "learning_rate": 5.951359084406295e-05, "loss": 0.0194, "step": 416 }, { "epoch": 0.29812332439678285, "grad_norm": 0.02912021242082119, "learning_rate": 5.965665236051502e-05, "loss": 0.0239, "step": 417 }, { "epoch": 0.2988382484361037, "grad_norm": 0.027610793709754944, "learning_rate": 5.9799713876967103e-05, "loss": 0.0189, "step": 418 }, { "epoch": 0.2995531724754245, "grad_norm": 0.028670048341155052, "learning_rate": 5.9942775393419173e-05, "loss": 0.0164, "step": 419 }, { "epoch": 0.3002680965147453, "grad_norm": 0.03132447972893715, "learning_rate": 6.008583690987125e-05, "loss": 0.0253, "step": 420 }, { "epoch": 0.3002680965147453, "eval_loss": 0.014700024388730526, "eval_runtime": 4.5854, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 420 }, { "epoch": 0.3009830205540661, "grad_norm": 0.03666383773088455, "learning_rate": 6.022889842632332e-05, "loss": 0.0328, "step": 421 }, { "epoch": 0.30169794459338695, "grad_norm": 0.028829652816057205, "learning_rate": 6.037195994277539e-05, "loss": 0.0222, "step": 422 }, { "epoch": 0.3024128686327078, "grad_norm": 0.02907199040055275, "learning_rate": 6.0515021459227474e-05, "loss": 0.0245, "step": 423 }, { "epoch": 0.3031277926720286, "grad_norm": 0.032229844480752945, "learning_rate": 6.0658082975679544e-05, "loss": 0.0215, "step": 424 }, { "epoch": 0.30384271671134944, "grad_norm": 0.030400235205888748, "learning_rate": 6.080114449213162e-05, "loss": 0.0209, "step": 425 }, { "epoch": 0.30384271671134944, "eval_loss": 0.014661336317658424, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 425 }, { "epoch": 0.30455764075067027, "grad_norm": 0.0255142692476511, "learning_rate": 6.094420600858369e-05, "loss": 0.0151, "step": 426 }, { "epoch": 0.30527256478999104, "grad_norm": 0.03183441236615181, "learning_rate": 6.108726752503577e-05, "loss": 0.0192, "step": 427 }, { "epoch": 0.30598748882931187, "grad_norm": 0.03157901391386986, "learning_rate": 6.123032904148784e-05, "loss": 0.0288, "step": 428 }, { "epoch": 0.3067024128686327, "grad_norm": 0.022977767512202263, "learning_rate": 6.137339055793991e-05, "loss": 0.013, "step": 429 }, { "epoch": 0.30741733690795353, "grad_norm": 0.026813944801688194, "learning_rate": 6.1516452074392e-05, "loss": 0.0182, "step": 430 }, { "epoch": 0.30741733690795353, "eval_loss": 0.014624308794736862, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 430 }, { "epoch": 0.30813226094727436, "grad_norm": 0.03398825600743294, "learning_rate": 6.165951359084406e-05, "loss": 0.0221, "step": 431 }, { "epoch": 0.3088471849865952, "grad_norm": 0.032999105751514435, "learning_rate": 6.180257510729615e-05, "loss": 0.0185, "step": 432 }, { "epoch": 0.309562109025916, "grad_norm": 0.025628099218010902, "learning_rate": 6.194563662374822e-05, "loss": 0.0201, "step": 433 }, { "epoch": 0.3102770330652368, "grad_norm": 0.028820572420954704, "learning_rate": 
6.208869814020028e-05, "loss": 0.0197, "step": 434 }, { "epoch": 0.3109919571045576, "grad_norm": 0.025667032226920128, "learning_rate": 6.223175965665237e-05, "loss": 0.0156, "step": 435 }, { "epoch": 0.3109919571045576, "eval_loss": 0.014807099476456642, "eval_runtime": 4.579, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 435 }, { "epoch": 0.31170688114387846, "grad_norm": 0.029176905751228333, "learning_rate": 6.237482117310443e-05, "loss": 0.0224, "step": 436 }, { "epoch": 0.3124218051831993, "grad_norm": 0.032533735036849976, "learning_rate": 6.251788268955652e-05, "loss": 0.0224, "step": 437 }, { "epoch": 0.3131367292225201, "grad_norm": 0.03173421695828438, "learning_rate": 6.266094420600859e-05, "loss": 0.0309, "step": 438 }, { "epoch": 0.31385165326184095, "grad_norm": 0.0243076141923666, "learning_rate": 6.280400572246066e-05, "loss": 0.0173, "step": 439 }, { "epoch": 0.3145665773011618, "grad_norm": 0.02635042928159237, "learning_rate": 6.294706723891274e-05, "loss": 0.0181, "step": 440 }, { "epoch": 0.3145665773011618, "eval_loss": 0.015073521994054317, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 440 }, { "epoch": 0.31528150134048255, "grad_norm": 0.02489721029996872, "learning_rate": 6.30901287553648e-05, "loss": 0.0215, "step": 441 }, { "epoch": 0.3159964253798034, "grad_norm": 0.03026987798511982, "learning_rate": 6.32331902718169e-05, "loss": 0.0231, "step": 442 }, { "epoch": 0.3167113494191242, "grad_norm": 0.0285785011947155, "learning_rate": 6.337625178826896e-05, "loss": 0.0232, "step": 443 }, { "epoch": 0.31742627345844504, "grad_norm": 0.027361318469047546, "learning_rate": 6.351931330472103e-05, "loss": 0.0215, "step": 444 }, { "epoch": 0.31814119749776587, "grad_norm": 0.024541562423110008, "learning_rate": 6.366237482117311e-05, "loss": 0.0155, "step": 445 }, { "epoch": 0.31814119749776587, "eval_loss": 0.014634760096669197, "eval_runtime": 4.601, "eval_samples_per_second": 10.867, "eval_steps_per_second": 2.825, "step": 445 }, { "epoch": 0.3188561215370867, "grad_norm": 0.02336801216006279, "learning_rate": 6.380543633762517e-05, "loss": 0.0209, "step": 446 }, { "epoch": 0.31957104557640753, "grad_norm": 0.027382683008909225, "learning_rate": 6.394849785407726e-05, "loss": 0.0168, "step": 447 }, { "epoch": 0.3202859696157283, "grad_norm": 0.02473495714366436, "learning_rate": 6.409155937052933e-05, "loss": 0.019, "step": 448 }, { "epoch": 0.32100089365504914, "grad_norm": 0.026897931471467018, "learning_rate": 6.42346208869814e-05, "loss": 0.0173, "step": 449 }, { "epoch": 0.32171581769436997, "grad_norm": 0.028469974175095558, "learning_rate": 6.437768240343348e-05, "loss": 0.0285, "step": 450 }, { "epoch": 0.32171581769436997, "eval_loss": 0.014480854384601116, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 450 }, { "epoch": 0.3224307417336908, "grad_norm": 0.027518998831510544, "learning_rate": 6.452074391988556e-05, "loss": 0.0232, "step": 451 }, { "epoch": 0.3231456657730116, "grad_norm": 0.029539406299591064, "learning_rate": 6.466380543633763e-05, "loss": 0.0243, "step": 452 }, { "epoch": 0.32386058981233246, "grad_norm": 0.025783803313970566, "learning_rate": 6.48068669527897e-05, "loss": 0.0211, "step": 453 }, { "epoch": 0.3245755138516533, "grad_norm": 0.027744924649596214, "learning_rate": 6.494992846924177e-05, "loss": 0.0279, "step": 454 }, { "epoch": 0.32529043789097406, "grad_norm": 0.025200719013810158, 
"learning_rate": 6.509298998569385e-05, "loss": 0.0214, "step": 455 }, { "epoch": 0.32529043789097406, "eval_loss": 0.014736389741301537, "eval_runtime": 4.5865, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 455 }, { "epoch": 0.3260053619302949, "grad_norm": 0.024501733481884003, "learning_rate": 6.523605150214593e-05, "loss": 0.0228, "step": 456 }, { "epoch": 0.3267202859696157, "grad_norm": 0.026232441887259483, "learning_rate": 6.5379113018598e-05, "loss": 0.0332, "step": 457 }, { "epoch": 0.32743521000893655, "grad_norm": 0.02283094823360443, "learning_rate": 6.552217453505007e-05, "loss": 0.0179, "step": 458 }, { "epoch": 0.3281501340482574, "grad_norm": 0.034485552459955215, "learning_rate": 6.566523605150215e-05, "loss": 0.0192, "step": 459 }, { "epoch": 0.3288650580875782, "grad_norm": 0.040262144058942795, "learning_rate": 6.580829756795422e-05, "loss": 0.0198, "step": 460 }, { "epoch": 0.3288650580875782, "eval_loss": 0.01484442874789238, "eval_runtime": 4.6036, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.824, "step": 460 }, { "epoch": 0.32957998212689904, "grad_norm": 0.02432776428759098, "learning_rate": 6.59513590844063e-05, "loss": 0.0205, "step": 461 }, { "epoch": 0.3302949061662198, "grad_norm": 0.026828939095139503, "learning_rate": 6.609442060085838e-05, "loss": 0.0217, "step": 462 }, { "epoch": 0.33100983020554064, "grad_norm": 0.027654776349663734, "learning_rate": 6.623748211731044e-05, "loss": 0.024, "step": 463 }, { "epoch": 0.3317247542448615, "grad_norm": 0.026019440963864326, "learning_rate": 6.638054363376253e-05, "loss": 0.02, "step": 464 }, { "epoch": 0.3324396782841823, "grad_norm": 0.032330453395843506, "learning_rate": 6.652360515021459e-05, "loss": 0.0184, "step": 465 }, { "epoch": 0.3324396782841823, "eval_loss": 0.014592702500522137, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 465 }, { "epoch": 0.33315460232350314, "grad_norm": 0.019898969680070877, "learning_rate": 6.666666666666667e-05, "loss": 0.012, "step": 466 }, { "epoch": 0.33386952636282397, "grad_norm": 0.024030594155192375, "learning_rate": 6.680972818311875e-05, "loss": 0.0169, "step": 467 }, { "epoch": 0.3345844504021448, "grad_norm": 0.024787520989775658, "learning_rate": 6.695278969957082e-05, "loss": 0.0238, "step": 468 }, { "epoch": 0.33529937444146557, "grad_norm": 0.02897603251039982, "learning_rate": 6.70958512160229e-05, "loss": 0.0205, "step": 469 }, { "epoch": 0.3360142984807864, "grad_norm": 0.028750836849212646, "learning_rate": 6.723891273247496e-05, "loss": 0.0245, "step": 470 }, { "epoch": 0.3360142984807864, "eval_loss": 0.014713117852807045, "eval_runtime": 4.5973, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 470 }, { "epoch": 0.33672922252010723, "grad_norm": 0.03134264051914215, "learning_rate": 6.738197424892704e-05, "loss": 0.0177, "step": 471 }, { "epoch": 0.33744414655942806, "grad_norm": 0.030902203172445297, "learning_rate": 6.752503576537912e-05, "loss": 0.016, "step": 472 }, { "epoch": 0.3381590705987489, "grad_norm": 0.0332665778696537, "learning_rate": 6.76680972818312e-05, "loss": 0.019, "step": 473 }, { "epoch": 0.3388739946380697, "grad_norm": 0.029507843777537346, "learning_rate": 6.781115879828327e-05, "loss": 0.0165, "step": 474 }, { "epoch": 0.33958891867739055, "grad_norm": 0.02070349082350731, "learning_rate": 6.795422031473533e-05, "loss": 0.0167, "step": 475 }, { "epoch": 0.33958891867739055, "eval_loss": 
0.014558276161551476, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 475 }, { "epoch": 0.3403038427167113, "grad_norm": 0.026492523029446602, "learning_rate": 6.809728183118741e-05, "loss": 0.0201, "step": 476 }, { "epoch": 0.34101876675603215, "grad_norm": 0.030563710257411003, "learning_rate": 6.824034334763949e-05, "loss": 0.0128, "step": 477 }, { "epoch": 0.341733690795353, "grad_norm": 0.024015624076128006, "learning_rate": 6.838340486409156e-05, "loss": 0.0151, "step": 478 }, { "epoch": 0.3424486148346738, "grad_norm": 0.029101716354489326, "learning_rate": 6.852646638054364e-05, "loss": 0.0251, "step": 479 }, { "epoch": 0.34316353887399464, "grad_norm": 0.02356347069144249, "learning_rate": 6.86695278969957e-05, "loss": 0.018, "step": 480 }, { "epoch": 0.34316353887399464, "eval_loss": 0.01459542103111744, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 480 }, { "epoch": 0.3438784629133155, "grad_norm": 0.03104209527373314, "learning_rate": 6.881258941344778e-05, "loss": 0.023, "step": 481 }, { "epoch": 0.3445933869526363, "grad_norm": 0.02594764158129692, "learning_rate": 6.895565092989986e-05, "loss": 0.0202, "step": 482 }, { "epoch": 0.3453083109919571, "grad_norm": 0.023989731445908546, "learning_rate": 6.909871244635194e-05, "loss": 0.0189, "step": 483 }, { "epoch": 0.3460232350312779, "grad_norm": 0.02726910263299942, "learning_rate": 6.924177396280401e-05, "loss": 0.0243, "step": 484 }, { "epoch": 0.34673815907059874, "grad_norm": 0.026585185900330544, "learning_rate": 6.938483547925609e-05, "loss": 0.016, "step": 485 }, { "epoch": 0.34673815907059874, "eval_loss": 0.014926137402653694, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 485 }, { "epoch": 0.34745308310991957, "grad_norm": 0.03482940420508385, "learning_rate": 6.952789699570815e-05, "loss": 0.019, "step": 486 }, { "epoch": 0.3481680071492404, "grad_norm": 0.027847424149513245, "learning_rate": 6.967095851216023e-05, "loss": 0.0175, "step": 487 }, { "epoch": 0.34888293118856123, "grad_norm": 0.02637483924627304, "learning_rate": 6.98140200286123e-05, "loss": 0.0184, "step": 488 }, { "epoch": 0.34959785522788206, "grad_norm": 0.02641034498810768, "learning_rate": 6.995708154506438e-05, "loss": 0.0128, "step": 489 }, { "epoch": 0.35031277926720283, "grad_norm": 0.027217496186494827, "learning_rate": 7.010014306151646e-05, "loss": 0.0281, "step": 490 }, { "epoch": 0.35031277926720283, "eval_loss": 0.014825839549303055, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 490 }, { "epoch": 0.35102770330652366, "grad_norm": 0.022761955857276917, "learning_rate": 7.024320457796852e-05, "loss": 0.0169, "step": 491 }, { "epoch": 0.3517426273458445, "grad_norm": 0.023172497749328613, "learning_rate": 7.03862660944206e-05, "loss": 0.0171, "step": 492 }, { "epoch": 0.3524575513851653, "grad_norm": 0.028591593727469444, "learning_rate": 7.052932761087268e-05, "loss": 0.0181, "step": 493 }, { "epoch": 0.35317247542448615, "grad_norm": 0.024305181577801704, "learning_rate": 7.067238912732475e-05, "loss": 0.0258, "step": 494 }, { "epoch": 0.353887399463807, "grad_norm": 0.025321301072835922, "learning_rate": 7.081545064377683e-05, "loss": 0.0212, "step": 495 }, { "epoch": 0.353887399463807, "eval_loss": 0.01461215689778328, "eval_runtime": 4.5919, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 495 }, { 
"epoch": 0.3546023235031278, "grad_norm": 0.02601565606892109, "learning_rate": 7.095851216022891e-05, "loss": 0.0191, "step": 496 }, { "epoch": 0.3553172475424486, "grad_norm": 0.0251152440905571, "learning_rate": 7.110157367668097e-05, "loss": 0.015, "step": 497 }, { "epoch": 0.3560321715817694, "grad_norm": 0.025103801861405373, "learning_rate": 7.124463519313305e-05, "loss": 0.0157, "step": 498 }, { "epoch": 0.35674709562109025, "grad_norm": 0.032078325748443604, "learning_rate": 7.138769670958512e-05, "loss": 0.0233, "step": 499 }, { "epoch": 0.3574620196604111, "grad_norm": 0.02442081645131111, "learning_rate": 7.15307582260372e-05, "loss": 0.0178, "step": 500 }, { "epoch": 0.3574620196604111, "eval_loss": 0.01479727029800415, "eval_runtime": 4.6083, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 500 }, { "epoch": 0.3581769436997319, "grad_norm": 0.022252729162573814, "learning_rate": 7.167381974248928e-05, "loss": 0.0166, "step": 501 }, { "epoch": 0.35889186773905274, "grad_norm": 0.032646581530570984, "learning_rate": 7.181688125894135e-05, "loss": 0.038, "step": 502 }, { "epoch": 0.35960679177837357, "grad_norm": 0.023852908983826637, "learning_rate": 7.195994277539342e-05, "loss": 0.0171, "step": 503 }, { "epoch": 0.36032171581769434, "grad_norm": 0.023859970271587372, "learning_rate": 7.21030042918455e-05, "loss": 0.0206, "step": 504 }, { "epoch": 0.3610366398570152, "grad_norm": 0.028886616230010986, "learning_rate": 7.224606580829757e-05, "loss": 0.0199, "step": 505 }, { "epoch": 0.3610366398570152, "eval_loss": 0.014492945745587349, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 505 }, { "epoch": 0.361751563896336, "grad_norm": 0.024249356240034103, "learning_rate": 7.238912732474965e-05, "loss": 0.0164, "step": 506 }, { "epoch": 0.36246648793565683, "grad_norm": 0.022851092740893364, "learning_rate": 7.253218884120173e-05, "loss": 0.0194, "step": 507 }, { "epoch": 0.36318141197497766, "grad_norm": 0.030980458483099937, "learning_rate": 7.267525035765379e-05, "loss": 0.0248, "step": 508 }, { "epoch": 0.3638963360142985, "grad_norm": 0.020735226571559906, "learning_rate": 7.281831187410587e-05, "loss": 0.0209, "step": 509 }, { "epoch": 0.3646112600536193, "grad_norm": 0.028116052970290184, "learning_rate": 7.296137339055794e-05, "loss": 0.0141, "step": 510 }, { "epoch": 0.3646112600536193, "eval_loss": 0.01430391613394022, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 510 }, { "epoch": 0.3653261840929401, "grad_norm": 0.03110167197883129, "learning_rate": 7.310443490701002e-05, "loss": 0.0355, "step": 511 }, { "epoch": 0.36604110813226093, "grad_norm": 0.021910877898335457, "learning_rate": 7.32474964234621e-05, "loss": 0.0206, "step": 512 }, { "epoch": 0.36675603217158176, "grad_norm": 0.020496245473623276, "learning_rate": 7.339055793991416e-05, "loss": 0.0139, "step": 513 }, { "epoch": 0.3674709562109026, "grad_norm": 0.027573563158512115, "learning_rate": 7.353361945636624e-05, "loss": 0.0261, "step": 514 }, { "epoch": 0.3681858802502234, "grad_norm": 0.02623247168958187, "learning_rate": 7.367668097281831e-05, "loss": 0.021, "step": 515 }, { "epoch": 0.3681858802502234, "eval_loss": 0.014204764738678932, "eval_runtime": 4.6018, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 515 }, { "epoch": 0.36890080428954425, "grad_norm": 0.027543148025870323, "learning_rate": 7.381974248927039e-05, "loss": 0.0171, "step": 516 
}, { "epoch": 0.3696157283288651, "grad_norm": 0.035177379846572876, "learning_rate": 7.396280400572247e-05, "loss": 0.0232, "step": 517 }, { "epoch": 0.37033065236818585, "grad_norm": 0.022856447845697403, "learning_rate": 7.410586552217453e-05, "loss": 0.0164, "step": 518 }, { "epoch": 0.3710455764075067, "grad_norm": 0.024114098399877548, "learning_rate": 7.424892703862662e-05, "loss": 0.0219, "step": 519 }, { "epoch": 0.3717605004468275, "grad_norm": 0.02727317437529564, "learning_rate": 7.439198855507868e-05, "loss": 0.0163, "step": 520 }, { "epoch": 0.3717605004468275, "eval_loss": 0.014371303841471672, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 520 }, { "epoch": 0.37247542448614834, "grad_norm": 0.02574791945517063, "learning_rate": 7.453505007153076e-05, "loss": 0.0255, "step": 521 }, { "epoch": 0.3731903485254692, "grad_norm": 0.022294405847787857, "learning_rate": 7.467811158798284e-05, "loss": 0.0203, "step": 522 }, { "epoch": 0.37390527256479, "grad_norm": 0.03155333548784256, "learning_rate": 7.48211731044349e-05, "loss": 0.0179, "step": 523 }, { "epoch": 0.37462019660411083, "grad_norm": 0.03206067159771919, "learning_rate": 7.496423462088699e-05, "loss": 0.0292, "step": 524 }, { "epoch": 0.3753351206434316, "grad_norm": 0.02397489920258522, "learning_rate": 7.510729613733905e-05, "loss": 0.0281, "step": 525 }, { "epoch": 0.3753351206434316, "eval_loss": 0.014458928257226944, "eval_runtime": 4.6154, "eval_samples_per_second": 10.833, "eval_steps_per_second": 2.817, "step": 525 }, { "epoch": 0.37605004468275244, "grad_norm": 0.02801380679011345, "learning_rate": 7.525035765379113e-05, "loss": 0.0138, "step": 526 }, { "epoch": 0.37676496872207327, "grad_norm": 0.02448059804737568, "learning_rate": 7.539341917024321e-05, "loss": 0.0173, "step": 527 }, { "epoch": 0.3774798927613941, "grad_norm": 0.023408057168126106, "learning_rate": 7.553648068669528e-05, "loss": 0.018, "step": 528 }, { "epoch": 0.37819481680071493, "grad_norm": 0.02208542451262474, "learning_rate": 7.567954220314736e-05, "loss": 0.0163, "step": 529 }, { "epoch": 0.37890974084003576, "grad_norm": 0.023817123845219612, "learning_rate": 7.582260371959943e-05, "loss": 0.017, "step": 530 }, { "epoch": 0.37890974084003576, "eval_loss": 0.014133991673588753, "eval_runtime": 4.5867, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 530 }, { "epoch": 0.3796246648793566, "grad_norm": 0.026265783235430717, "learning_rate": 7.59656652360515e-05, "loss": 0.0176, "step": 531 }, { "epoch": 0.3803395889186774, "grad_norm": 0.02352309040725231, "learning_rate": 7.610872675250358e-05, "loss": 0.021, "step": 532 }, { "epoch": 0.3810545129579982, "grad_norm": 0.030242834240198135, "learning_rate": 7.625178826895566e-05, "loss": 0.0236, "step": 533 }, { "epoch": 0.381769436997319, "grad_norm": 0.025673046708106995, "learning_rate": 7.639484978540773e-05, "loss": 0.0254, "step": 534 }, { "epoch": 0.38248436103663985, "grad_norm": 0.02806665189564228, "learning_rate": 7.65379113018598e-05, "loss": 0.0197, "step": 535 }, { "epoch": 0.38248436103663985, "eval_loss": 0.014310309663414955, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 535 }, { "epoch": 0.3831992850759607, "grad_norm": 0.027680424973368645, "learning_rate": 7.668097281831189e-05, "loss": 0.0241, "step": 536 }, { "epoch": 0.3839142091152815, "grad_norm": 0.01994943618774414, "learning_rate": 7.682403433476395e-05, "loss": 0.018, "step": 537 
}, { "epoch": 0.38462913315460234, "grad_norm": 0.02543514221906662, "learning_rate": 7.696709585121603e-05, "loss": 0.0132, "step": 538 }, { "epoch": 0.3853440571939232, "grad_norm": 0.0241459421813488, "learning_rate": 7.71101573676681e-05, "loss": 0.0233, "step": 539 }, { "epoch": 0.38605898123324395, "grad_norm": 0.020523283630609512, "learning_rate": 7.725321888412017e-05, "loss": 0.0181, "step": 540 }, { "epoch": 0.38605898123324395, "eval_loss": 0.014695288613438606, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 540 }, { "epoch": 0.3867739052725648, "grad_norm": 0.02364308014512062, "learning_rate": 7.739628040057226e-05, "loss": 0.0173, "step": 541 }, { "epoch": 0.3874888293118856, "grad_norm": 0.027408121153712273, "learning_rate": 7.753934191702432e-05, "loss": 0.0191, "step": 542 }, { "epoch": 0.38820375335120644, "grad_norm": 0.02361464872956276, "learning_rate": 7.76824034334764e-05, "loss": 0.02, "step": 543 }, { "epoch": 0.38891867739052727, "grad_norm": 0.020285554230213165, "learning_rate": 7.782546494992847e-05, "loss": 0.016, "step": 544 }, { "epoch": 0.3896336014298481, "grad_norm": 0.02384919859468937, "learning_rate": 7.796852646638054e-05, "loss": 0.0204, "step": 545 }, { "epoch": 0.3896336014298481, "eval_loss": 0.014362729154527187, "eval_runtime": 4.5878, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 545 }, { "epoch": 0.3903485254691689, "grad_norm": 0.022700300440192223, "learning_rate": 7.811158798283263e-05, "loss": 0.0195, "step": 546 }, { "epoch": 0.3910634495084897, "grad_norm": 0.02360803633928299, "learning_rate": 7.825464949928469e-05, "loss": 0.0147, "step": 547 }, { "epoch": 0.39177837354781053, "grad_norm": 0.024287667125463486, "learning_rate": 7.839771101573678e-05, "loss": 0.0229, "step": 548 }, { "epoch": 0.39249329758713136, "grad_norm": 0.019094601273536682, "learning_rate": 7.854077253218884e-05, "loss": 0.0144, "step": 549 }, { "epoch": 0.3932082216264522, "grad_norm": 0.021112283691763878, "learning_rate": 7.868383404864091e-05, "loss": 0.0119, "step": 550 }, { "epoch": 0.3932082216264522, "eval_loss": 0.01412997767329216, "eval_runtime": 4.5904, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 550 }, { "epoch": 0.393923145665773, "grad_norm": 0.021841008216142654, "learning_rate": 7.8826895565093e-05, "loss": 0.0151, "step": 551 }, { "epoch": 0.39463806970509385, "grad_norm": 0.02735757827758789, "learning_rate": 7.896995708154506e-05, "loss": 0.02, "step": 552 }, { "epoch": 0.3953529937444147, "grad_norm": 0.02956685982644558, "learning_rate": 7.911301859799715e-05, "loss": 0.021, "step": 553 }, { "epoch": 0.39606791778373546, "grad_norm": 0.019207589328289032, "learning_rate": 7.925608011444922e-05, "loss": 0.0215, "step": 554 }, { "epoch": 0.3967828418230563, "grad_norm": 0.026574745774269104, "learning_rate": 7.939914163090129e-05, "loss": 0.0227, "step": 555 }, { "epoch": 0.3967828418230563, "eval_loss": 0.01443383190780878, "eval_runtime": 4.6188, "eval_samples_per_second": 10.825, "eval_steps_per_second": 2.815, "step": 555 }, { "epoch": 0.3974977658623771, "grad_norm": 0.02198890410363674, "learning_rate": 7.954220314735337e-05, "loss": 0.0186, "step": 556 }, { "epoch": 0.39821268990169795, "grad_norm": 0.021094990894198418, "learning_rate": 7.968526466380543e-05, "loss": 0.0203, "step": 557 }, { "epoch": 0.3989276139410188, "grad_norm": 0.023728732019662857, "learning_rate": 7.982832618025752e-05, "loss": 0.0153, "step": 558 }, 
{ "epoch": 0.3996425379803396, "grad_norm": 0.024050623178482056, "learning_rate": 7.997138769670959e-05, "loss": 0.022, "step": 559 }, { "epoch": 0.40035746201966044, "grad_norm": 0.027505459263920784, "learning_rate": 8.011444921316166e-05, "loss": 0.0212, "step": 560 }, { "epoch": 0.40035746201966044, "eval_loss": 0.014456554315984249, "eval_runtime": 4.6074, "eval_samples_per_second": 10.852, "eval_steps_per_second": 2.822, "step": 560 }, { "epoch": 0.4010723860589812, "grad_norm": 0.022423220798373222, "learning_rate": 8.025751072961374e-05, "loss": 0.0161, "step": 561 }, { "epoch": 0.40178731009830204, "grad_norm": 0.025117523968219757, "learning_rate": 8.04005722460658e-05, "loss": 0.0235, "step": 562 }, { "epoch": 0.40250223413762287, "grad_norm": 0.026917561888694763, "learning_rate": 8.054363376251789e-05, "loss": 0.0111, "step": 563 }, { "epoch": 0.4032171581769437, "grad_norm": 0.01924188621342182, "learning_rate": 8.068669527896996e-05, "loss": 0.0155, "step": 564 }, { "epoch": 0.40393208221626453, "grad_norm": 0.025978002697229385, "learning_rate": 8.082975679542205e-05, "loss": 0.0185, "step": 565 }, { "epoch": 0.40393208221626453, "eval_loss": 0.014628004282712936, "eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 565 }, { "epoch": 0.40464700625558536, "grad_norm": 0.022502701729536057, "learning_rate": 8.097281831187411e-05, "loss": 0.0169, "step": 566 }, { "epoch": 0.4053619302949062, "grad_norm": 0.023913582786917686, "learning_rate": 8.111587982832617e-05, "loss": 0.0157, "step": 567 }, { "epoch": 0.40607685433422697, "grad_norm": 0.02532842941582203, "learning_rate": 8.125894134477826e-05, "loss": 0.0228, "step": 568 }, { "epoch": 0.4067917783735478, "grad_norm": 0.027906831353902817, "learning_rate": 8.140200286123033e-05, "loss": 0.0318, "step": 569 }, { "epoch": 0.4075067024128686, "grad_norm": 0.03185643255710602, "learning_rate": 8.154506437768242e-05, "loss": 0.021, "step": 570 }, { "epoch": 0.4075067024128686, "eval_loss": 0.014600967988371849, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 570 }, { "epoch": 0.40822162645218946, "grad_norm": 0.020003551617264748, "learning_rate": 8.168812589413448e-05, "loss": 0.0154, "step": 571 }, { "epoch": 0.4089365504915103, "grad_norm": 0.02483760379254818, "learning_rate": 8.183118741058656e-05, "loss": 0.0216, "step": 572 }, { "epoch": 0.4096514745308311, "grad_norm": 0.026173269376158714, "learning_rate": 8.197424892703863e-05, "loss": 0.0189, "step": 573 }, { "epoch": 0.41036639857015195, "grad_norm": 0.020420163869857788, "learning_rate": 8.21173104434907e-05, "loss": 0.02, "step": 574 }, { "epoch": 0.4110813226094727, "grad_norm": 0.021667901426553726, "learning_rate": 8.226037195994279e-05, "loss": 0.0151, "step": 575 }, { "epoch": 0.4110813226094727, "eval_loss": 0.014476409181952477, "eval_runtime": 4.6093, "eval_samples_per_second": 10.848, "eval_steps_per_second": 2.82, "step": 575 }, { "epoch": 0.41179624664879355, "grad_norm": 0.022191612049937248, "learning_rate": 8.240343347639485e-05, "loss": 0.0217, "step": 576 }, { "epoch": 0.4125111706881144, "grad_norm": 0.02811339870095253, "learning_rate": 8.254649499284693e-05, "loss": 0.0195, "step": 577 }, { "epoch": 0.4132260947274352, "grad_norm": 0.025414321571588516, "learning_rate": 8.2689556509299e-05, "loss": 0.0196, "step": 578 }, { "epoch": 0.41394101876675604, "grad_norm": 0.024699656292796135, "learning_rate": 8.283261802575107e-05, "loss": 0.024, "step": 
579 }, { "epoch": 0.41465594280607687, "grad_norm": 0.021396536380052567, "learning_rate": 8.297567954220316e-05, "loss": 0.021, "step": 580 }, { "epoch": 0.41465594280607687, "eval_loss": 0.014421781525015831, "eval_runtime": 4.5888, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 580 }, { "epoch": 0.4153708668453977, "grad_norm": 0.018569545820355415, "learning_rate": 8.311874105865522e-05, "loss": 0.0127, "step": 581 }, { "epoch": 0.4160857908847185, "grad_norm": 0.02308753691613674, "learning_rate": 8.32618025751073e-05, "loss": 0.0162, "step": 582 }, { "epoch": 0.4168007149240393, "grad_norm": 0.024251293390989304, "learning_rate": 8.340486409155938e-05, "loss": 0.0229, "step": 583 }, { "epoch": 0.41751563896336014, "grad_norm": 0.02770734578371048, "learning_rate": 8.354792560801145e-05, "loss": 0.0298, "step": 584 }, { "epoch": 0.41823056300268097, "grad_norm": 0.028191978111863136, "learning_rate": 8.369098712446353e-05, "loss": 0.0152, "step": 585 }, { "epoch": 0.41823056300268097, "eval_loss": 0.014693663455545902, "eval_runtime": 4.595, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 585 }, { "epoch": 0.4189454870420018, "grad_norm": 0.02411368116736412, "learning_rate": 8.383404864091559e-05, "loss": 0.0187, "step": 586 }, { "epoch": 0.4196604110813226, "grad_norm": 0.022046590223908424, "learning_rate": 8.397711015736767e-05, "loss": 0.0203, "step": 587 }, { "epoch": 0.42037533512064346, "grad_norm": 0.023166274651885033, "learning_rate": 8.412017167381975e-05, "loss": 0.0185, "step": 588 }, { "epoch": 0.42109025915996423, "grad_norm": 0.02187022753059864, "learning_rate": 8.426323319027182e-05, "loss": 0.0213, "step": 589 }, { "epoch": 0.42180518319928506, "grad_norm": 0.021702125668525696, "learning_rate": 8.44062947067239e-05, "loss": 0.0277, "step": 590 }, { "epoch": 0.42180518319928506, "eval_loss": 0.014249236322939396, "eval_runtime": 4.5917, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 590 }, { "epoch": 0.4225201072386059, "grad_norm": 0.02252824418246746, "learning_rate": 8.454935622317596e-05, "loss": 0.0188, "step": 591 }, { "epoch": 0.4232350312779267, "grad_norm": 0.021665530279278755, "learning_rate": 8.469241773962805e-05, "loss": 0.0202, "step": 592 }, { "epoch": 0.42394995531724755, "grad_norm": 0.025072718039155006, "learning_rate": 8.483547925608012e-05, "loss": 0.0187, "step": 593 }, { "epoch": 0.4246648793565684, "grad_norm": 0.02405088022351265, "learning_rate": 8.49785407725322e-05, "loss": 0.021, "step": 594 }, { "epoch": 0.4253798033958892, "grad_norm": 0.026226801797747612, "learning_rate": 8.512160228898427e-05, "loss": 0.0214, "step": 595 }, { "epoch": 0.4253798033958892, "eval_loss": 0.014429312199354172, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 595 }, { "epoch": 0.42609472743521, "grad_norm": 0.020968452095985413, "learning_rate": 8.526466380543633e-05, "loss": 0.0188, "step": 596 }, { "epoch": 0.4268096514745308, "grad_norm": 0.0233134925365448, "learning_rate": 8.540772532188842e-05, "loss": 0.02, "step": 597 }, { "epoch": 0.42752457551385165, "grad_norm": 0.016125699505209923, "learning_rate": 8.555078683834049e-05, "loss": 0.0169, "step": 598 }, { "epoch": 0.4282394995531725, "grad_norm": 0.020275849848985672, "learning_rate": 8.569384835479256e-05, "loss": 0.0163, "step": 599 }, { "epoch": 0.4289544235924933, "grad_norm": 0.0314834862947464, "learning_rate": 8.583690987124464e-05, "loss": 0.0157, 
"step": 600 }, { "epoch": 0.4289544235924933, "eval_loss": 0.014335427433252335, "eval_runtime": 4.5988, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 600 }, { "epoch": 0.42966934763181414, "grad_norm": 0.023715589195489883, "learning_rate": 8.597997138769672e-05, "loss": 0.015, "step": 601 }, { "epoch": 0.43038427167113497, "grad_norm": 0.018992142751812935, "learning_rate": 8.61230329041488e-05, "loss": 0.0135, "step": 602 }, { "epoch": 0.43109919571045574, "grad_norm": 0.026263991370797157, "learning_rate": 8.626609442060086e-05, "loss": 0.0198, "step": 603 }, { "epoch": 0.43181411974977657, "grad_norm": 0.03035571239888668, "learning_rate": 8.640915593705294e-05, "loss": 0.0188, "step": 604 }, { "epoch": 0.4325290437890974, "grad_norm": 0.020235901698470116, "learning_rate": 8.655221745350501e-05, "loss": 0.0153, "step": 605 }, { "epoch": 0.4325290437890974, "eval_loss": 0.014507073909044266, "eval_runtime": 4.5893, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 605 }, { "epoch": 0.43324396782841823, "grad_norm": 0.025256073102355003, "learning_rate": 8.669527896995709e-05, "loss": 0.0141, "step": 606 }, { "epoch": 0.43395889186773906, "grad_norm": 0.01912717893719673, "learning_rate": 8.683834048640917e-05, "loss": 0.0169, "step": 607 }, { "epoch": 0.4346738159070599, "grad_norm": 0.022770175710320473, "learning_rate": 8.698140200286123e-05, "loss": 0.0192, "step": 608 }, { "epoch": 0.4353887399463807, "grad_norm": 0.02555985562503338, "learning_rate": 8.71244635193133e-05, "loss": 0.0185, "step": 609 }, { "epoch": 0.4361036639857015, "grad_norm": 0.03775324672460556, "learning_rate": 8.726752503576538e-05, "loss": 0.0235, "step": 610 }, { "epoch": 0.4361036639857015, "eval_loss": 0.014168654568493366, "eval_runtime": 4.6117, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 610 }, { "epoch": 0.4368185880250223, "grad_norm": 0.026313092559576035, "learning_rate": 8.741058655221746e-05, "loss": 0.025, "step": 611 }, { "epoch": 0.43753351206434316, "grad_norm": 0.020955080166459084, "learning_rate": 8.755364806866954e-05, "loss": 0.0237, "step": 612 }, { "epoch": 0.438248436103664, "grad_norm": 0.03089262545108795, "learning_rate": 8.76967095851216e-05, "loss": 0.0281, "step": 613 }, { "epoch": 0.4389633601429848, "grad_norm": 0.03158244118094444, "learning_rate": 8.783977110157368e-05, "loss": 0.022, "step": 614 }, { "epoch": 0.43967828418230565, "grad_norm": 0.022998718544840813, "learning_rate": 8.798283261802575e-05, "loss": 0.0191, "step": 615 }, { "epoch": 0.43967828418230565, "eval_loss": 0.014927368611097336, "eval_runtime": 4.6017, "eval_samples_per_second": 10.866, "eval_steps_per_second": 2.825, "step": 615 }, { "epoch": 0.4403932082216265, "grad_norm": 0.024411024525761604, "learning_rate": 8.812589413447783e-05, "loss": 0.0173, "step": 616 }, { "epoch": 0.44110813226094725, "grad_norm": 0.033707547932863235, "learning_rate": 8.826895565092991e-05, "loss": 0.0186, "step": 617 }, { "epoch": 0.4418230563002681, "grad_norm": 0.024133117869496346, "learning_rate": 8.841201716738198e-05, "loss": 0.016, "step": 618 }, { "epoch": 0.4425379803395889, "grad_norm": 0.021649956703186035, "learning_rate": 8.855507868383405e-05, "loss": 0.0178, "step": 619 }, { "epoch": 0.44325290437890974, "grad_norm": 0.034358102828264236, "learning_rate": 8.869814020028612e-05, "loss": 0.0201, "step": 620 }, { "epoch": 0.44325290437890974, "eval_loss": 0.014856396242976189, "eval_runtime": 4.6107, 
"eval_samples_per_second": 10.844, "eval_steps_per_second": 2.82, "step": 620 }, { "epoch": 0.44396782841823057, "grad_norm": 0.033885300159454346, "learning_rate": 8.88412017167382e-05, "loss": 0.0231, "step": 621 }, { "epoch": 0.4446827524575514, "grad_norm": 0.02660813368856907, "learning_rate": 8.898426323319028e-05, "loss": 0.0266, "step": 622 }, { "epoch": 0.44539767649687223, "grad_norm": 0.028708411380648613, "learning_rate": 8.912732474964235e-05, "loss": 0.0166, "step": 623 }, { "epoch": 0.446112600536193, "grad_norm": 0.030185602605342865, "learning_rate": 8.927038626609443e-05, "loss": 0.0184, "step": 624 }, { "epoch": 0.44682752457551383, "grad_norm": 0.026603084057569504, "learning_rate": 8.94134477825465e-05, "loss": 0.0131, "step": 625 }, { "epoch": 0.44682752457551383, "eval_loss": 0.014097415842115879, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 625 }, { "epoch": 0.44754244861483466, "grad_norm": 0.018286406993865967, "learning_rate": 8.955650929899857e-05, "loss": 0.0218, "step": 626 }, { "epoch": 0.4482573726541555, "grad_norm": 0.02438417822122574, "learning_rate": 8.969957081545065e-05, "loss": 0.0159, "step": 627 }, { "epoch": 0.4489722966934763, "grad_norm": 0.03877907991409302, "learning_rate": 8.984263233190273e-05, "loss": 0.0203, "step": 628 }, { "epoch": 0.44968722073279715, "grad_norm": 0.023460296913981438, "learning_rate": 8.99856938483548e-05, "loss": 0.0193, "step": 629 }, { "epoch": 0.450402144772118, "grad_norm": 0.023340146988630295, "learning_rate": 9.012875536480687e-05, "loss": 0.0224, "step": 630 }, { "epoch": 0.450402144772118, "eval_loss": 0.014926099218428135, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 630 }, { "epoch": 0.45111706881143876, "grad_norm": 0.027752121910452843, "learning_rate": 9.027181688125894e-05, "loss": 0.0219, "step": 631 }, { "epoch": 0.4518319928507596, "grad_norm": 0.02427881956100464, "learning_rate": 9.041487839771102e-05, "loss": 0.0216, "step": 632 }, { "epoch": 0.4525469168900804, "grad_norm": 0.018094541504979134, "learning_rate": 9.05579399141631e-05, "loss": 0.0216, "step": 633 }, { "epoch": 0.45326184092940125, "grad_norm": 0.01985222101211548, "learning_rate": 9.070100143061517e-05, "loss": 0.018, "step": 634 }, { "epoch": 0.4539767649687221, "grad_norm": 0.020941777154803276, "learning_rate": 9.084406294706725e-05, "loss": 0.0237, "step": 635 }, { "epoch": 0.4539767649687221, "eval_loss": 0.0149126211181283, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 635 }, { "epoch": 0.4546916890080429, "grad_norm": 0.02431618794798851, "learning_rate": 9.098712446351931e-05, "loss": 0.0244, "step": 636 }, { "epoch": 0.45540661304736374, "grad_norm": 0.030253667384386063, "learning_rate": 9.113018597997139e-05, "loss": 0.0199, "step": 637 }, { "epoch": 0.4561215370866845, "grad_norm": 0.021024927496910095, "learning_rate": 9.127324749642347e-05, "loss": 0.019, "step": 638 }, { "epoch": 0.45683646112600534, "grad_norm": 0.021203961223363876, "learning_rate": 9.141630901287554e-05, "loss": 0.0188, "step": 639 }, { "epoch": 0.4575513851653262, "grad_norm": 0.025936704128980637, "learning_rate": 9.155937052932762e-05, "loss": 0.0158, "step": 640 }, { "epoch": 0.4575513851653262, "eval_loss": 0.014876837842166424, "eval_runtime": 4.6319, "eval_samples_per_second": 10.795, "eval_steps_per_second": 2.807, "step": 640 }, { "epoch": 0.458266309204647, "grad_norm": 
0.0228131003677845, "learning_rate": 9.170243204577968e-05, "loss": 0.018, "step": 641 }, { "epoch": 0.45898123324396783, "grad_norm": 0.020687520503997803, "learning_rate": 9.184549356223176e-05, "loss": 0.0227, "step": 642 }, { "epoch": 0.45969615728328866, "grad_norm": 0.02509278804063797, "learning_rate": 9.198855507868384e-05, "loss": 0.02, "step": 643 }, { "epoch": 0.4604110813226095, "grad_norm": 0.026115868240594864, "learning_rate": 9.213161659513591e-05, "loss": 0.0116, "step": 644 }, { "epoch": 0.46112600536193027, "grad_norm": 0.02611641213297844, "learning_rate": 9.227467811158799e-05, "loss": 0.0187, "step": 645 }, { "epoch": 0.46112600536193027, "eval_loss": 0.01437502633780241, "eval_runtime": 4.6318, "eval_samples_per_second": 10.795, "eval_steps_per_second": 2.807, "step": 645 }, { "epoch": 0.4618409294012511, "grad_norm": 0.024835119023919106, "learning_rate": 9.241773962804005e-05, "loss": 0.0132, "step": 646 }, { "epoch": 0.46255585344057193, "grad_norm": 0.01912013255059719, "learning_rate": 9.256080114449213e-05, "loss": 0.0143, "step": 647 }, { "epoch": 0.46327077747989276, "grad_norm": 0.021361099556088448, "learning_rate": 9.270386266094421e-05, "loss": 0.0208, "step": 648 }, { "epoch": 0.4639857015192136, "grad_norm": 0.019990041851997375, "learning_rate": 9.284692417739628e-05, "loss": 0.0164, "step": 649 }, { "epoch": 0.4647006255585344, "grad_norm": 0.02197304740548134, "learning_rate": 9.298998569384836e-05, "loss": 0.0245, "step": 650 }, { "epoch": 0.4647006255585344, "eval_loss": 0.013904212974011898, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 650 }, { "epoch": 0.46541554959785525, "grad_norm": 0.022024227306246758, "learning_rate": 9.313304721030042e-05, "loss": 0.0165, "step": 651 }, { "epoch": 0.466130473637176, "grad_norm": 0.01853158324956894, "learning_rate": 9.327610872675252e-05, "loss": 0.023, "step": 652 }, { "epoch": 0.46684539767649685, "grad_norm": 0.026004264131188393, "learning_rate": 9.341917024320458e-05, "loss": 0.0237, "step": 653 }, { "epoch": 0.4675603217158177, "grad_norm": 0.020428452640771866, "learning_rate": 9.356223175965666e-05, "loss": 0.0294, "step": 654 }, { "epoch": 0.4682752457551385, "grad_norm": 0.019873062148690224, "learning_rate": 9.370529327610873e-05, "loss": 0.0177, "step": 655 }, { "epoch": 0.4682752457551385, "eval_loss": 0.014116253703832626, "eval_runtime": 4.598, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 655 }, { "epoch": 0.46899016979445934, "grad_norm": 0.021129218861460686, "learning_rate": 9.384835479256081e-05, "loss": 0.0258, "step": 656 }, { "epoch": 0.4697050938337802, "grad_norm": 0.021377943456172943, "learning_rate": 9.399141630901289e-05, "loss": 0.0192, "step": 657 }, { "epoch": 0.470420017873101, "grad_norm": 0.017325900495052338, "learning_rate": 9.413447782546495e-05, "loss": 0.016, "step": 658 }, { "epoch": 0.4711349419124218, "grad_norm": 0.018445631489157677, "learning_rate": 9.427753934191703e-05, "loss": 0.024, "step": 659 }, { "epoch": 0.4718498659517426, "grad_norm": 0.02112962305545807, "learning_rate": 9.44206008583691e-05, "loss": 0.0153, "step": 660 }, { "epoch": 0.4718498659517426, "eval_loss": 0.013968244194984436, "eval_runtime": 4.5935, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 660 }, { "epoch": 0.47256478999106344, "grad_norm": 0.02425629273056984, "learning_rate": 9.456366237482118e-05, "loss": 0.0192, "step": 661 }, { "epoch": 0.47327971403038427, "grad_norm": 
0.02423935756087303, "learning_rate": 9.470672389127326e-05, "loss": 0.0183, "step": 662 }, { "epoch": 0.4739946380697051, "grad_norm": 0.023790808394551277, "learning_rate": 9.484978540772532e-05, "loss": 0.0149, "step": 663 }, { "epoch": 0.47470956210902593, "grad_norm": 0.024744974449276924, "learning_rate": 9.49928469241774e-05, "loss": 0.0185, "step": 664 }, { "epoch": 0.47542448614834676, "grad_norm": 0.033499572426080704, "learning_rate": 9.513590844062947e-05, "loss": 0.0178, "step": 665 }, { "epoch": 0.47542448614834676, "eval_loss": 0.013981725089251995, "eval_runtime": 4.5994, "eval_samples_per_second": 10.871, "eval_steps_per_second": 2.826, "step": 665 }, { "epoch": 0.47613941018766753, "grad_norm": 0.016917439177632332, "learning_rate": 9.527896995708155e-05, "loss": 0.0158, "step": 666 }, { "epoch": 0.47685433422698836, "grad_norm": 0.025563456118106842, "learning_rate": 9.542203147353363e-05, "loss": 0.0181, "step": 667 }, { "epoch": 0.4775692582663092, "grad_norm": 0.024769693613052368, "learning_rate": 9.556509298998569e-05, "loss": 0.0262, "step": 668 }, { "epoch": 0.47828418230563, "grad_norm": 0.02031371183693409, "learning_rate": 9.570815450643778e-05, "loss": 0.0192, "step": 669 }, { "epoch": 0.47899910634495085, "grad_norm": 0.01711789146065712, "learning_rate": 9.585121602288984e-05, "loss": 0.0213, "step": 670 }, { "epoch": 0.47899910634495085, "eval_loss": 0.014103885740041733, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 670 }, { "epoch": 0.4797140303842717, "grad_norm": 0.01985233835875988, "learning_rate": 9.599427753934192e-05, "loss": 0.0146, "step": 671 }, { "epoch": 0.4804289544235925, "grad_norm": 0.020358936861157417, "learning_rate": 9.6137339055794e-05, "loss": 0.0203, "step": 672 }, { "epoch": 0.4811438784629133, "grad_norm": 0.0219916719943285, "learning_rate": 9.628040057224606e-05, "loss": 0.0158, "step": 673 }, { "epoch": 0.4818588025022341, "grad_norm": 0.027075476944446564, "learning_rate": 9.642346208869815e-05, "loss": 0.0286, "step": 674 }, { "epoch": 0.48257372654155495, "grad_norm": 0.019410068169236183, "learning_rate": 9.656652360515021e-05, "loss": 0.0185, "step": 675 }, { "epoch": 0.48257372654155495, "eval_loss": 0.014152503572404385, "eval_runtime": 4.5888, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 675 }, { "epoch": 0.4832886505808758, "grad_norm": 0.017798619344830513, "learning_rate": 9.670958512160229e-05, "loss": 0.0214, "step": 676 }, { "epoch": 0.4840035746201966, "grad_norm": 0.017873181030154228, "learning_rate": 9.685264663805437e-05, "loss": 0.0166, "step": 677 }, { "epoch": 0.48471849865951744, "grad_norm": 0.029727734625339508, "learning_rate": 9.699570815450643e-05, "loss": 0.0192, "step": 678 }, { "epoch": 0.48543342269883827, "grad_norm": 0.022099709138274193, "learning_rate": 9.713876967095852e-05, "loss": 0.0187, "step": 679 }, { "epoch": 0.48614834673815904, "grad_norm": 0.01946406066417694, "learning_rate": 9.728183118741059e-05, "loss": 0.0215, "step": 680 }, { "epoch": 0.48614834673815904, "eval_loss": 0.014183642342686653, "eval_runtime": 4.6122, "eval_samples_per_second": 10.841, "eval_steps_per_second": 2.819, "step": 680 }, { "epoch": 0.4868632707774799, "grad_norm": 0.016896788030862808, "learning_rate": 9.742489270386268e-05, "loss": 0.016, "step": 681 }, { "epoch": 0.4875781948168007, "grad_norm": 0.021645333617925644, "learning_rate": 9.756795422031474e-05, "loss": 0.0194, "step": 682 }, { "epoch": 0.48829311885612153, 
"grad_norm": 0.021029947325587273, "learning_rate": 9.77110157367668e-05, "loss": 0.0201, "step": 683 }, { "epoch": 0.48900804289544236, "grad_norm": 0.020319858565926552, "learning_rate": 9.785407725321889e-05, "loss": 0.0159, "step": 684 }, { "epoch": 0.4897229669347632, "grad_norm": 0.020195379853248596, "learning_rate": 9.799713876967096e-05, "loss": 0.0166, "step": 685 }, { "epoch": 0.4897229669347632, "eval_loss": 0.014280597679316998, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 685 }, { "epoch": 0.490437890974084, "grad_norm": 0.021366789937019348, "learning_rate": 9.814020028612305e-05, "loss": 0.022, "step": 686 }, { "epoch": 0.49115281501340485, "grad_norm": 0.028518470004200935, "learning_rate": 9.828326180257511e-05, "loss": 0.0328, "step": 687 }, { "epoch": 0.4918677390527256, "grad_norm": 0.022547919303178787, "learning_rate": 9.842632331902719e-05, "loss": 0.0271, "step": 688 }, { "epoch": 0.49258266309204646, "grad_norm": 0.021415656432509422, "learning_rate": 9.856938483547926e-05, "loss": 0.025, "step": 689 }, { "epoch": 0.4932975871313673, "grad_norm": 0.01943224109709263, "learning_rate": 9.871244635193133e-05, "loss": 0.0147, "step": 690 }, { "epoch": 0.4932975871313673, "eval_loss": 0.014338159002363682, "eval_runtime": 4.5927, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 690 }, { "epoch": 0.4940125111706881, "grad_norm": 0.02217932604253292, "learning_rate": 9.885550786838342e-05, "loss": 0.018, "step": 691 }, { "epoch": 0.49472743521000895, "grad_norm": 0.021046018227934837, "learning_rate": 9.899856938483548e-05, "loss": 0.0234, "step": 692 }, { "epoch": 0.4954423592493298, "grad_norm": 0.025758149102330208, "learning_rate": 9.914163090128756e-05, "loss": 0.0177, "step": 693 }, { "epoch": 0.4961572832886506, "grad_norm": 0.021605342626571655, "learning_rate": 9.928469241773963e-05, "loss": 0.0252, "step": 694 }, { "epoch": 0.4968722073279714, "grad_norm": 0.017242876812815666, "learning_rate": 9.94277539341917e-05, "loss": 0.0142, "step": 695 }, { "epoch": 0.4968722073279714, "eval_loss": 0.014297021552920341, "eval_runtime": 4.5918, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 695 }, { "epoch": 0.4975871313672922, "grad_norm": 0.019735218957066536, "learning_rate": 9.957081545064379e-05, "loss": 0.0176, "step": 696 }, { "epoch": 0.49830205540661304, "grad_norm": 0.018685946241021156, "learning_rate": 9.971387696709585e-05, "loss": 0.0238, "step": 697 }, { "epoch": 0.4990169794459339, "grad_norm": 0.0228415559977293, "learning_rate": 9.985693848354794e-05, "loss": 0.0198, "step": 698 }, { "epoch": 0.4997319034852547, "grad_norm": 0.014821004122495651, "learning_rate": 0.0001, "loss": 0.016, "step": 699 }, { "epoch": 0.5004468275245755, "grad_norm": 0.014828061684966087, "learning_rate": 9.999999376551801e-05, "loss": 0.0158, "step": 700 }, { "epoch": 0.5004468275245755, "eval_loss": 0.014128150418400764, "eval_runtime": 4.5926, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 700 }, { "epoch": 0.5011617515638963, "grad_norm": 0.020368313416838646, "learning_rate": 9.99999750620736e-05, "loss": 0.0188, "step": 701 }, { "epoch": 0.5018766756032171, "grad_norm": 0.0178313497453928, "learning_rate": 9.999994388967142e-05, "loss": 0.0214, "step": 702 }, { "epoch": 0.502591599642538, "grad_norm": 0.021234774962067604, "learning_rate": 9.999990024831927e-05, "loss": 0.0166, "step": 703 }, { "epoch": 0.5033065236818588, "grad_norm": 
0.01646016165614128, "learning_rate": 9.999984413802801e-05, "loss": 0.0175, "step": 704 }, { "epoch": 0.5040214477211796, "grad_norm": 0.021663451567292213, "learning_rate": 9.999977555881163e-05, "loss": 0.0219, "step": 705 }, { "epoch": 0.5040214477211796, "eval_loss": 0.014445667155086994, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 705 }, { "epoch": 0.5047363717605005, "grad_norm": 0.024421261623501778, "learning_rate": 9.999969451068724e-05, "loss": 0.0244, "step": 706 }, { "epoch": 0.5054512957998213, "grad_norm": 0.022461527958512306, "learning_rate": 9.999960099367506e-05, "loss": 0.0173, "step": 707 }, { "epoch": 0.5061662198391421, "grad_norm": 0.0184494461864233, "learning_rate": 9.999949500779841e-05, "loss": 0.0272, "step": 708 }, { "epoch": 0.506881143878463, "grad_norm": 0.019204670563340187, "learning_rate": 9.999937655308372e-05, "loss": 0.016, "step": 709 }, { "epoch": 0.5075960679177838, "grad_norm": 0.01873164251446724, "learning_rate": 9.999924562956052e-05, "loss": 0.0191, "step": 710 }, { "epoch": 0.5075960679177838, "eval_loss": 0.014330726116895676, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 710 }, { "epoch": 0.5083109919571046, "grad_norm": 0.017282230779528618, "learning_rate": 9.999910223726145e-05, "loss": 0.0132, "step": 711 }, { "epoch": 0.5090259159964253, "grad_norm": 0.017883488908410072, "learning_rate": 9.999894637622232e-05, "loss": 0.0196, "step": 712 }, { "epoch": 0.5097408400357462, "grad_norm": 0.01708253100514412, "learning_rate": 9.999877804648194e-05, "loss": 0.0186, "step": 713 }, { "epoch": 0.510455764075067, "grad_norm": 0.018943190574645996, "learning_rate": 9.99985972480823e-05, "loss": 0.0177, "step": 714 }, { "epoch": 0.5111706881143878, "grad_norm": 0.021685753017663956, "learning_rate": 9.999840398106851e-05, "loss": 0.0248, "step": 715 }, { "epoch": 0.5111706881143878, "eval_loss": 0.014527898281812668, "eval_runtime": 4.6202, "eval_samples_per_second": 10.822, "eval_steps_per_second": 2.814, "step": 715 }, { "epoch": 0.5118856121537086, "grad_norm": 0.020881937816739082, "learning_rate": 9.999819824548875e-05, "loss": 0.0209, "step": 716 }, { "epoch": 0.5126005361930295, "grad_norm": 0.02195884846150875, "learning_rate": 9.999798004139435e-05, "loss": 0.0195, "step": 717 }, { "epoch": 0.5133154602323503, "grad_norm": 0.023983770981431007, "learning_rate": 9.999774936883968e-05, "loss": 0.0208, "step": 718 }, { "epoch": 0.5140303842716711, "grad_norm": 0.019993193447589874, "learning_rate": 9.999750622788231e-05, "loss": 0.0247, "step": 719 }, { "epoch": 0.514745308310992, "grad_norm": 0.018592478707432747, "learning_rate": 9.999725061858285e-05, "loss": 0.0174, "step": 720 }, { "epoch": 0.514745308310992, "eval_loss": 0.014228724874556065, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 720 }, { "epoch": 0.5154602323503128, "grad_norm": 0.020369194447994232, "learning_rate": 9.999698254100504e-05, "loss": 0.0146, "step": 721 }, { "epoch": 0.5161751563896336, "grad_norm": 0.024656612426042557, "learning_rate": 9.999670199521576e-05, "loss": 0.0169, "step": 722 }, { "epoch": 0.5168900804289545, "grad_norm": 0.018275298178195953, "learning_rate": 9.999640898128495e-05, "loss": 0.0267, "step": 723 }, { "epoch": 0.5176050044682753, "grad_norm": 0.018544280901551247, "learning_rate": 9.999610349928569e-05, "loss": 0.0201, "step": 724 }, { "epoch": 0.5183199285075961, "grad_norm": 
0.019615447148680687, "learning_rate": 9.999578554929415e-05, "loss": 0.0212, "step": 725 }, { "epoch": 0.5183199285075961, "eval_loss": 0.014287865720689297, "eval_runtime": 4.6102, "eval_samples_per_second": 10.846, "eval_steps_per_second": 2.82, "step": 725 }, { "epoch": 0.5190348525469168, "grad_norm": 0.021529382094740868, "learning_rate": 9.999545513138964e-05, "loss": 0.0177, "step": 726 }, { "epoch": 0.5197497765862377, "grad_norm": 0.019816670566797256, "learning_rate": 9.999511224565452e-05, "loss": 0.0205, "step": 727 }, { "epoch": 0.5204647006255585, "grad_norm": 0.017567558214068413, "learning_rate": 9.999475689217435e-05, "loss": 0.0145, "step": 728 }, { "epoch": 0.5211796246648793, "grad_norm": 0.02134581096470356, "learning_rate": 9.999438907103771e-05, "loss": 0.0237, "step": 729 }, { "epoch": 0.5218945487042002, "grad_norm": 0.02216225117444992, "learning_rate": 9.999400878233635e-05, "loss": 0.0237, "step": 730 }, { "epoch": 0.5218945487042002, "eval_loss": 0.013855908066034317, "eval_runtime": 4.5893, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 730 }, { "epoch": 0.522609472743521, "grad_norm": 0.02647196315228939, "learning_rate": 9.99936160261651e-05, "loss": 0.0235, "step": 731 }, { "epoch": 0.5233243967828418, "grad_norm": 0.020326200872659683, "learning_rate": 9.999321080262189e-05, "loss": 0.0176, "step": 732 }, { "epoch": 0.5240393208221626, "grad_norm": 0.018897579982876778, "learning_rate": 9.999279311180778e-05, "loss": 0.0127, "step": 733 }, { "epoch": 0.5247542448614835, "grad_norm": 0.02032695896923542, "learning_rate": 9.999236295382696e-05, "loss": 0.0184, "step": 734 }, { "epoch": 0.5254691689008043, "grad_norm": 0.01705778017640114, "learning_rate": 9.999192032878668e-05, "loss": 0.0176, "step": 735 }, { "epoch": 0.5254691689008043, "eval_loss": 0.013787785544991493, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 735 }, { "epoch": 0.5261840929401251, "grad_norm": 0.019561193883419037, "learning_rate": 9.99914652367973e-05, "loss": 0.0243, "step": 736 }, { "epoch": 0.526899016979446, "grad_norm": 0.01910156011581421, "learning_rate": 9.999099767797235e-05, "loss": 0.0158, "step": 737 }, { "epoch": 0.5276139410187668, "grad_norm": 0.018464738503098488, "learning_rate": 9.999051765242842e-05, "loss": 0.0201, "step": 738 }, { "epoch": 0.5283288650580876, "grad_norm": 0.019149160012602806, "learning_rate": 9.99900251602852e-05, "loss": 0.0186, "step": 739 }, { "epoch": 0.5290437890974083, "grad_norm": 0.01939908042550087, "learning_rate": 9.998952020166554e-05, "loss": 0.0222, "step": 740 }, { "epoch": 0.5290437890974083, "eval_loss": 0.013993253000080585, "eval_runtime": 4.6051, "eval_samples_per_second": 10.858, "eval_steps_per_second": 2.823, "step": 740 }, { "epoch": 0.5297587131367292, "grad_norm": 0.019359469413757324, "learning_rate": 9.998900277669532e-05, "loss": 0.0147, "step": 741 }, { "epoch": 0.53047363717605, "grad_norm": 0.023376626893877983, "learning_rate": 9.998847288550361e-05, "loss": 0.0178, "step": 742 }, { "epoch": 0.5311885612153708, "grad_norm": 0.016504481434822083, "learning_rate": 9.998793052822255e-05, "loss": 0.0145, "step": 743 }, { "epoch": 0.5319034852546917, "grad_norm": 0.015182583592832088, "learning_rate": 9.998737570498737e-05, "loss": 0.0106, "step": 744 }, { "epoch": 0.5326184092940125, "grad_norm": 0.018272764980793, "learning_rate": 9.998680841593646e-05, "loss": 0.0134, "step": 745 }, { "epoch": 0.5326184092940125, "eval_loss": 
0.01370643638074398, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 745 }, { "epoch": 0.5333333333333333, "grad_norm": 0.01940307393670082, "learning_rate": 9.998622866121129e-05, "loss": 0.0166, "step": 746 }, { "epoch": 0.5340482573726542, "grad_norm": 0.01699450984597206, "learning_rate": 9.998563644095643e-05, "loss": 0.0152, "step": 747 }, { "epoch": 0.534763181411975, "grad_norm": 0.019640635699033737, "learning_rate": 9.998503175531954e-05, "loss": 0.017, "step": 748 }, { "epoch": 0.5354781054512958, "grad_norm": 0.019528940320014954, "learning_rate": 9.998441460445145e-05, "loss": 0.0208, "step": 749 }, { "epoch": 0.5361930294906166, "grad_norm": 0.02101529948413372, "learning_rate": 9.998378498850606e-05, "loss": 0.0126, "step": 750 }, { "epoch": 0.5361930294906166, "eval_loss": 0.013920355588197708, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 750 }, { "epoch": 0.5369079535299375, "grad_norm": 0.02428918145596981, "learning_rate": 9.998314290764037e-05, "loss": 0.0189, "step": 751 }, { "epoch": 0.5376228775692583, "grad_norm": 0.01598934642970562, "learning_rate": 9.998248836201453e-05, "loss": 0.0134, "step": 752 }, { "epoch": 0.5383378016085791, "grad_norm": 0.02007744088768959, "learning_rate": 9.998182135179173e-05, "loss": 0.012, "step": 753 }, { "epoch": 0.5390527256478999, "grad_norm": 0.018683208152651787, "learning_rate": 9.998114187713834e-05, "loss": 0.0127, "step": 754 }, { "epoch": 0.5397676496872207, "grad_norm": 0.021435115486383438, "learning_rate": 9.998044993822379e-05, "loss": 0.0209, "step": 755 }, { "epoch": 0.5397676496872207, "eval_loss": 0.01399157103151083, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 755 }, { "epoch": 0.5404825737265415, "grad_norm": 0.018849913030862808, "learning_rate": 9.997974553522065e-05, "loss": 0.0138, "step": 756 }, { "epoch": 0.5411974977658623, "grad_norm": 0.02234536036849022, "learning_rate": 9.997902866830457e-05, "loss": 0.0208, "step": 757 }, { "epoch": 0.5419124218051832, "grad_norm": 0.020767640322446823, "learning_rate": 9.997829933765432e-05, "loss": 0.0205, "step": 758 }, { "epoch": 0.542627345844504, "grad_norm": 0.018350644037127495, "learning_rate": 9.997755754345179e-05, "loss": 0.0219, "step": 759 }, { "epoch": 0.5433422698838248, "grad_norm": 0.018013833090662956, "learning_rate": 9.997680328588196e-05, "loss": 0.0204, "step": 760 }, { "epoch": 0.5433422698838248, "eval_loss": 0.013835792429745197, "eval_runtime": 4.5917, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 760 }, { "epoch": 0.5440571939231457, "grad_norm": 0.01911567524075508, "learning_rate": 9.997603656513294e-05, "loss": 0.0142, "step": 761 }, { "epoch": 0.5447721179624665, "grad_norm": 0.016550377011299133, "learning_rate": 9.997525738139593e-05, "loss": 0.0165, "step": 762 }, { "epoch": 0.5454870420017873, "grad_norm": 0.02036173641681671, "learning_rate": 9.997446573486524e-05, "loss": 0.0257, "step": 763 }, { "epoch": 0.5462019660411082, "grad_norm": 0.016214005649089813, "learning_rate": 9.997366162573827e-05, "loss": 0.016, "step": 764 }, { "epoch": 0.546916890080429, "grad_norm": 0.020588072016835213, "learning_rate": 9.997284505421558e-05, "loss": 0.0239, "step": 765 }, { "epoch": 0.546916890080429, "eval_loss": 0.013873417861759663, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 765 }, { "epoch": 
0.5476318141197498, "grad_norm": 0.018794871866703033, "learning_rate": 9.997201602050079e-05, "loss": 0.0187, "step": 766 }, { "epoch": 0.5483467381590706, "grad_norm": 0.02044038288295269, "learning_rate": 9.997117452480064e-05, "loss": 0.0171, "step": 767 }, { "epoch": 0.5490616621983915, "grad_norm": 0.018683338537812233, "learning_rate": 9.997032056732498e-05, "loss": 0.0177, "step": 768 }, { "epoch": 0.5497765862377122, "grad_norm": 0.024158377200365067, "learning_rate": 9.996945414828678e-05, "loss": 0.0201, "step": 769 }, { "epoch": 0.550491510277033, "grad_norm": 0.01970498450100422, "learning_rate": 9.996857526790212e-05, "loss": 0.0203, "step": 770 }, { "epoch": 0.550491510277033, "eval_loss": 0.013761192560195923, "eval_runtime": 4.6274, "eval_samples_per_second": 10.805, "eval_steps_per_second": 2.809, "step": 770 }, { "epoch": 0.5512064343163539, "grad_norm": 0.022972047328948975, "learning_rate": 9.996768392639016e-05, "loss": 0.0177, "step": 771 }, { "epoch": 0.5519213583556747, "grad_norm": 0.014331175945699215, "learning_rate": 9.996678012397316e-05, "loss": 0.0119, "step": 772 }, { "epoch": 0.5526362823949955, "grad_norm": 0.020483076572418213, "learning_rate": 9.996586386087653e-05, "loss": 0.019, "step": 773 }, { "epoch": 0.5533512064343163, "grad_norm": 0.019359583035111427, "learning_rate": 9.996493513732877e-05, "loss": 0.0198, "step": 774 }, { "epoch": 0.5540661304736372, "grad_norm": 0.019794445484876633, "learning_rate": 9.996399395356148e-05, "loss": 0.0212, "step": 775 }, { "epoch": 0.5540661304736372, "eval_loss": 0.013635417446494102, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 775 }, { "epoch": 0.554781054512958, "grad_norm": 0.02083546295762062, "learning_rate": 9.996304030980938e-05, "loss": 0.017, "step": 776 }, { "epoch": 0.5554959785522788, "grad_norm": 0.015141095034778118, "learning_rate": 9.996207420631029e-05, "loss": 0.02, "step": 777 }, { "epoch": 0.5562109025915997, "grad_norm": 0.016377927735447884, "learning_rate": 9.996109564330511e-05, "loss": 0.0185, "step": 778 }, { "epoch": 0.5569258266309205, "grad_norm": 0.018165750429034233, "learning_rate": 9.996010462103789e-05, "loss": 0.0209, "step": 779 }, { "epoch": 0.5576407506702413, "grad_norm": 0.0201591607183218, "learning_rate": 9.99591011397558e-05, "loss": 0.0145, "step": 780 }, { "epoch": 0.5576407506702413, "eval_loss": 0.013649163767695427, "eval_runtime": 4.5983, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 780 }, { "epoch": 0.5583556747095622, "grad_norm": 0.021173259243369102, "learning_rate": 9.995808519970903e-05, "loss": 0.0164, "step": 781 }, { "epoch": 0.559070598748883, "grad_norm": 0.018628625199198723, "learning_rate": 9.995705680115097e-05, "loss": 0.0255, "step": 782 }, { "epoch": 0.5597855227882037, "grad_norm": 0.018617838621139526, "learning_rate": 9.995601594433808e-05, "loss": 0.014, "step": 783 }, { "epoch": 0.5605004468275245, "grad_norm": 0.02298041060566902, "learning_rate": 9.995496262952992e-05, "loss": 0.012, "step": 784 }, { "epoch": 0.5612153708668454, "grad_norm": 0.019603131338953972, "learning_rate": 9.995389685698917e-05, "loss": 0.0114, "step": 785 }, { "epoch": 0.5612153708668454, "eval_loss": 0.01350613497197628, "eval_runtime": 4.5897, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 785 }, { "epoch": 0.5619302949061662, "grad_norm": 0.022532891482114792, "learning_rate": 9.99528186269816e-05, "loss": 0.022, "step": 786 }, { "epoch": 
0.562645218945487, "grad_norm": 0.02212611585855484, "learning_rate": 9.995172793977612e-05, "loss": 0.02, "step": 787 }, { "epoch": 0.5633601429848079, "grad_norm": 0.014665050432085991, "learning_rate": 9.995062479564472e-05, "loss": 0.0142, "step": 788 }, { "epoch": 0.5640750670241287, "grad_norm": 0.020119283348321915, "learning_rate": 9.994950919486248e-05, "loss": 0.0197, "step": 789 }, { "epoch": 0.5647899910634495, "grad_norm": 0.020351728424429893, "learning_rate": 9.994838113770762e-05, "loss": 0.014, "step": 790 }, { "epoch": 0.5647899910634495, "eval_loss": 0.013602504506707191, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 790 }, { "epoch": 0.5655049151027703, "grad_norm": 0.020657440647482872, "learning_rate": 9.994724062446146e-05, "loss": 0.0183, "step": 791 }, { "epoch": 0.5662198391420912, "grad_norm": 0.02233237586915493, "learning_rate": 9.994608765540841e-05, "loss": 0.0154, "step": 792 }, { "epoch": 0.566934763181412, "grad_norm": 0.02234022133052349, "learning_rate": 9.994492223083603e-05, "loss": 0.0232, "step": 793 }, { "epoch": 0.5676496872207328, "grad_norm": 0.021586386486887932, "learning_rate": 9.994374435103489e-05, "loss": 0.0222, "step": 794 }, { "epoch": 0.5683646112600537, "grad_norm": 0.01988225430250168, "learning_rate": 9.994255401629878e-05, "loss": 0.0179, "step": 795 }, { "epoch": 0.5683646112600537, "eval_loss": 0.013403072021901608, "eval_runtime": 4.6327, "eval_samples_per_second": 10.793, "eval_steps_per_second": 2.806, "step": 795 }, { "epoch": 0.5690795352993745, "grad_norm": 0.014606807380914688, "learning_rate": 9.994135122692454e-05, "loss": 0.0106, "step": 796 }, { "epoch": 0.5697944593386952, "grad_norm": 0.020906202495098114, "learning_rate": 9.994013598321209e-05, "loss": 0.0131, "step": 797 }, { "epoch": 0.570509383378016, "grad_norm": 0.020896639674901962, "learning_rate": 9.99389082854645e-05, "loss": 0.0188, "step": 798 }, { "epoch": 0.5712243074173369, "grad_norm": 0.02007909119129181, "learning_rate": 9.993766813398796e-05, "loss": 0.0132, "step": 799 }, { "epoch": 0.5719392314566577, "grad_norm": 0.01618320494890213, "learning_rate": 9.993641552909171e-05, "loss": 0.0114, "step": 800 }, { "epoch": 0.5719392314566577, "eval_loss": 0.013584423810243607, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 800 }, { "epoch": 0.5726541554959785, "grad_norm": 0.020955080166459084, "learning_rate": 9.993515047108813e-05, "loss": 0.0218, "step": 801 }, { "epoch": 0.5733690795352994, "grad_norm": 0.02007770538330078, "learning_rate": 9.99338729602927e-05, "loss": 0.0198, "step": 802 }, { "epoch": 0.5740840035746202, "grad_norm": 0.015431427396833897, "learning_rate": 9.9932582997024e-05, "loss": 0.0133, "step": 803 }, { "epoch": 0.574798927613941, "grad_norm": 0.022184453904628754, "learning_rate": 9.993128058160373e-05, "loss": 0.0251, "step": 804 }, { "epoch": 0.5755138516532619, "grad_norm": 0.018778368830680847, "learning_rate": 9.992996571435668e-05, "loss": 0.0219, "step": 805 }, { "epoch": 0.5755138516532619, "eval_loss": 0.013593500480055809, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 805 }, { "epoch": 0.5762287756925827, "grad_norm": 0.01692924275994301, "learning_rate": 9.992863839561076e-05, "loss": 0.0141, "step": 806 }, { "epoch": 0.5769436997319035, "grad_norm": 0.019934415817260742, "learning_rate": 9.992729862569695e-05, "loss": 0.0186, "step": 807 }, { "epoch": 
0.5776586237712243, "grad_norm": 0.02405727095901966, "learning_rate": 9.992594640494937e-05, "loss": 0.0228, "step": 808 }, { "epoch": 0.5783735478105452, "grad_norm": 0.020116884261369705, "learning_rate": 9.992458173370526e-05, "loss": 0.0195, "step": 809 }, { "epoch": 0.579088471849866, "grad_norm": 0.015803828835487366, "learning_rate": 9.992320461230491e-05, "loss": 0.015, "step": 810 }, { "epoch": 0.579088471849866, "eval_loss": 0.013669641688466072, "eval_runtime": 4.5932, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 810 }, { "epoch": 0.5798033958891867, "grad_norm": 0.02067628875374794, "learning_rate": 9.992181504109177e-05, "loss": 0.0217, "step": 811 }, { "epoch": 0.5805183199285076, "grad_norm": 0.017931275069713593, "learning_rate": 9.992041302041238e-05, "loss": 0.0167, "step": 812 }, { "epoch": 0.5812332439678284, "grad_norm": 0.020607922226190567, "learning_rate": 9.991899855061633e-05, "loss": 0.0208, "step": 813 }, { "epoch": 0.5819481680071492, "grad_norm": 0.01740158535540104, "learning_rate": 9.991757163205637e-05, "loss": 0.0118, "step": 814 }, { "epoch": 0.58266309204647, "grad_norm": 0.014164471998810768, "learning_rate": 9.991613226508838e-05, "loss": 0.0187, "step": 815 }, { "epoch": 0.58266309204647, "eval_loss": 0.01402257289737463, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 815 }, { "epoch": 0.5833780160857909, "grad_norm": 0.018499210476875305, "learning_rate": 9.991468045007128e-05, "loss": 0.0134, "step": 816 }, { "epoch": 0.5840929401251117, "grad_norm": 0.019007215276360512, "learning_rate": 9.991321618736712e-05, "loss": 0.0206, "step": 817 }, { "epoch": 0.5848078641644325, "grad_norm": 0.020621977746486664, "learning_rate": 9.99117394773411e-05, "loss": 0.0204, "step": 818 }, { "epoch": 0.5855227882037534, "grad_norm": 0.01880798302590847, "learning_rate": 9.991025032036141e-05, "loss": 0.0249, "step": 819 }, { "epoch": 0.5862377122430742, "grad_norm": 0.02288273721933365, "learning_rate": 9.990874871679948e-05, "loss": 0.0137, "step": 820 }, { "epoch": 0.5862377122430742, "eval_loss": 0.013764997944235802, "eval_runtime": 4.5837, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 820 }, { "epoch": 0.586952636282395, "grad_norm": 0.02051619254052639, "learning_rate": 9.990723466702971e-05, "loss": 0.0211, "step": 821 }, { "epoch": 0.5876675603217159, "grad_norm": 0.019457373768091202, "learning_rate": 9.990570817142974e-05, "loss": 0.0148, "step": 822 }, { "epoch": 0.5883824843610367, "grad_norm": 0.022560419514775276, "learning_rate": 9.990416923038022e-05, "loss": 0.0255, "step": 823 }, { "epoch": 0.5890974084003575, "grad_norm": 0.018919145688414574, "learning_rate": 9.990261784426493e-05, "loss": 0.0219, "step": 824 }, { "epoch": 0.5898123324396782, "grad_norm": 0.02165442518889904, "learning_rate": 9.990105401347074e-05, "loss": 0.0169, "step": 825 }, { "epoch": 0.5898123324396782, "eval_loss": 0.013688034377992153, "eval_runtime": 4.6105, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 825 }, { "epoch": 0.5905272564789991, "grad_norm": 0.01748502627015114, "learning_rate": 9.989947773838765e-05, "loss": 0.0164, "step": 826 }, { "epoch": 0.5912421805183199, "grad_norm": 0.0181425791233778, "learning_rate": 9.989788901940877e-05, "loss": 0.0195, "step": 827 }, { "epoch": 0.5919571045576407, "grad_norm": 0.018738610669970512, "learning_rate": 9.989628785693026e-05, "loss": 0.0152, "step": 828 }, { "epoch": 
0.5926720285969616, "grad_norm": 0.020469799637794495, "learning_rate": 9.989467425135144e-05, "loss": 0.0187, "step": 829 }, { "epoch": 0.5933869526362824, "grad_norm": 0.018702151253819466, "learning_rate": 9.989304820307468e-05, "loss": 0.0226, "step": 830 }, { "epoch": 0.5933869526362824, "eval_loss": 0.013684215024113655, "eval_runtime": 4.6003, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 830 }, { "epoch": 0.5941018766756032, "grad_norm": 0.02245517633855343, "learning_rate": 9.989140971250552e-05, "loss": 0.0174, "step": 831 }, { "epoch": 0.594816800714924, "grad_norm": 0.018902329728007317, "learning_rate": 9.988975878005256e-05, "loss": 0.0175, "step": 832 }, { "epoch": 0.5955317247542449, "grad_norm": 0.021008877083659172, "learning_rate": 9.988809540612747e-05, "loss": 0.0187, "step": 833 }, { "epoch": 0.5962466487935657, "grad_norm": 0.017383934929966927, "learning_rate": 9.988641959114511e-05, "loss": 0.0178, "step": 834 }, { "epoch": 0.5969615728328865, "grad_norm": 0.021124664694070816, "learning_rate": 9.988473133552336e-05, "loss": 0.0145, "step": 835 }, { "epoch": 0.5969615728328865, "eval_loss": 0.013299357146024704, "eval_runtime": 4.6208, "eval_samples_per_second": 10.821, "eval_steps_per_second": 2.813, "step": 835 }, { "epoch": 0.5976764968722074, "grad_norm": 0.019182171672582626, "learning_rate": 9.988303063968323e-05, "loss": 0.0177, "step": 836 }, { "epoch": 0.5983914209115282, "grad_norm": 0.01795237883925438, "learning_rate": 9.988131750404888e-05, "loss": 0.0146, "step": 837 }, { "epoch": 0.599106344950849, "grad_norm": 0.01642397604882717, "learning_rate": 9.98795919290475e-05, "loss": 0.0151, "step": 838 }, { "epoch": 0.5998212689901697, "grad_norm": 0.01689854823052883, "learning_rate": 9.987785391510942e-05, "loss": 0.017, "step": 839 }, { "epoch": 0.6005361930294906, "grad_norm": 0.01611105166375637, "learning_rate": 9.987610346266807e-05, "loss": 0.0146, "step": 840 }, { "epoch": 0.6005361930294906, "eval_loss": 0.01327836699783802, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 840 }, { "epoch": 0.6012511170688114, "grad_norm": 0.020932607352733612, "learning_rate": 9.987434057215996e-05, "loss": 0.0146, "step": 841 }, { "epoch": 0.6019660411081322, "grad_norm": 0.017853396013379097, "learning_rate": 9.987256524402473e-05, "loss": 0.0124, "step": 842 }, { "epoch": 0.6026809651474531, "grad_norm": 0.01636500284075737, "learning_rate": 9.987077747870512e-05, "loss": 0.0165, "step": 843 }, { "epoch": 0.6033958891867739, "grad_norm": 0.020573141053318977, "learning_rate": 9.986897727664692e-05, "loss": 0.0173, "step": 844 }, { "epoch": 0.6041108132260947, "grad_norm": 0.025478817522525787, "learning_rate": 9.986716463829913e-05, "loss": 0.0176, "step": 845 }, { "epoch": 0.6041108132260947, "eval_loss": 0.013048170134425163, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.836, "step": 845 }, { "epoch": 0.6048257372654156, "grad_norm": 0.0196290984749794, "learning_rate": 9.986533956411372e-05, "loss": 0.0211, "step": 846 }, { "epoch": 0.6055406613047364, "grad_norm": 0.021892940625548363, "learning_rate": 9.986350205454586e-05, "loss": 0.0232, "step": 847 }, { "epoch": 0.6062555853440572, "grad_norm": 0.02047782577574253, "learning_rate": 9.98616521100538e-05, "loss": 0.0168, "step": 848 }, { "epoch": 0.606970509383378, "grad_norm": 0.021944131702184677, "learning_rate": 9.985978973109884e-05, "loss": 0.0218, "step": 849 }, { "epoch": 
0.6076854334226989, "grad_norm": 0.016735484823584557, "learning_rate": 9.985791491814544e-05, "loss": 0.0134, "step": 850 }, { "epoch": 0.6076854334226989, "eval_loss": 0.013885474763810635, "eval_runtime": 4.5814, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 850 }, { "epoch": 0.6084003574620197, "grad_norm": 0.02362712100148201, "learning_rate": 9.985602767166113e-05, "loss": 0.0146, "step": 851 }, { "epoch": 0.6091152815013405, "grad_norm": 0.021844301372766495, "learning_rate": 9.985412799211658e-05, "loss": 0.0254, "step": 852 }, { "epoch": 0.6098302055406613, "grad_norm": 0.015689833089709282, "learning_rate": 9.985221587998548e-05, "loss": 0.0178, "step": 853 }, { "epoch": 0.6105451295799821, "grad_norm": 0.017171623185276985, "learning_rate": 9.98502913357447e-05, "loss": 0.0179, "step": 854 }, { "epoch": 0.6112600536193029, "grad_norm": 0.016973325982689857, "learning_rate": 9.984835435987419e-05, "loss": 0.0129, "step": 855 }, { "epoch": 0.6112600536193029, "eval_loss": 0.013548342511057854, "eval_runtime": 4.5912, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 855 }, { "epoch": 0.6119749776586237, "grad_norm": 0.022157641127705574, "learning_rate": 9.984640495285699e-05, "loss": 0.0164, "step": 856 }, { "epoch": 0.6126899016979446, "grad_norm": 0.014787044376134872, "learning_rate": 9.984444311517922e-05, "loss": 0.0136, "step": 857 }, { "epoch": 0.6134048257372654, "grad_norm": 0.01873241737484932, "learning_rate": 9.984246884733014e-05, "loss": 0.0212, "step": 858 }, { "epoch": 0.6141197497765862, "grad_norm": 0.01888086460530758, "learning_rate": 9.984048214980208e-05, "loss": 0.0145, "step": 859 }, { "epoch": 0.6148346738159071, "grad_norm": 0.01866905204951763, "learning_rate": 9.98384830230905e-05, "loss": 0.0157, "step": 860 }, { "epoch": 0.6148346738159071, "eval_loss": 0.013571875169873238, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 860 }, { "epoch": 0.6155495978552279, "grad_norm": 0.01872611977159977, "learning_rate": 9.98364714676939e-05, "loss": 0.0207, "step": 861 }, { "epoch": 0.6162645218945487, "grad_norm": 0.01769580878317356, "learning_rate": 9.983444748411398e-05, "loss": 0.0191, "step": 862 }, { "epoch": 0.6169794459338696, "grad_norm": 0.016959181055426598, "learning_rate": 9.983241107285543e-05, "loss": 0.0137, "step": 863 }, { "epoch": 0.6176943699731904, "grad_norm": 0.018694031983613968, "learning_rate": 9.983036223442613e-05, "loss": 0.0139, "step": 864 }, { "epoch": 0.6184092940125112, "grad_norm": 0.01987854763865471, "learning_rate": 9.982830096933697e-05, "loss": 0.0156, "step": 865 }, { "epoch": 0.6184092940125112, "eval_loss": 0.013466768898069859, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 865 }, { "epoch": 0.619124218051832, "grad_norm": 0.017529862001538277, "learning_rate": 9.982622727810202e-05, "loss": 0.0199, "step": 866 }, { "epoch": 0.6198391420911528, "grad_norm": 0.020655227825045586, "learning_rate": 9.98241411612384e-05, "loss": 0.0165, "step": 867 }, { "epoch": 0.6205540661304736, "grad_norm": 0.019189968705177307, "learning_rate": 9.982204261926636e-05, "loss": 0.0194, "step": 868 }, { "epoch": 0.6212689901697944, "grad_norm": 0.02493584156036377, "learning_rate": 9.981993165270922e-05, "loss": 0.0268, "step": 869 }, { "epoch": 0.6219839142091153, "grad_norm": 0.014887025579810143, "learning_rate": 9.981780826209341e-05, "loss": 0.0197, "step": 870 }, { "epoch": 
0.6219839142091153, "eval_loss": 0.01340461429208517, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 870 }, { "epoch": 0.6226988382484361, "grad_norm": 0.017642315477132797, "learning_rate": 9.98156724479485e-05, "loss": 0.0158, "step": 871 }, { "epoch": 0.6234137622877569, "grad_norm": 0.01842522621154785, "learning_rate": 9.981352421080705e-05, "loss": 0.0249, "step": 872 }, { "epoch": 0.6241286863270777, "grad_norm": 0.01807309314608574, "learning_rate": 9.981136355120482e-05, "loss": 0.0181, "step": 873 }, { "epoch": 0.6248436103663986, "grad_norm": 0.0179002545773983, "learning_rate": 9.980919046968064e-05, "loss": 0.0208, "step": 874 }, { "epoch": 0.6255585344057194, "grad_norm": 0.02142852172255516, "learning_rate": 9.980700496677642e-05, "loss": 0.0239, "step": 875 }, { "epoch": 0.6255585344057194, "eval_loss": 0.013587336987257004, "eval_runtime": 4.6183, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.815, "step": 875 }, { "epoch": 0.6262734584450402, "grad_norm": 0.016590388491749763, "learning_rate": 9.980480704303718e-05, "loss": 0.0192, "step": 876 }, { "epoch": 0.6269883824843611, "grad_norm": 0.017502179369330406, "learning_rate": 9.980259669901105e-05, "loss": 0.023, "step": 877 }, { "epoch": 0.6277033065236819, "grad_norm": 0.01620560698211193, "learning_rate": 9.980037393524923e-05, "loss": 0.0178, "step": 878 }, { "epoch": 0.6284182305630027, "grad_norm": 0.017715753987431526, "learning_rate": 9.979813875230604e-05, "loss": 0.0162, "step": 879 }, { "epoch": 0.6291331546023236, "grad_norm": 0.0252090897411108, "learning_rate": 9.979589115073887e-05, "loss": 0.0245, "step": 880 }, { "epoch": 0.6291331546023236, "eval_loss": 0.013596376404166222, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 880 }, { "epoch": 0.6298480786416443, "grad_norm": 0.017695730552077293, "learning_rate": 9.979363113110824e-05, "loss": 0.0151, "step": 881 }, { "epoch": 0.6305630026809651, "grad_norm": 0.019449256360530853, "learning_rate": 9.979135869397777e-05, "loss": 0.0239, "step": 882 }, { "epoch": 0.6312779267202859, "grad_norm": 0.016306033357977867, "learning_rate": 9.978907383991411e-05, "loss": 0.0187, "step": 883 }, { "epoch": 0.6319928507596068, "grad_norm": 0.01748051866889, "learning_rate": 9.978677656948711e-05, "loss": 0.0123, "step": 884 }, { "epoch": 0.6327077747989276, "grad_norm": 0.016561226919293404, "learning_rate": 9.978446688326964e-05, "loss": 0.0195, "step": 885 }, { "epoch": 0.6327077747989276, "eval_loss": 0.013731181621551514, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 885 }, { "epoch": 0.6334226988382484, "grad_norm": 0.019130971282720566, "learning_rate": 9.978214478183767e-05, "loss": 0.0163, "step": 886 }, { "epoch": 0.6341376228775693, "grad_norm": 0.017880074679851532, "learning_rate": 9.97798102657703e-05, "loss": 0.0183, "step": 887 }, { "epoch": 0.6348525469168901, "grad_norm": 0.024524275213479996, "learning_rate": 9.97774633356497e-05, "loss": 0.0162, "step": 888 }, { "epoch": 0.6355674709562109, "grad_norm": 0.01558056939393282, "learning_rate": 9.977510399206116e-05, "loss": 0.019, "step": 889 }, { "epoch": 0.6362823949955317, "grad_norm": 0.02019326202571392, "learning_rate": 9.977273223559306e-05, "loss": 0.0194, "step": 890 }, { "epoch": 0.6362823949955317, "eval_loss": 0.013816356658935547, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, 
"step": 890 }, { "epoch": 0.6369973190348526, "grad_norm": 0.020643223077058792, "learning_rate": 9.977034806683684e-05, "loss": 0.0214, "step": 891 }, { "epoch": 0.6377122430741734, "grad_norm": 0.02281387895345688, "learning_rate": 9.976795148638707e-05, "loss": 0.0171, "step": 892 }, { "epoch": 0.6384271671134942, "grad_norm": 0.019297756254673004, "learning_rate": 9.976554249484143e-05, "loss": 0.0217, "step": 893 }, { "epoch": 0.6391420911528151, "grad_norm": 0.01947975531220436, "learning_rate": 9.976312109280064e-05, "loss": 0.0195, "step": 894 }, { "epoch": 0.6398570151921358, "grad_norm": 0.015272825956344604, "learning_rate": 9.976068728086856e-05, "loss": 0.0171, "step": 895 }, { "epoch": 0.6398570151921358, "eval_loss": 0.01327471062541008, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 895 }, { "epoch": 0.6405719392314566, "grad_norm": 0.01475564669817686, "learning_rate": 9.975824105965214e-05, "loss": 0.0194, "step": 896 }, { "epoch": 0.6412868632707774, "grad_norm": 0.020429205149412155, "learning_rate": 9.97557824297614e-05, "loss": 0.0292, "step": 897 }, { "epoch": 0.6420017873100983, "grad_norm": 0.020674167200922966, "learning_rate": 9.97533113918095e-05, "loss": 0.0192, "step": 898 }, { "epoch": 0.6427167113494191, "grad_norm": 0.016586139798164368, "learning_rate": 9.975082794641264e-05, "loss": 0.0115, "step": 899 }, { "epoch": 0.6434316353887399, "grad_norm": 0.02056541107594967, "learning_rate": 9.974833209419015e-05, "loss": 0.0227, "step": 900 }, { "epoch": 0.6434316353887399, "eval_loss": 0.013134147971868515, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 900 }, { "epoch": 0.6441465594280608, "grad_norm": 0.01586805284023285, "learning_rate": 9.974582383576445e-05, "loss": 0.0123, "step": 901 }, { "epoch": 0.6448614834673816, "grad_norm": 0.01689736731350422, "learning_rate": 9.974330317176102e-05, "loss": 0.017, "step": 902 }, { "epoch": 0.6455764075067024, "grad_norm": 0.020502135157585144, "learning_rate": 9.97407701028085e-05, "loss": 0.0208, "step": 903 }, { "epoch": 0.6462913315460233, "grad_norm": 0.015404821373522282, "learning_rate": 9.973822462953856e-05, "loss": 0.0145, "step": 904 }, { "epoch": 0.6470062555853441, "grad_norm": 0.015708046033978462, "learning_rate": 9.973566675258598e-05, "loss": 0.0184, "step": 905 }, { "epoch": 0.6470062555853441, "eval_loss": 0.013001098297536373, "eval_runtime": 4.5937, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 905 }, { "epoch": 0.6477211796246649, "grad_norm": 0.019105568528175354, "learning_rate": 9.973309647258868e-05, "loss": 0.0163, "step": 906 }, { "epoch": 0.6484361036639857, "grad_norm": 0.016114937141537666, "learning_rate": 9.97305137901876e-05, "loss": 0.0184, "step": 907 }, { "epoch": 0.6491510277033066, "grad_norm": 0.014011161401867867, "learning_rate": 9.972791870602681e-05, "loss": 0.0102, "step": 908 }, { "epoch": 0.6498659517426274, "grad_norm": 0.021722758188843727, "learning_rate": 9.972531122075349e-05, "loss": 0.0181, "step": 909 }, { "epoch": 0.6505808757819481, "grad_norm": 0.017364485189318657, "learning_rate": 9.972269133501787e-05, "loss": 0.0147, "step": 910 }, { "epoch": 0.6505808757819481, "eval_loss": 0.013197804801166058, "eval_runtime": 4.5957, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 910 }, { "epoch": 0.651295799821269, "grad_norm": 0.013157202862203121, "learning_rate": 9.972005904947332e-05, "loss": 0.0124, 
"step": 911 }, { "epoch": 0.6520107238605898, "grad_norm": 0.014906715601682663, "learning_rate": 9.971741436477625e-05, "loss": 0.0129, "step": 912 }, { "epoch": 0.6527256478999106, "grad_norm": 0.019003622233867645, "learning_rate": 9.971475728158621e-05, "loss": 0.0175, "step": 913 }, { "epoch": 0.6534405719392314, "grad_norm": 0.0243982020765543, "learning_rate": 9.97120878005658e-05, "loss": 0.0243, "step": 914 }, { "epoch": 0.6541554959785523, "grad_norm": 0.018996085971593857, "learning_rate": 9.970940592238077e-05, "loss": 0.0196, "step": 915 }, { "epoch": 0.6541554959785523, "eval_loss": 0.013246710412204266, "eval_runtime": 4.5945, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.829, "step": 915 }, { "epoch": 0.6548704200178731, "grad_norm": 0.020707307383418083, "learning_rate": 9.970671164769989e-05, "loss": 0.018, "step": 916 }, { "epoch": 0.6555853440571939, "grad_norm": 0.01396041177213192, "learning_rate": 9.970400497719508e-05, "loss": 0.0181, "step": 917 }, { "epoch": 0.6563002680965148, "grad_norm": 0.015407705679535866, "learning_rate": 9.970128591154131e-05, "loss": 0.0105, "step": 918 }, { "epoch": 0.6570151921358356, "grad_norm": 0.014632593840360641, "learning_rate": 9.969855445141666e-05, "loss": 0.016, "step": 919 }, { "epoch": 0.6577301161751564, "grad_norm": 0.02139771543443203, "learning_rate": 9.96958105975023e-05, "loss": 0.0144, "step": 920 }, { "epoch": 0.6577301161751564, "eval_loss": 0.01323160994797945, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 920 }, { "epoch": 0.6584450402144773, "grad_norm": 0.017749348655343056, "learning_rate": 9.969305435048251e-05, "loss": 0.0146, "step": 921 }, { "epoch": 0.6591599642537981, "grad_norm": 0.019729603081941605, "learning_rate": 9.969028571104462e-05, "loss": 0.0212, "step": 922 }, { "epoch": 0.6598748882931189, "grad_norm": 0.020843500271439552, "learning_rate": 9.968750467987907e-05, "loss": 0.0178, "step": 923 }, { "epoch": 0.6605898123324396, "grad_norm": 0.022862199693918228, "learning_rate": 9.968471125767941e-05, "loss": 0.0184, "step": 924 }, { "epoch": 0.6613047363717605, "grad_norm": 0.015351568348705769, "learning_rate": 9.968190544514224e-05, "loss": 0.0177, "step": 925 }, { "epoch": 0.6613047363717605, "eval_loss": 0.013560887426137924, "eval_runtime": 4.5918, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 925 }, { "epoch": 0.6620196604110813, "grad_norm": 0.017159614711999893, "learning_rate": 9.967908724296729e-05, "loss": 0.0164, "step": 926 }, { "epoch": 0.6627345844504021, "grad_norm": 0.02219214290380478, "learning_rate": 9.967625665185736e-05, "loss": 0.0194, "step": 927 }, { "epoch": 0.663449508489723, "grad_norm": 0.015774821862578392, "learning_rate": 9.967341367251832e-05, "loss": 0.0123, "step": 928 }, { "epoch": 0.6641644325290438, "grad_norm": 0.01665043830871582, "learning_rate": 9.967055830565916e-05, "loss": 0.018, "step": 929 }, { "epoch": 0.6648793565683646, "grad_norm": 0.015385700389742851, "learning_rate": 9.966769055199197e-05, "loss": 0.017, "step": 930 }, { "epoch": 0.6648793565683646, "eval_loss": 0.013350050896406174, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 930 }, { "epoch": 0.6655942806076854, "grad_norm": 0.015621998347342014, "learning_rate": 9.966481041223188e-05, "loss": 0.018, "step": 931 }, { "epoch": 0.6663092046470063, "grad_norm": 0.012603702023625374, "learning_rate": 9.966191788709716e-05, "loss": 0.0138, 
"step": 932 }, { "epoch": 0.6670241286863271, "grad_norm": 0.019572192803025246, "learning_rate": 9.965901297730913e-05, "loss": 0.0154, "step": 933 }, { "epoch": 0.6677390527256479, "grad_norm": 0.014186172746121883, "learning_rate": 9.965609568359222e-05, "loss": 0.0163, "step": 934 }, { "epoch": 0.6684539767649688, "grad_norm": 0.019891846925020218, "learning_rate": 9.965316600667393e-05, "loss": 0.0179, "step": 935 }, { "epoch": 0.6684539767649688, "eval_loss": 0.013412293046712875, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 935 }, { "epoch": 0.6691689008042896, "grad_norm": 0.015193138271570206, "learning_rate": 9.965022394728487e-05, "loss": 0.0168, "step": 936 }, { "epoch": 0.6698838248436104, "grad_norm": 0.02054636925458908, "learning_rate": 9.964726950615875e-05, "loss": 0.0188, "step": 937 }, { "epoch": 0.6705987488829311, "grad_norm": 0.014719733037054539, "learning_rate": 9.964430268403231e-05, "loss": 0.0164, "step": 938 }, { "epoch": 0.671313672922252, "grad_norm": 0.014401719905436039, "learning_rate": 9.964132348164545e-05, "loss": 0.0152, "step": 939 }, { "epoch": 0.6720285969615728, "grad_norm": 0.021619169041514397, "learning_rate": 9.963833189974109e-05, "loss": 0.0294, "step": 940 }, { "epoch": 0.6720285969615728, "eval_loss": 0.013273729011416435, "eval_runtime": 4.6158, "eval_samples_per_second": 10.832, "eval_steps_per_second": 2.816, "step": 940 }, { "epoch": 0.6727435210008936, "grad_norm": 0.021811800077557564, "learning_rate": 9.963532793906528e-05, "loss": 0.0204, "step": 941 }, { "epoch": 0.6734584450402145, "grad_norm": 0.02223445288836956, "learning_rate": 9.963231160036714e-05, "loss": 0.0204, "step": 942 }, { "epoch": 0.6741733690795353, "grad_norm": 0.019224494695663452, "learning_rate": 9.962928288439891e-05, "loss": 0.0187, "step": 943 }, { "epoch": 0.6748882931188561, "grad_norm": 0.017523838207125664, "learning_rate": 9.962624179191586e-05, "loss": 0.0169, "step": 944 }, { "epoch": 0.675603217158177, "grad_norm": 0.01526600867509842, "learning_rate": 9.962318832367639e-05, "loss": 0.0219, "step": 945 }, { "epoch": 0.675603217158177, "eval_loss": 0.01316066924482584, "eval_runtime": 4.6043, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 945 }, { "epoch": 0.6763181411974978, "grad_norm": 0.016996925696730614, "learning_rate": 9.962012248044195e-05, "loss": 0.0224, "step": 946 }, { "epoch": 0.6770330652368186, "grad_norm": 0.01551771443337202, "learning_rate": 9.96170442629771e-05, "loss": 0.0167, "step": 947 }, { "epoch": 0.6777479892761394, "grad_norm": 0.018257655203342438, "learning_rate": 9.961395367204953e-05, "loss": 0.0219, "step": 948 }, { "epoch": 0.6784629133154603, "grad_norm": 0.01631532981991768, "learning_rate": 9.961085070842993e-05, "loss": 0.0242, "step": 949 }, { "epoch": 0.6791778373547811, "grad_norm": 0.01922004111111164, "learning_rate": 9.960773537289212e-05, "loss": 0.0119, "step": 950 }, { "epoch": 0.6791778373547811, "eval_loss": 0.013287574052810669, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 950 }, { "epoch": 0.6798927613941019, "grad_norm": 0.014233805239200592, "learning_rate": 9.960460766621298e-05, "loss": 0.0263, "step": 951 }, { "epoch": 0.6806076854334226, "grad_norm": 0.018956637009978294, "learning_rate": 9.960146758917254e-05, "loss": 0.0145, "step": 952 }, { "epoch": 0.6813226094727435, "grad_norm": 0.01921582594513893, "learning_rate": 9.959831514255383e-05, "loss": 0.0213, 
"step": 953 }, { "epoch": 0.6820375335120643, "grad_norm": 0.018830832093954086, "learning_rate": 9.959515032714304e-05, "loss": 0.0205, "step": 954 }, { "epoch": 0.6827524575513851, "grad_norm": 0.01834026165306568, "learning_rate": 9.959197314372937e-05, "loss": 0.0234, "step": 955 }, { "epoch": 0.6827524575513851, "eval_loss": 0.013149144127964973, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 955 }, { "epoch": 0.683467381590706, "grad_norm": 0.01701376773416996, "learning_rate": 9.958878359310517e-05, "loss": 0.0137, "step": 956 }, { "epoch": 0.6841823056300268, "grad_norm": 0.014861451461911201, "learning_rate": 9.958558167606585e-05, "loss": 0.0189, "step": 957 }, { "epoch": 0.6848972296693476, "grad_norm": 0.017918147146701813, "learning_rate": 9.95823673934099e-05, "loss": 0.0152, "step": 958 }, { "epoch": 0.6856121537086685, "grad_norm": 0.02260666713118553, "learning_rate": 9.957914074593889e-05, "loss": 0.0237, "step": 959 }, { "epoch": 0.6863270777479893, "grad_norm": 0.019879426807165146, "learning_rate": 9.957590173445746e-05, "loss": 0.0175, "step": 960 }, { "epoch": 0.6863270777479893, "eval_loss": 0.013304523192346096, "eval_runtime": 4.5969, "eval_samples_per_second": 10.877, "eval_steps_per_second": 2.828, "step": 960 }, { "epoch": 0.6870420017873101, "grad_norm": 0.01552928052842617, "learning_rate": 9.957265035977336e-05, "loss": 0.0184, "step": 961 }, { "epoch": 0.687756925826631, "grad_norm": 0.02207057550549507, "learning_rate": 9.956938662269746e-05, "loss": 0.0185, "step": 962 }, { "epoch": 0.6884718498659518, "grad_norm": 0.01808646321296692, "learning_rate": 9.956611052404363e-05, "loss": 0.0149, "step": 963 }, { "epoch": 0.6891867739052726, "grad_norm": 0.01882527396082878, "learning_rate": 9.956282206462885e-05, "loss": 0.0164, "step": 964 }, { "epoch": 0.6899016979445934, "grad_norm": 0.02707718312740326, "learning_rate": 9.955952124527321e-05, "loss": 0.0181, "step": 965 }, { "epoch": 0.6899016979445934, "eval_loss": 0.01317853294312954, "eval_runtime": 4.5988, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 965 }, { "epoch": 0.6906166219839142, "grad_norm": 0.018309596925973892, "learning_rate": 9.955620806679987e-05, "loss": 0.0221, "step": 966 }, { "epoch": 0.691331546023235, "grad_norm": 0.017888860777020454, "learning_rate": 9.955288253003508e-05, "loss": 0.0173, "step": 967 }, { "epoch": 0.6920464700625558, "grad_norm": 0.01626371219754219, "learning_rate": 9.954954463580812e-05, "loss": 0.0254, "step": 968 }, { "epoch": 0.6927613941018766, "grad_norm": 0.01643555425107479, "learning_rate": 9.954619438495143e-05, "loss": 0.0199, "step": 969 }, { "epoch": 0.6934763181411975, "grad_norm": 0.01552966795861721, "learning_rate": 9.954283177830046e-05, "loss": 0.0184, "step": 970 }, { "epoch": 0.6934763181411975, "eval_loss": 0.012919052504003048, "eval_runtime": 4.5978, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.827, "step": 970 }, { "epoch": 0.6941912421805183, "grad_norm": 0.018453221768140793, "learning_rate": 9.95394568166938e-05, "loss": 0.0254, "step": 971 }, { "epoch": 0.6949061662198391, "grad_norm": 0.019364111125469208, "learning_rate": 9.95360695009731e-05, "loss": 0.0175, "step": 972 }, { "epoch": 0.69562109025916, "grad_norm": 0.01790689490735531, "learning_rate": 9.953266983198306e-05, "loss": 0.0241, "step": 973 }, { "epoch": 0.6963360142984808, "grad_norm": 0.014519740827381611, "learning_rate": 9.952925781057152e-05, "loss": 0.0217, "step": 
974 }, { "epoch": 0.6970509383378016, "grad_norm": 0.01221796590834856, "learning_rate": 9.952583343758934e-05, "loss": 0.0112, "step": 975 }, { "epoch": 0.6970509383378016, "eval_loss": 0.01296248845756054, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 975 }, { "epoch": 0.6977658623771225, "grad_norm": 0.016995519399642944, "learning_rate": 9.952239671389048e-05, "loss": 0.0155, "step": 976 }, { "epoch": 0.6984807864164433, "grad_norm": 0.019561054185032845, "learning_rate": 9.951894764033203e-05, "loss": 0.0252, "step": 977 }, { "epoch": 0.6991957104557641, "grad_norm": 0.015065398998558521, "learning_rate": 9.951548621777408e-05, "loss": 0.0237, "step": 978 }, { "epoch": 0.699910634495085, "grad_norm": 0.015538624487817287, "learning_rate": 9.951201244707984e-05, "loss": 0.0248, "step": 979 }, { "epoch": 0.7006255585344057, "grad_norm": 0.021459441632032394, "learning_rate": 9.950852632911562e-05, "loss": 0.0236, "step": 980 }, { "epoch": 0.7006255585344057, "eval_loss": 0.013056116178631783, "eval_runtime": 4.5896, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 980 }, { "epoch": 0.7013404825737265, "grad_norm": 0.016892995685338974, "learning_rate": 9.950502786475078e-05, "loss": 0.0162, "step": 981 }, { "epoch": 0.7020554066130473, "grad_norm": 0.019635247066617012, "learning_rate": 9.950151705485773e-05, "loss": 0.0264, "step": 982 }, { "epoch": 0.7027703306523682, "grad_norm": 0.01426888071000576, "learning_rate": 9.949799390031203e-05, "loss": 0.0173, "step": 983 }, { "epoch": 0.703485254691689, "grad_norm": 0.015148122794926167, "learning_rate": 9.949445840199227e-05, "loss": 0.0183, "step": 984 }, { "epoch": 0.7042001787310098, "grad_norm": 0.01349672582000494, "learning_rate": 9.949091056078012e-05, "loss": 0.0162, "step": 985 }, { "epoch": 0.7042001787310098, "eval_loss": 0.01328415796160698, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 985 }, { "epoch": 0.7049151027703306, "grad_norm": 0.014387589879333973, "learning_rate": 9.948735037756037e-05, "loss": 0.0117, "step": 986 }, { "epoch": 0.7056300268096515, "grad_norm": 0.016436470672488213, "learning_rate": 9.948377785322082e-05, "loss": 0.0229, "step": 987 }, { "epoch": 0.7063449508489723, "grad_norm": 0.018741745501756668, "learning_rate": 9.94801929886524e-05, "loss": 0.0171, "step": 988 }, { "epoch": 0.7070598748882931, "grad_norm": 0.014291156083345413, "learning_rate": 9.94765957847491e-05, "loss": 0.0194, "step": 989 }, { "epoch": 0.707774798927614, "grad_norm": 0.01962197758257389, "learning_rate": 9.9472986242408e-05, "loss": 0.0131, "step": 990 }, { "epoch": 0.707774798927614, "eval_loss": 0.0129645811393857, "eval_runtime": 4.6032, "eval_samples_per_second": 10.862, "eval_steps_per_second": 2.824, "step": 990 }, { "epoch": 0.7084897229669348, "grad_norm": 0.015018360689282417, "learning_rate": 9.946936436252922e-05, "loss": 0.0117, "step": 991 }, { "epoch": 0.7092046470062556, "grad_norm": 0.017634445801377296, "learning_rate": 9.946573014601599e-05, "loss": 0.0275, "step": 992 }, { "epoch": 0.7099195710455765, "grad_norm": 0.019072746858000755, "learning_rate": 9.946208359377462e-05, "loss": 0.0193, "step": 993 }, { "epoch": 0.7106344950848972, "grad_norm": 0.015899088233709335, "learning_rate": 9.945842470671447e-05, "loss": 0.0131, "step": 994 }, { "epoch": 0.711349419124218, "grad_norm": 0.018466776236891747, "learning_rate": 9.9454753485748e-05, "loss": 0.018, "step": 995 }, { 
"epoch": 0.711349419124218, "eval_loss": 0.013130458071827888, "eval_runtime": 4.5946, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 995 }, { "epoch": 0.7120643431635388, "grad_norm": 0.01506463997066021, "learning_rate": 9.945106993179074e-05, "loss": 0.0116, "step": 996 }, { "epoch": 0.7127792672028597, "grad_norm": 0.020332863554358482, "learning_rate": 9.944737404576128e-05, "loss": 0.0195, "step": 997 }, { "epoch": 0.7134941912421805, "grad_norm": 0.019191335886716843, "learning_rate": 9.94436658285813e-05, "loss": 0.015, "step": 998 }, { "epoch": 0.7142091152815013, "grad_norm": 0.019023068249225616, "learning_rate": 9.943994528117557e-05, "loss": 0.0141, "step": 999 }, { "epoch": 0.7149240393208222, "grad_norm": 0.018235763534903526, "learning_rate": 9.943621240447189e-05, "loss": 0.0133, "step": 1000 }, { "epoch": 0.7149240393208222, "eval_loss": 0.012929772026836872, "eval_runtime": 4.5884, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1000 }, { "epoch": 0.715638963360143, "grad_norm": 0.015487240627408028, "learning_rate": 9.943246719940117e-05, "loss": 0.0124, "step": 1001 }, { "epoch": 0.7163538873994638, "grad_norm": 0.016491392627358437, "learning_rate": 9.942870966689742e-05, "loss": 0.0213, "step": 1002 }, { "epoch": 0.7170688114387846, "grad_norm": 0.018275201320648193, "learning_rate": 9.942493980789762e-05, "loss": 0.0215, "step": 1003 }, { "epoch": 0.7177837354781055, "grad_norm": 0.015154272317886353, "learning_rate": 9.942115762334196e-05, "loss": 0.0151, "step": 1004 }, { "epoch": 0.7184986595174263, "grad_norm": 0.019671974703669548, "learning_rate": 9.941736311417362e-05, "loss": 0.0185, "step": 1005 }, { "epoch": 0.7184986595174263, "eval_loss": 0.012879117392003536, "eval_runtime": 4.5875, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1005 }, { "epoch": 0.7192135835567471, "grad_norm": 0.017227543517947197, "learning_rate": 9.941355628133887e-05, "loss": 0.021, "step": 1006 }, { "epoch": 0.719928507596068, "grad_norm": 0.01779993064701557, "learning_rate": 9.940973712578705e-05, "loss": 0.0222, "step": 1007 }, { "epoch": 0.7206434316353887, "grad_norm": 0.01641230285167694, "learning_rate": 9.940590564847058e-05, "loss": 0.0209, "step": 1008 }, { "epoch": 0.7213583556747095, "grad_norm": 0.015454607084393501, "learning_rate": 9.940206185034495e-05, "loss": 0.0226, "step": 1009 }, { "epoch": 0.7220732797140303, "grad_norm": 0.017235999926924706, "learning_rate": 9.939820573236872e-05, "loss": 0.0171, "step": 1010 }, { "epoch": 0.7220732797140303, "eval_loss": 0.013088079169392586, "eval_runtime": 4.5914, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 1010 }, { "epoch": 0.7227882037533512, "grad_norm": 0.014527088031172752, "learning_rate": 9.939433729550354e-05, "loss": 0.0197, "step": 1011 }, { "epoch": 0.723503127792672, "grad_norm": 0.017687540501356125, "learning_rate": 9.939045654071411e-05, "loss": 0.0154, "step": 1012 }, { "epoch": 0.7242180518319928, "grad_norm": 0.014927665702998638, "learning_rate": 9.938656346896821e-05, "loss": 0.0105, "step": 1013 }, { "epoch": 0.7249329758713137, "grad_norm": 0.01453486829996109, "learning_rate": 9.938265808123669e-05, "loss": 0.0194, "step": 1014 }, { "epoch": 0.7256478999106345, "grad_norm": 0.012870998121798038, "learning_rate": 9.937874037849347e-05, "loss": 0.0188, "step": 1015 }, { "epoch": 0.7256478999106345, "eval_loss": 0.012787445448338985, "eval_runtime": 4.5879, "eval_samples_per_second": 
10.898, "eval_steps_per_second": 2.834, "step": 1015 }, { "epoch": 0.7263628239499553, "grad_norm": 0.019519731402397156, "learning_rate": 9.937481036171554e-05, "loss": 0.0274, "step": 1016 }, { "epoch": 0.7270777479892762, "grad_norm": 0.01645946502685547, "learning_rate": 9.937086803188299e-05, "loss": 0.0263, "step": 1017 }, { "epoch": 0.727792672028597, "grad_norm": 0.019587570801377296, "learning_rate": 9.936691338997894e-05, "loss": 0.0208, "step": 1018 }, { "epoch": 0.7285075960679178, "grad_norm": 0.01838281750679016, "learning_rate": 9.936294643698957e-05, "loss": 0.0149, "step": 1019 }, { "epoch": 0.7292225201072386, "grad_norm": 0.015341089107096195, "learning_rate": 9.93589671739042e-05, "loss": 0.0254, "step": 1020 }, { "epoch": 0.7292225201072386, "eval_loss": 0.01304134912788868, "eval_runtime": 4.6021, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 1020 }, { "epoch": 0.7299374441465595, "grad_norm": 0.015371466055512428, "learning_rate": 9.935497560171516e-05, "loss": 0.0134, "step": 1021 }, { "epoch": 0.7306523681858802, "grad_norm": 0.01509457640349865, "learning_rate": 9.935097172141785e-05, "loss": 0.0163, "step": 1022 }, { "epoch": 0.731367292225201, "grad_norm": 0.01874482072889805, "learning_rate": 9.934695553401077e-05, "loss": 0.0213, "step": 1023 }, { "epoch": 0.7320822162645219, "grad_norm": 0.01784847490489483, "learning_rate": 9.934292704049546e-05, "loss": 0.0147, "step": 1024 }, { "epoch": 0.7327971403038427, "grad_norm": 0.016628554090857506, "learning_rate": 9.933888624187655e-05, "loss": 0.0163, "step": 1025 }, { "epoch": 0.7327971403038427, "eval_loss": 0.012877585366368294, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 1025 }, { "epoch": 0.7335120643431635, "grad_norm": 0.01942230761051178, "learning_rate": 9.933483313916174e-05, "loss": 0.0137, "step": 1026 }, { "epoch": 0.7342269883824843, "grad_norm": 0.017600059509277344, "learning_rate": 9.933076773336179e-05, "loss": 0.0333, "step": 1027 }, { "epoch": 0.7349419124218052, "grad_norm": 0.024424372240900993, "learning_rate": 9.932669002549051e-05, "loss": 0.0159, "step": 1028 }, { "epoch": 0.735656836461126, "grad_norm": 0.021110862493515015, "learning_rate": 9.932260001656482e-05, "loss": 0.0216, "step": 1029 }, { "epoch": 0.7363717605004468, "grad_norm": 0.013889040797948837, "learning_rate": 9.931849770760466e-05, "loss": 0.0175, "step": 1030 }, { "epoch": 0.7363717605004468, "eval_loss": 0.012919253669679165, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 1030 }, { "epoch": 0.7370866845397677, "grad_norm": 0.01945733278989792, "learning_rate": 9.931438309963308e-05, "loss": 0.0138, "step": 1031 }, { "epoch": 0.7378016085790885, "grad_norm": 0.015440138056874275, "learning_rate": 9.931025619367616e-05, "loss": 0.015, "step": 1032 }, { "epoch": 0.7385165326184093, "grad_norm": 0.019123395904898643, "learning_rate": 9.930611699076309e-05, "loss": 0.0208, "step": 1033 }, { "epoch": 0.7392314566577302, "grad_norm": 0.016466479748487473, "learning_rate": 9.930196549192608e-05, "loss": 0.0174, "step": 1034 }, { "epoch": 0.739946380697051, "grad_norm": 0.02327265776693821, "learning_rate": 9.929780169820041e-05, "loss": 0.0174, "step": 1035 }, { "epoch": 0.739946380697051, "eval_loss": 0.012794435024261475, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 1035 }, { "epoch": 0.7406613047363717, "grad_norm": 
0.020832885056734085, "learning_rate": 9.92936256106245e-05, "loss": 0.0138, "step": 1036 }, { "epoch": 0.7413762287756925, "grad_norm": 0.021356064826250076, "learning_rate": 9.928943723023973e-05, "loss": 0.0196, "step": 1037 }, { "epoch": 0.7420911528150134, "grad_norm": 0.022449277341365814, "learning_rate": 9.928523655809062e-05, "loss": 0.0225, "step": 1038 }, { "epoch": 0.7428060768543342, "grad_norm": 0.02295546792447567, "learning_rate": 9.928102359522473e-05, "loss": 0.0165, "step": 1039 }, { "epoch": 0.743521000893655, "grad_norm": 0.01574295572936535, "learning_rate": 9.927679834269266e-05, "loss": 0.0124, "step": 1040 }, { "epoch": 0.743521000893655, "eval_loss": 0.01299627497792244, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1040 }, { "epoch": 0.7442359249329759, "grad_norm": 0.015496481209993362, "learning_rate": 9.927256080154812e-05, "loss": 0.0178, "step": 1041 }, { "epoch": 0.7449508489722967, "grad_norm": 0.019593220204114914, "learning_rate": 9.926831097284788e-05, "loss": 0.0194, "step": 1042 }, { "epoch": 0.7456657730116175, "grad_norm": 0.021282188594341278, "learning_rate": 9.926404885765175e-05, "loss": 0.0196, "step": 1043 }, { "epoch": 0.7463806970509383, "grad_norm": 0.015659915283322334, "learning_rate": 9.92597744570226e-05, "loss": 0.0169, "step": 1044 }, { "epoch": 0.7470956210902592, "grad_norm": 0.017424210906028748, "learning_rate": 9.925548777202635e-05, "loss": 0.015, "step": 1045 }, { "epoch": 0.7470956210902592, "eval_loss": 0.013067936524748802, "eval_runtime": 4.5861, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1045 }, { "epoch": 0.74781054512958, "grad_norm": 0.014710649847984314, "learning_rate": 9.925118880373208e-05, "loss": 0.0141, "step": 1046 }, { "epoch": 0.7485254691689008, "grad_norm": 0.015139307826757431, "learning_rate": 9.924687755321182e-05, "loss": 0.0123, "step": 1047 }, { "epoch": 0.7492403932082217, "grad_norm": 0.015488872304558754, "learning_rate": 9.924255402154072e-05, "loss": 0.0234, "step": 1048 }, { "epoch": 0.7499553172475425, "grad_norm": 0.014725388027727604, "learning_rate": 9.923821820979695e-05, "loss": 0.0226, "step": 1049 }, { "epoch": 0.7506702412868632, "grad_norm": 0.01600436307489872, "learning_rate": 9.923387011906182e-05, "loss": 0.0187, "step": 1050 }, { "epoch": 0.7506702412868632, "eval_loss": 0.013181782327592373, "eval_runtime": 4.6111, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 1050 }, { "epoch": 0.751385165326184, "grad_norm": 0.0163298100233078, "learning_rate": 9.922950975041963e-05, "loss": 0.0157, "step": 1051 }, { "epoch": 0.7521000893655049, "grad_norm": 0.01643788255751133, "learning_rate": 9.922513710495773e-05, "loss": 0.0135, "step": 1052 }, { "epoch": 0.7528150134048257, "grad_norm": 0.014153429307043552, "learning_rate": 9.922075218376664e-05, "loss": 0.0119, "step": 1053 }, { "epoch": 0.7535299374441465, "grad_norm": 0.017814205959439278, "learning_rate": 9.921635498793982e-05, "loss": 0.0239, "step": 1054 }, { "epoch": 0.7542448614834674, "grad_norm": 0.017219269648194313, "learning_rate": 9.921194551857383e-05, "loss": 0.0169, "step": 1055 }, { "epoch": 0.7542448614834674, "eval_loss": 0.013192305341362953, "eval_runtime": 4.5945, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 1055 }, { "epoch": 0.7549597855227882, "grad_norm": 0.02366054058074951, "learning_rate": 9.920752377676834e-05, "loss": 0.0163, "step": 1056 }, { "epoch": 
0.755674709562109, "grad_norm": 0.020575348287820816, "learning_rate": 9.920308976362602e-05, "loss": 0.0226, "step": 1057 }, { "epoch": 0.7563896336014299, "grad_norm": 0.016648348420858383, "learning_rate": 9.919864348025261e-05, "loss": 0.0207, "step": 1058 }, { "epoch": 0.7571045576407507, "grad_norm": 0.015018022619187832, "learning_rate": 9.919418492775694e-05, "loss": 0.0149, "step": 1059 }, { "epoch": 0.7578194816800715, "grad_norm": 0.017270827665925026, "learning_rate": 9.918971410725087e-05, "loss": 0.0236, "step": 1060 }, { "epoch": 0.7578194816800715, "eval_loss": 0.013197843916714191, "eval_runtime": 4.6362, "eval_samples_per_second": 10.785, "eval_steps_per_second": 2.804, "step": 1060 }, { "epoch": 0.7585344057193923, "grad_norm": 0.019134074449539185, "learning_rate": 9.918523101984933e-05, "loss": 0.0158, "step": 1061 }, { "epoch": 0.7592493297587132, "grad_norm": 0.012906530871987343, "learning_rate": 9.918073566667032e-05, "loss": 0.0125, "step": 1062 }, { "epoch": 0.759964253798034, "grad_norm": 0.015632933005690575, "learning_rate": 9.917622804883489e-05, "loss": 0.0257, "step": 1063 }, { "epoch": 0.7606791778373548, "grad_norm": 0.01914781704545021, "learning_rate": 9.917170816746712e-05, "loss": 0.0178, "step": 1064 }, { "epoch": 0.7613941018766756, "grad_norm": 0.01811205968260765, "learning_rate": 9.916717602369419e-05, "loss": 0.0174, "step": 1065 }, { "epoch": 0.7613941018766756, "eval_loss": 0.0132159274071455, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 1065 }, { "epoch": 0.7621090259159964, "grad_norm": 0.015020554885268211, "learning_rate": 9.916263161864634e-05, "loss": 0.0131, "step": 1066 }, { "epoch": 0.7628239499553172, "grad_norm": 0.01748267374932766, "learning_rate": 9.915807495345681e-05, "loss": 0.0198, "step": 1067 }, { "epoch": 0.763538873994638, "grad_norm": 0.016704736277461052, "learning_rate": 9.9153506029262e-05, "loss": 0.0206, "step": 1068 }, { "epoch": 0.7642537980339589, "grad_norm": 0.02370535582304001, "learning_rate": 9.914892484720124e-05, "loss": 0.0189, "step": 1069 }, { "epoch": 0.7649687220732797, "grad_norm": 0.01850125752389431, "learning_rate": 9.914433140841701e-05, "loss": 0.0141, "step": 1070 }, { "epoch": 0.7649687220732797, "eval_loss": 0.013115628622472286, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1070 }, { "epoch": 0.7656836461126005, "grad_norm": 0.016201503574848175, "learning_rate": 9.913972571405482e-05, "loss": 0.016, "step": 1071 }, { "epoch": 0.7663985701519214, "grad_norm": 0.016707967966794968, "learning_rate": 9.913510776526323e-05, "loss": 0.0151, "step": 1072 }, { "epoch": 0.7671134941912422, "grad_norm": 0.02286737784743309, "learning_rate": 9.913047756319387e-05, "loss": 0.0211, "step": 1073 }, { "epoch": 0.767828418230563, "grad_norm": 0.02424740605056286, "learning_rate": 9.912583510900143e-05, "loss": 0.0211, "step": 1074 }, { "epoch": 0.7685433422698839, "grad_norm": 0.018522141501307487, "learning_rate": 9.912118040384359e-05, "loss": 0.0152, "step": 1075 }, { "epoch": 0.7685433422698839, "eval_loss": 0.01324245985597372, "eval_runtime": 4.5882, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 1075 }, { "epoch": 0.7692582663092047, "grad_norm": 0.021939905360341072, "learning_rate": 9.911651344888118e-05, "loss": 0.0213, "step": 1076 }, { "epoch": 0.7699731903485255, "grad_norm": 0.018602414056658745, "learning_rate": 9.911183424527801e-05, "loss": 0.0246, "step": 
1077 }, { "epoch": 0.7706881143878463, "grad_norm": 0.014333096332848072, "learning_rate": 9.910714279420102e-05, "loss": 0.0183, "step": 1078 }, { "epoch": 0.7714030384271671, "grad_norm": 0.018102655187249184, "learning_rate": 9.910243909682013e-05, "loss": 0.0123, "step": 1079 }, { "epoch": 0.7721179624664879, "grad_norm": 0.022231295704841614, "learning_rate": 9.909772315430837e-05, "loss": 0.0169, "step": 1080 }, { "epoch": 0.7721179624664879, "eval_loss": 0.013282675296068192, "eval_runtime": 4.593, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1080 }, { "epoch": 0.7728328865058087, "grad_norm": 0.014143445529043674, "learning_rate": 9.909299496784177e-05, "loss": 0.0188, "step": 1081 }, { "epoch": 0.7735478105451296, "grad_norm": 0.015772515907883644, "learning_rate": 9.908825453859945e-05, "loss": 0.0204, "step": 1082 }, { "epoch": 0.7742627345844504, "grad_norm": 0.016723453998565674, "learning_rate": 9.908350186776356e-05, "loss": 0.0171, "step": 1083 }, { "epoch": 0.7749776586237712, "grad_norm": 0.020076585933566093, "learning_rate": 9.907873695651936e-05, "loss": 0.0244, "step": 1084 }, { "epoch": 0.775692582663092, "grad_norm": 0.014836159534752369, "learning_rate": 9.907395980605509e-05, "loss": 0.0125, "step": 1085 }, { "epoch": 0.775692582663092, "eval_loss": 0.013181062415242195, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1085 }, { "epoch": 0.7764075067024129, "grad_norm": 0.013040046207606792, "learning_rate": 9.906917041756207e-05, "loss": 0.0123, "step": 1086 }, { "epoch": 0.7771224307417337, "grad_norm": 0.02214786224067211, "learning_rate": 9.906436879223469e-05, "loss": 0.0194, "step": 1087 }, { "epoch": 0.7778373547810545, "grad_norm": 0.018048595637083054, "learning_rate": 9.905955493127036e-05, "loss": 0.021, "step": 1088 }, { "epoch": 0.7785522788203754, "grad_norm": 0.01766500063240528, "learning_rate": 9.905472883586958e-05, "loss": 0.0158, "step": 1089 }, { "epoch": 0.7792672028596962, "grad_norm": 0.014359849505126476, "learning_rate": 9.904989050723583e-05, "loss": 0.0133, "step": 1090 }, { "epoch": 0.7792672028596962, "eval_loss": 0.012881400063633919, "eval_runtime": 4.5921, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 1090 }, { "epoch": 0.779982126899017, "grad_norm": 0.01590855047106743, "learning_rate": 9.904503994657575e-05, "loss": 0.017, "step": 1091 }, { "epoch": 0.7806970509383379, "grad_norm": 0.02012617699801922, "learning_rate": 9.904017715509894e-05, "loss": 0.0236, "step": 1092 }, { "epoch": 0.7814119749776586, "grad_norm": 0.017871851101517677, "learning_rate": 9.903530213401806e-05, "loss": 0.0168, "step": 1093 }, { "epoch": 0.7821268990169794, "grad_norm": 0.014688068069517612, "learning_rate": 9.903041488454888e-05, "loss": 0.0119, "step": 1094 }, { "epoch": 0.7828418230563002, "grad_norm": 0.01447278168052435, "learning_rate": 9.902551540791015e-05, "loss": 0.0228, "step": 1095 }, { "epoch": 0.7828418230563002, "eval_loss": 0.012807986699044704, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1095 }, { "epoch": 0.7835567470956211, "grad_norm": 0.01373728085309267, "learning_rate": 9.902060370532371e-05, "loss": 0.0125, "step": 1096 }, { "epoch": 0.7842716711349419, "grad_norm": 0.015243184752762318, "learning_rate": 9.901567977801444e-05, "loss": 0.0154, "step": 1097 }, { "epoch": 0.7849865951742627, "grad_norm": 0.01745017245411873, "learning_rate": 9.901074362721024e-05, 
"loss": 0.0149, "step": 1098 }, { "epoch": 0.7857015192135836, "grad_norm": 0.015697868540883064, "learning_rate": 9.900579525414213e-05, "loss": 0.0197, "step": 1099 }, { "epoch": 0.7864164432529044, "grad_norm": 0.016915321350097656, "learning_rate": 9.900083466004409e-05, "loss": 0.0213, "step": 1100 }, { "epoch": 0.7864164432529044, "eval_loss": 0.012908066622912884, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 1100 }, { "epoch": 0.7871313672922252, "grad_norm": 0.016940834000706673, "learning_rate": 9.89958618461532e-05, "loss": 0.0164, "step": 1101 }, { "epoch": 0.787846291331546, "grad_norm": 0.02571275271475315, "learning_rate": 9.899087681370958e-05, "loss": 0.0151, "step": 1102 }, { "epoch": 0.7885612153708669, "grad_norm": 0.013230395503342152, "learning_rate": 9.898587956395638e-05, "loss": 0.0153, "step": 1103 }, { "epoch": 0.7892761394101877, "grad_norm": 0.013887111097574234, "learning_rate": 9.898087009813985e-05, "loss": 0.0164, "step": 1104 }, { "epoch": 0.7899910634495085, "grad_norm": 0.013946586288511753, "learning_rate": 9.897584841750921e-05, "loss": 0.0234, "step": 1105 }, { "epoch": 0.7899910634495085, "eval_loss": 0.013114603236317635, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1105 }, { "epoch": 0.7907059874888294, "grad_norm": 0.018889471888542175, "learning_rate": 9.897081452331678e-05, "loss": 0.0187, "step": 1106 }, { "epoch": 0.7914209115281501, "grad_norm": 0.016870107501745224, "learning_rate": 9.89657684168179e-05, "loss": 0.0266, "step": 1107 }, { "epoch": 0.7921358355674709, "grad_norm": 0.014644318260252476, "learning_rate": 9.896071009927097e-05, "loss": 0.0169, "step": 1108 }, { "epoch": 0.7928507596067917, "grad_norm": 0.014136535115540028, "learning_rate": 9.895563957193743e-05, "loss": 0.0198, "step": 1109 }, { "epoch": 0.7935656836461126, "grad_norm": 0.019501252099871635, "learning_rate": 9.895055683608175e-05, "loss": 0.0125, "step": 1110 }, { "epoch": 0.7935656836461126, "eval_loss": 0.013110597617924213, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1110 }, { "epoch": 0.7942806076854334, "grad_norm": 0.01990175060927868, "learning_rate": 9.894546189297148e-05, "loss": 0.022, "step": 1111 }, { "epoch": 0.7949955317247542, "grad_norm": 0.019982697442173958, "learning_rate": 9.894035474387719e-05, "loss": 0.0258, "step": 1112 }, { "epoch": 0.7957104557640751, "grad_norm": 0.019178247079253197, "learning_rate": 9.893523539007248e-05, "loss": 0.0214, "step": 1113 }, { "epoch": 0.7964253798033959, "grad_norm": 0.01770785264670849, "learning_rate": 9.893010383283403e-05, "loss": 0.0155, "step": 1114 }, { "epoch": 0.7971403038427167, "grad_norm": 0.015462000854313374, "learning_rate": 9.892496007344154e-05, "loss": 0.0183, "step": 1115 }, { "epoch": 0.7971403038427167, "eval_loss": 0.013326255604624748, "eval_runtime": 4.5845, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1115 }, { "epoch": 0.7978552278820376, "grad_norm": 0.019994208589196205, "learning_rate": 9.891980411317774e-05, "loss": 0.0154, "step": 1116 }, { "epoch": 0.7985701519213584, "grad_norm": 0.013033030554652214, "learning_rate": 9.891463595332843e-05, "loss": 0.0163, "step": 1117 }, { "epoch": 0.7992850759606792, "grad_norm": 0.018755175173282623, "learning_rate": 9.890945559518247e-05, "loss": 0.0175, "step": 1118 }, { "epoch": 0.8, "grad_norm": 0.015444181859493256, "learning_rate": 
9.890426304003169e-05, "loss": 0.0199, "step": 1119 }, { "epoch": 0.8007149240393209, "grad_norm": 0.016540994867682457, "learning_rate": 9.889905828917103e-05, "loss": 0.0213, "step": 1120 }, { "epoch": 0.8007149240393209, "eval_loss": 0.013161853887140751, "eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 1120 }, { "epoch": 0.8014298480786416, "grad_norm": 0.014170285314321518, "learning_rate": 9.889384134389844e-05, "loss": 0.0135, "step": 1121 }, { "epoch": 0.8021447721179624, "grad_norm": 0.015628360211849213, "learning_rate": 9.888861220551493e-05, "loss": 0.0149, "step": 1122 }, { "epoch": 0.8028596961572833, "grad_norm": 0.012522336095571518, "learning_rate": 9.88833708753245e-05, "loss": 0.014, "step": 1123 }, { "epoch": 0.8035746201966041, "grad_norm": 0.015576213598251343, "learning_rate": 9.88781173546343e-05, "loss": 0.0282, "step": 1124 }, { "epoch": 0.8042895442359249, "grad_norm": 0.014803001657128334, "learning_rate": 9.887285164475438e-05, "loss": 0.0153, "step": 1125 }, { "epoch": 0.8042895442359249, "eval_loss": 0.013174030929803848, "eval_runtime": 4.6032, "eval_samples_per_second": 10.862, "eval_steps_per_second": 2.824, "step": 1125 }, { "epoch": 0.8050044682752457, "grad_norm": 0.015012080781161785, "learning_rate": 9.886757374699792e-05, "loss": 0.0127, "step": 1126 }, { "epoch": 0.8057193923145666, "grad_norm": 0.014878165908157825, "learning_rate": 9.886228366268113e-05, "loss": 0.0181, "step": 1127 }, { "epoch": 0.8064343163538874, "grad_norm": 0.01872696541249752, "learning_rate": 9.885698139312325e-05, "loss": 0.0192, "step": 1128 }, { "epoch": 0.8071492403932082, "grad_norm": 0.02491520531475544, "learning_rate": 9.885166693964654e-05, "loss": 0.013, "step": 1129 }, { "epoch": 0.8078641644325291, "grad_norm": 0.01643911749124527, "learning_rate": 9.884634030357633e-05, "loss": 0.0152, "step": 1130 }, { "epoch": 0.8078641644325291, "eval_loss": 0.013372992165386677, "eval_runtime": 4.586, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1130 }, { "epoch": 0.8085790884718499, "grad_norm": 0.016015231609344482, "learning_rate": 9.884100148624096e-05, "loss": 0.017, "step": 1131 }, { "epoch": 0.8092940125111707, "grad_norm": 0.01695406623184681, "learning_rate": 9.883565048897184e-05, "loss": 0.0154, "step": 1132 }, { "epoch": 0.8100089365504916, "grad_norm": 0.020747680217027664, "learning_rate": 9.883028731310335e-05, "loss": 0.0199, "step": 1133 }, { "epoch": 0.8107238605898124, "grad_norm": 0.017813468351960182, "learning_rate": 9.8824911959973e-05, "loss": 0.0174, "step": 1134 }, { "epoch": 0.8114387846291331, "grad_norm": 0.016800599172711372, "learning_rate": 9.88195244309213e-05, "loss": 0.0226, "step": 1135 }, { "epoch": 0.8114387846291331, "eval_loss": 0.013069902546703815, "eval_runtime": 4.595, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 1135 }, { "epoch": 0.8121537086684539, "grad_norm": 0.017410941421985626, "learning_rate": 9.881412472729175e-05, "loss": 0.0176, "step": 1136 }, { "epoch": 0.8128686327077748, "grad_norm": 0.015332773327827454, "learning_rate": 9.880871285043094e-05, "loss": 0.0123, "step": 1137 }, { "epoch": 0.8135835567470956, "grad_norm": 0.018235869705677032, "learning_rate": 9.880328880168849e-05, "loss": 0.0199, "step": 1138 }, { "epoch": 0.8142984807864164, "grad_norm": 0.015194379724562168, "learning_rate": 9.879785258241706e-05, "loss": 0.0147, "step": 1139 }, { "epoch": 0.8150134048257373, "grad_norm": 
0.018128421157598495, "learning_rate": 9.879240419397226e-05, "loss": 0.0232, "step": 1140 }, { "epoch": 0.8150134048257373, "eval_loss": 0.012914870865643024, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 1140 }, { "epoch": 0.8157283288650581, "grad_norm": 0.01793074980378151, "learning_rate": 9.878694363771288e-05, "loss": 0.0279, "step": 1141 }, { "epoch": 0.8164432529043789, "grad_norm": 0.01896866038441658, "learning_rate": 9.878147091500065e-05, "loss": 0.02, "step": 1142 }, { "epoch": 0.8171581769436997, "grad_norm": 0.01430155523121357, "learning_rate": 9.877598602720033e-05, "loss": 0.0135, "step": 1143 }, { "epoch": 0.8178731009830206, "grad_norm": 0.0192506555467844, "learning_rate": 9.877048897567975e-05, "loss": 0.0189, "step": 1144 }, { "epoch": 0.8185880250223414, "grad_norm": 0.015980703756213188, "learning_rate": 9.876497976180979e-05, "loss": 0.0112, "step": 1145 }, { "epoch": 0.8185880250223414, "eval_loss": 0.012754159048199654, "eval_runtime": 4.6083, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 1145 }, { "epoch": 0.8193029490616622, "grad_norm": 0.011213653720915318, "learning_rate": 9.875945838696429e-05, "loss": 0.0099, "step": 1146 }, { "epoch": 0.8200178731009831, "grad_norm": 0.017215635627508163, "learning_rate": 9.875392485252019e-05, "loss": 0.0179, "step": 1147 }, { "epoch": 0.8207327971403039, "grad_norm": 0.014757749624550343, "learning_rate": 9.874837915985741e-05, "loss": 0.0143, "step": 1148 }, { "epoch": 0.8214477211796246, "grad_norm": 0.01666787639260292, "learning_rate": 9.8742821310359e-05, "loss": 0.0235, "step": 1149 }, { "epoch": 0.8221626452189454, "grad_norm": 0.015065059065818787, "learning_rate": 9.873725130541089e-05, "loss": 0.0158, "step": 1150 }, { "epoch": 0.8221626452189454, "eval_loss": 0.012948859483003616, "eval_runtime": 4.6205, "eval_samples_per_second": 10.821, "eval_steps_per_second": 2.814, "step": 1150 }, { "epoch": 0.8228775692582663, "grad_norm": 0.01628118008375168, "learning_rate": 9.873166914640217e-05, "loss": 0.0109, "step": 1151 }, { "epoch": 0.8235924932975871, "grad_norm": 0.015529094263911247, "learning_rate": 9.872607483472491e-05, "loss": 0.0202, "step": 1152 }, { "epoch": 0.8243074173369079, "grad_norm": 0.014502808451652527, "learning_rate": 9.872046837177422e-05, "loss": 0.0184, "step": 1153 }, { "epoch": 0.8250223413762288, "grad_norm": 0.01686498336493969, "learning_rate": 9.87148497589482e-05, "loss": 0.0167, "step": 1154 }, { "epoch": 0.8257372654155496, "grad_norm": 0.014175710268318653, "learning_rate": 9.870921899764807e-05, "loss": 0.0151, "step": 1155 }, { "epoch": 0.8257372654155496, "eval_loss": 0.013245487585663795, "eval_runtime": 4.5934, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 1155 }, { "epoch": 0.8264521894548704, "grad_norm": 0.0165728572756052, "learning_rate": 9.870357608927799e-05, "loss": 0.0187, "step": 1156 }, { "epoch": 0.8271671134941913, "grad_norm": 0.020035745576024055, "learning_rate": 9.869792103524518e-05, "loss": 0.0169, "step": 1157 }, { "epoch": 0.8278820375335121, "grad_norm": 0.01765435002744198, "learning_rate": 9.86922538369599e-05, "loss": 0.0195, "step": 1158 }, { "epoch": 0.8285969615728329, "grad_norm": 0.01660754531621933, "learning_rate": 9.868657449583546e-05, "loss": 0.0168, "step": 1159 }, { "epoch": 0.8293118856121537, "grad_norm": 0.015591271221637726, "learning_rate": 9.868088301328812e-05, "loss": 0.0132, "step": 1160 }, { "epoch": 
0.8293118856121537, "eval_loss": 0.013407688587903976, "eval_runtime": 4.6444, "eval_samples_per_second": 10.766, "eval_steps_per_second": 2.799, "step": 1160 }, { "epoch": 0.8300268096514746, "grad_norm": 0.0199967622756958, "learning_rate": 9.867517939073728e-05, "loss": 0.0242, "step": 1161 }, { "epoch": 0.8307417336907954, "grad_norm": 0.013963688164949417, "learning_rate": 9.866946362960525e-05, "loss": 0.0157, "step": 1162 }, { "epoch": 0.8314566577301161, "grad_norm": 0.014425266534090042, "learning_rate": 9.866373573131744e-05, "loss": 0.0166, "step": 1163 }, { "epoch": 0.832171581769437, "grad_norm": 0.017364637926220894, "learning_rate": 9.865799569730228e-05, "loss": 0.0144, "step": 1164 }, { "epoch": 0.8328865058087578, "grad_norm": 0.018696824088692665, "learning_rate": 9.865224352899119e-05, "loss": 0.0277, "step": 1165 }, { "epoch": 0.8328865058087578, "eval_loss": 0.0130561338737607, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 1165 }, { "epoch": 0.8336014298480786, "grad_norm": 0.014379307627677917, "learning_rate": 9.864647922781867e-05, "loss": 0.0135, "step": 1166 }, { "epoch": 0.8343163538873994, "grad_norm": 0.020553480833768845, "learning_rate": 9.864070279522222e-05, "loss": 0.0178, "step": 1167 }, { "epoch": 0.8350312779267203, "grad_norm": 0.020927799865603447, "learning_rate": 9.863491423264233e-05, "loss": 0.0198, "step": 1168 }, { "epoch": 0.8357462019660411, "grad_norm": 0.016653938218951225, "learning_rate": 9.862911354152257e-05, "loss": 0.0163, "step": 1169 }, { "epoch": 0.8364611260053619, "grad_norm": 0.013644501566886902, "learning_rate": 9.862330072330952e-05, "loss": 0.0145, "step": 1170 }, { "epoch": 0.8364611260053619, "eval_loss": 0.013134140521287918, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 1170 }, { "epoch": 0.8371760500446828, "grad_norm": 0.02110767923295498, "learning_rate": 9.861747577945275e-05, "loss": 0.0152, "step": 1171 }, { "epoch": 0.8378909740840036, "grad_norm": 0.015295229852199554, "learning_rate": 9.86116387114049e-05, "loss": 0.012, "step": 1172 }, { "epoch": 0.8386058981233244, "grad_norm": 0.014971633441746235, "learning_rate": 9.86057895206216e-05, "loss": 0.0178, "step": 1173 }, { "epoch": 0.8393208221626453, "grad_norm": 0.019601067528128624, "learning_rate": 9.859992820856155e-05, "loss": 0.0151, "step": 1174 }, { "epoch": 0.8400357462019661, "grad_norm": 0.015777163207530975, "learning_rate": 9.85940547766864e-05, "loss": 0.0143, "step": 1175 }, { "epoch": 0.8400357462019661, "eval_loss": 0.012908750213682652, "eval_runtime": 4.5796, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 1175 }, { "epoch": 0.8407506702412869, "grad_norm": 0.02289997600018978, "learning_rate": 9.858816922646087e-05, "loss": 0.0206, "step": 1176 }, { "epoch": 0.8414655942806076, "grad_norm": 0.01767389476299286, "learning_rate": 9.858227155935271e-05, "loss": 0.0214, "step": 1177 }, { "epoch": 0.8421805183199285, "grad_norm": 0.021135015413165092, "learning_rate": 9.857636177683267e-05, "loss": 0.0157, "step": 1178 }, { "epoch": 0.8428954423592493, "grad_norm": 0.022501885890960693, "learning_rate": 9.857043988037453e-05, "loss": 0.0222, "step": 1179 }, { "epoch": 0.8436103663985701, "grad_norm": 0.01853802055120468, "learning_rate": 9.856450587145508e-05, "loss": 0.0167, "step": 1180 }, { "epoch": 0.8436103663985701, "eval_loss": 0.013267473317682743, "eval_runtime": 4.6085, "eval_samples_per_second": 10.85, 
"eval_steps_per_second": 2.821, "step": 1180 }, { "epoch": 0.844325290437891, "grad_norm": 0.016138358041644096, "learning_rate": 9.855855975155412e-05, "loss": 0.0169, "step": 1181 }, { "epoch": 0.8450402144772118, "grad_norm": 0.01742742396891117, "learning_rate": 9.855260152215454e-05, "loss": 0.0143, "step": 1182 }, { "epoch": 0.8457551385165326, "grad_norm": 0.01472804881632328, "learning_rate": 9.854663118474217e-05, "loss": 0.0108, "step": 1183 }, { "epoch": 0.8464700625558534, "grad_norm": 0.01440014224499464, "learning_rate": 9.854064874080588e-05, "loss": 0.0187, "step": 1184 }, { "epoch": 0.8471849865951743, "grad_norm": 0.0144980913028121, "learning_rate": 9.853465419183759e-05, "loss": 0.0174, "step": 1185 }, { "epoch": 0.8471849865951743, "eval_loss": 0.01345520094037056, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1185 }, { "epoch": 0.8478999106344951, "grad_norm": 0.020818186923861504, "learning_rate": 9.852864753933218e-05, "loss": 0.0197, "step": 1186 }, { "epoch": 0.8486148346738159, "grad_norm": 0.017506172880530357, "learning_rate": 9.852262878478761e-05, "loss": 0.021, "step": 1187 }, { "epoch": 0.8493297587131368, "grad_norm": 0.017795799300074577, "learning_rate": 9.851659792970484e-05, "loss": 0.0156, "step": 1188 }, { "epoch": 0.8500446827524576, "grad_norm": 0.014222699217498302, "learning_rate": 9.851055497558783e-05, "loss": 0.0126, "step": 1189 }, { "epoch": 0.8507596067917784, "grad_norm": 0.016292179003357887, "learning_rate": 9.850449992394357e-05, "loss": 0.0189, "step": 1190 }, { "epoch": 0.8507596067917784, "eval_loss": 0.013417894020676613, "eval_runtime": 4.5892, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 1190 }, { "epoch": 0.8514745308310991, "grad_norm": 0.01913430169224739, "learning_rate": 9.849843277628206e-05, "loss": 0.0226, "step": 1191 }, { "epoch": 0.85218945487042, "grad_norm": 0.013465292751789093, "learning_rate": 9.849235353411632e-05, "loss": 0.0113, "step": 1192 }, { "epoch": 0.8529043789097408, "grad_norm": 0.01708909124135971, "learning_rate": 9.84862621989624e-05, "loss": 0.0198, "step": 1193 }, { "epoch": 0.8536193029490616, "grad_norm": 0.015194946900010109, "learning_rate": 9.848015877233934e-05, "loss": 0.0215, "step": 1194 }, { "epoch": 0.8543342269883825, "grad_norm": 0.016921667382121086, "learning_rate": 9.847404325576921e-05, "loss": 0.0148, "step": 1195 }, { "epoch": 0.8543342269883825, "eval_loss": 0.013260171748697758, "eval_runtime": 4.5951, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 1195 }, { "epoch": 0.8550491510277033, "grad_norm": 0.01788436248898506, "learning_rate": 9.846791565077709e-05, "loss": 0.0196, "step": 1196 }, { "epoch": 0.8557640750670241, "grad_norm": 0.013895484618842602, "learning_rate": 9.846177595889109e-05, "loss": 0.0122, "step": 1197 }, { "epoch": 0.856478999106345, "grad_norm": 0.017512673512101173, "learning_rate": 9.845562418164232e-05, "loss": 0.0162, "step": 1198 }, { "epoch": 0.8571939231456658, "grad_norm": 0.016499364748597145, "learning_rate": 9.844946032056487e-05, "loss": 0.0204, "step": 1199 }, { "epoch": 0.8579088471849866, "grad_norm": 0.018832750618457794, "learning_rate": 9.844328437719594e-05, "loss": 0.0207, "step": 1200 }, { "epoch": 0.8579088471849866, "eval_loss": 0.013223315589129925, "eval_runtime": 4.5928, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 1200 }, { "epoch": 0.8586237712243074, "grad_norm": 0.013194214552640915, 
"learning_rate": 9.843709635307563e-05, "loss": 0.0144, "step": 1201 }, { "epoch": 0.8593386952636283, "grad_norm": 0.017569920048117638, "learning_rate": 9.843089624974715e-05, "loss": 0.0185, "step": 1202 }, { "epoch": 0.8600536193029491, "grad_norm": 0.015556905418634415, "learning_rate": 9.842468406875663e-05, "loss": 0.0184, "step": 1203 }, { "epoch": 0.8607685433422699, "grad_norm": 0.01998225413262844, "learning_rate": 9.841845981165329e-05, "loss": 0.0284, "step": 1204 }, { "epoch": 0.8614834673815907, "grad_norm": 0.016948221251368523, "learning_rate": 9.841222347998933e-05, "loss": 0.0193, "step": 1205 }, { "epoch": 0.8614834673815907, "eval_loss": 0.013377540744841099, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1205 }, { "epoch": 0.8621983914209115, "grad_norm": 0.016791630536317825, "learning_rate": 9.840597507531997e-05, "loss": 0.0142, "step": 1206 }, { "epoch": 0.8629133154602323, "grad_norm": 0.019164733588695526, "learning_rate": 9.839971459920338e-05, "loss": 0.0183, "step": 1207 }, { "epoch": 0.8636282394995531, "grad_norm": 0.017145274206995964, "learning_rate": 9.839344205320087e-05, "loss": 0.0165, "step": 1208 }, { "epoch": 0.864343163538874, "grad_norm": 0.01810084655880928, "learning_rate": 9.838715743887662e-05, "loss": 0.0214, "step": 1209 }, { "epoch": 0.8650580875781948, "grad_norm": 0.01597752422094345, "learning_rate": 9.838086075779791e-05, "loss": 0.0148, "step": 1210 }, { "epoch": 0.8650580875781948, "eval_loss": 0.013448036275804043, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 1210 }, { "epoch": 0.8657730116175156, "grad_norm": 0.015885191038250923, "learning_rate": 9.837455201153501e-05, "loss": 0.0139, "step": 1211 }, { "epoch": 0.8664879356568365, "grad_norm": 0.017858343198895454, "learning_rate": 9.836823120166116e-05, "loss": 0.0185, "step": 1212 }, { "epoch": 0.8672028596961573, "grad_norm": 0.020826734602451324, "learning_rate": 9.836189832975268e-05, "loss": 0.0213, "step": 1213 }, { "epoch": 0.8679177837354781, "grad_norm": 0.0165763720870018, "learning_rate": 9.835555339738882e-05, "loss": 0.0166, "step": 1214 }, { "epoch": 0.868632707774799, "grad_norm": 0.014637845568358898, "learning_rate": 9.83491964061519e-05, "loss": 0.0167, "step": 1215 }, { "epoch": 0.868632707774799, "eval_loss": 0.013437869027256966, "eval_runtime": 4.5978, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.827, "step": 1215 }, { "epoch": 0.8693476318141198, "grad_norm": 0.015151319094002247, "learning_rate": 9.834282735762721e-05, "loss": 0.0163, "step": 1216 }, { "epoch": 0.8700625558534406, "grad_norm": 0.01661631464958191, "learning_rate": 9.833644625340306e-05, "loss": 0.0222, "step": 1217 }, { "epoch": 0.8707774798927614, "grad_norm": 0.01849173940718174, "learning_rate": 9.833005309507077e-05, "loss": 0.014, "step": 1218 }, { "epoch": 0.8714924039320823, "grad_norm": 0.01480170153081417, "learning_rate": 9.832364788422464e-05, "loss": 0.0143, "step": 1219 }, { "epoch": 0.872207327971403, "grad_norm": 0.01379022840410471, "learning_rate": 9.831723062246203e-05, "loss": 0.0154, "step": 1220 }, { "epoch": 0.872207327971403, "eval_loss": 0.013411223888397217, "eval_runtime": 4.6004, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 1220 }, { "epoch": 0.8729222520107238, "grad_norm": 0.01828802563250065, "learning_rate": 9.831080131138325e-05, "loss": 0.0228, "step": 1221 }, { "epoch": 0.8736371760500447, "grad_norm": 
0.020180990919470787, "learning_rate": 9.830435995259164e-05, "loss": 0.0185, "step": 1222 }, { "epoch": 0.8743521000893655, "grad_norm": 0.017111308872699738, "learning_rate": 9.829790654769355e-05, "loss": 0.0218, "step": 1223 }, { "epoch": 0.8750670241286863, "grad_norm": 0.017275040969252586, "learning_rate": 9.829144109829832e-05, "loss": 0.0147, "step": 1224 }, { "epoch": 0.8757819481680071, "grad_norm": 0.02119617350399494, "learning_rate": 9.82849636060183e-05, "loss": 0.0235, "step": 1225 }, { "epoch": 0.8757819481680071, "eval_loss": 0.013308217748999596, "eval_runtime": 4.5929, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1225 }, { "epoch": 0.876496872207328, "grad_norm": 0.015948539599776268, "learning_rate": 9.827847407246885e-05, "loss": 0.0198, "step": 1226 }, { "epoch": 0.8772117962466488, "grad_norm": 0.013654090464115143, "learning_rate": 9.82719724992683e-05, "loss": 0.0202, "step": 1227 }, { "epoch": 0.8779267202859696, "grad_norm": 0.01647047884762287, "learning_rate": 9.826545888803802e-05, "loss": 0.0137, "step": 1228 }, { "epoch": 0.8786416443252905, "grad_norm": 0.019747061654925346, "learning_rate": 9.825893324040239e-05, "loss": 0.0162, "step": 1229 }, { "epoch": 0.8793565683646113, "grad_norm": 0.016837041825056076, "learning_rate": 9.825239555798874e-05, "loss": 0.0126, "step": 1230 }, { "epoch": 0.8793565683646113, "eval_loss": 0.0129969771951437, "eval_runtime": 4.6014, "eval_samples_per_second": 10.866, "eval_steps_per_second": 2.825, "step": 1230 }, { "epoch": 0.8800714924039321, "grad_norm": 0.01726115308701992, "learning_rate": 9.824584584242746e-05, "loss": 0.0179, "step": 1231 }, { "epoch": 0.880786416443253, "grad_norm": 0.022714652121067047, "learning_rate": 9.82392840953519e-05, "loss": 0.0229, "step": 1232 }, { "epoch": 0.8815013404825738, "grad_norm": 0.01539556309580803, "learning_rate": 9.823271031839842e-05, "loss": 0.018, "step": 1233 }, { "epoch": 0.8822162645218945, "grad_norm": 0.019126055762171745, "learning_rate": 9.822612451320639e-05, "loss": 0.0154, "step": 1234 }, { "epoch": 0.8829311885612153, "grad_norm": 0.013105323538184166, "learning_rate": 9.821952668141817e-05, "loss": 0.0097, "step": 1235 }, { "epoch": 0.8829311885612153, "eval_loss": 0.0129700917750597, "eval_runtime": 4.5895, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 1235 }, { "epoch": 0.8836461126005362, "grad_norm": 0.01699787750840187, "learning_rate": 9.821291682467912e-05, "loss": 0.0183, "step": 1236 }, { "epoch": 0.884361036639857, "grad_norm": 0.0187484472990036, "learning_rate": 9.820629494463762e-05, "loss": 0.0224, "step": 1237 }, { "epoch": 0.8850759606791778, "grad_norm": 0.015566004440188408, "learning_rate": 9.8199661042945e-05, "loss": 0.0162, "step": 1238 }, { "epoch": 0.8857908847184987, "grad_norm": 0.014897791668772697, "learning_rate": 9.819301512125565e-05, "loss": 0.0148, "step": 1239 }, { "epoch": 0.8865058087578195, "grad_norm": 0.0173040721565485, "learning_rate": 9.818635718122691e-05, "loss": 0.0212, "step": 1240 }, { "epoch": 0.8865058087578195, "eval_loss": 0.012411043979227543, "eval_runtime": 4.6018, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 1240 }, { "epoch": 0.8872207327971403, "grad_norm": 0.01746375672519207, "learning_rate": 9.817968722451911e-05, "loss": 0.0155, "step": 1241 }, { "epoch": 0.8879356568364611, "grad_norm": 0.013922331854701042, "learning_rate": 9.817300525279562e-05, "loss": 0.0147, "step": 1242 }, { "epoch": 0.888650580875782, 
"grad_norm": 0.01479959674179554, "learning_rate": 9.81663112677228e-05, "loss": 0.0193, "step": 1243 }, { "epoch": 0.8893655049151028, "grad_norm": 0.015101766213774681, "learning_rate": 9.815960527096996e-05, "loss": 0.0186, "step": 1244 }, { "epoch": 0.8900804289544236, "grad_norm": 0.013870811089873314, "learning_rate": 9.815288726420948e-05, "loss": 0.0212, "step": 1245 }, { "epoch": 0.8900804289544236, "eval_loss": 0.012881721369922161, "eval_runtime": 4.5861, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1245 }, { "epoch": 0.8907953529937445, "grad_norm": 0.016681673005223274, "learning_rate": 9.814615724911665e-05, "loss": 0.0173, "step": 1246 }, { "epoch": 0.8915102770330653, "grad_norm": 0.012372423894703388, "learning_rate": 9.81394152273698e-05, "loss": 0.0122, "step": 1247 }, { "epoch": 0.892225201072386, "grad_norm": 0.018017999827861786, "learning_rate": 9.813266120065028e-05, "loss": 0.0145, "step": 1248 }, { "epoch": 0.8929401251117068, "grad_norm": 0.01750170812010765, "learning_rate": 9.812589517064236e-05, "loss": 0.0117, "step": 1249 }, { "epoch": 0.8936550491510277, "grad_norm": 0.012318171560764313, "learning_rate": 9.811911713903339e-05, "loss": 0.0118, "step": 1250 }, { "epoch": 0.8936550491510277, "eval_loss": 0.013212180696427822, "eval_runtime": 4.5891, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 1250 }, { "epoch": 0.8943699731903485, "grad_norm": 0.02341502346098423, "learning_rate": 9.811232710751365e-05, "loss": 0.0208, "step": 1251 }, { "epoch": 0.8950848972296693, "grad_norm": 0.019531317055225372, "learning_rate": 9.810552507777643e-05, "loss": 0.0137, "step": 1252 }, { "epoch": 0.8957998212689902, "grad_norm": 0.0151118915528059, "learning_rate": 9.809871105151805e-05, "loss": 0.0168, "step": 1253 }, { "epoch": 0.896514745308311, "grad_norm": 0.019991623237729073, "learning_rate": 9.809188503043774e-05, "loss": 0.0264, "step": 1254 }, { "epoch": 0.8972296693476318, "grad_norm": 0.017251526936888695, "learning_rate": 9.808504701623778e-05, "loss": 0.0168, "step": 1255 }, { "epoch": 0.8972296693476318, "eval_loss": 0.013003915548324585, "eval_runtime": 4.6114, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 1255 }, { "epoch": 0.8979445933869526, "grad_norm": 0.01665487140417099, "learning_rate": 9.807819701062344e-05, "loss": 0.0216, "step": 1256 }, { "epoch": 0.8986595174262735, "grad_norm": 0.01606437750160694, "learning_rate": 9.807133501530296e-05, "loss": 0.023, "step": 1257 }, { "epoch": 0.8993744414655943, "grad_norm": 0.016007443889975548, "learning_rate": 9.806446103198761e-05, "loss": 0.0147, "step": 1258 }, { "epoch": 0.9000893655049151, "grad_norm": 0.018504707142710686, "learning_rate": 9.805757506239157e-05, "loss": 0.0132, "step": 1259 }, { "epoch": 0.900804289544236, "grad_norm": 0.017411774024367332, "learning_rate": 9.80506771082321e-05, "loss": 0.016, "step": 1260 }, { "epoch": 0.900804289544236, "eval_loss": 0.012881890870630741, "eval_runtime": 4.5911, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 1260 }, { "epoch": 0.9015192135835568, "grad_norm": 0.016499711200594902, "learning_rate": 9.804376717122939e-05, "loss": 0.0228, "step": 1261 }, { "epoch": 0.9022341376228775, "grad_norm": 0.013530652038753033, "learning_rate": 9.803684525310661e-05, "loss": 0.0109, "step": 1262 }, { "epoch": 0.9029490616621983, "grad_norm": 0.01475375983864069, "learning_rate": 9.802991135558999e-05, "loss": 0.0169, "step": 1263 }, { "epoch": 
0.9036639857015192, "grad_norm": 0.016262995079159737, "learning_rate": 9.802296548040868e-05, "loss": 0.0216, "step": 1264 }, { "epoch": 0.90437890974084, "grad_norm": 0.020124206319451332, "learning_rate": 9.801600762929481e-05, "loss": 0.0179, "step": 1265 }, { "epoch": 0.90437890974084, "eval_loss": 0.01291989628225565, "eval_runtime": 4.6136, "eval_samples_per_second": 10.837, "eval_steps_per_second": 2.818, "step": 1265 }, { "epoch": 0.9050938337801608, "grad_norm": 0.01884431578218937, "learning_rate": 9.800903780398357e-05, "loss": 0.0227, "step": 1266 }, { "epoch": 0.9058087578194817, "grad_norm": 0.01509191282093525, "learning_rate": 9.800205600621306e-05, "loss": 0.018, "step": 1267 }, { "epoch": 0.9065236818588025, "grad_norm": 0.01858326978981495, "learning_rate": 9.799506223772441e-05, "loss": 0.0205, "step": 1268 }, { "epoch": 0.9072386058981233, "grad_norm": 0.01501715648919344, "learning_rate": 9.798805650026173e-05, "loss": 0.0142, "step": 1269 }, { "epoch": 0.9079535299374442, "grad_norm": 0.016340190544724464, "learning_rate": 9.798103879557207e-05, "loss": 0.0181, "step": 1270 }, { "epoch": 0.9079535299374442, "eval_loss": 0.013258466497063637, "eval_runtime": 4.5929, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1270 }, { "epoch": 0.908668453976765, "grad_norm": 0.016513215377926826, "learning_rate": 9.797400912540553e-05, "loss": 0.0123, "step": 1271 }, { "epoch": 0.9093833780160858, "grad_norm": 0.014735297299921513, "learning_rate": 9.796696749151516e-05, "loss": 0.0131, "step": 1272 }, { "epoch": 0.9100983020554066, "grad_norm": 0.01801518350839615, "learning_rate": 9.795991389565697e-05, "loss": 0.0136, "step": 1273 }, { "epoch": 0.9108132260947275, "grad_norm": 0.013725166209042072, "learning_rate": 9.795284833959002e-05, "loss": 0.014, "step": 1274 }, { "epoch": 0.9115281501340483, "grad_norm": 0.02017209120094776, "learning_rate": 9.79457708250763e-05, "loss": 0.0182, "step": 1275 }, { "epoch": 0.9115281501340483, "eval_loss": 0.01280316710472107, "eval_runtime": 4.613, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.818, "step": 1275 }, { "epoch": 0.912243074173369, "grad_norm": 0.01659076102077961, "learning_rate": 9.79386813538808e-05, "loss": 0.011, "step": 1276 }, { "epoch": 0.9129579982126899, "grad_norm": 0.017437756061553955, "learning_rate": 9.793157992777146e-05, "loss": 0.0208, "step": 1277 }, { "epoch": 0.9136729222520107, "grad_norm": 0.0201569814234972, "learning_rate": 9.792446654851927e-05, "loss": 0.0152, "step": 1278 }, { "epoch": 0.9143878462913315, "grad_norm": 0.017390333116054535, "learning_rate": 9.791734121789813e-05, "loss": 0.0193, "step": 1279 }, { "epoch": 0.9151027703306523, "grad_norm": 0.013607950881123543, "learning_rate": 9.791020393768495e-05, "loss": 0.0104, "step": 1280 }, { "epoch": 0.9151027703306523, "eval_loss": 0.013112986460328102, "eval_runtime": 4.5982, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 1280 }, { "epoch": 0.9158176943699732, "grad_norm": 0.01759437285363674, "learning_rate": 9.790305470965964e-05, "loss": 0.0143, "step": 1281 }, { "epoch": 0.916532618409294, "grad_norm": 0.01919795200228691, "learning_rate": 9.789589353560504e-05, "loss": 0.0228, "step": 1282 }, { "epoch": 0.9172475424486148, "grad_norm": 0.01979047618806362, "learning_rate": 9.788872041730702e-05, "loss": 0.0122, "step": 1283 }, { "epoch": 0.9179624664879357, "grad_norm": 0.021483000367879868, "learning_rate": 9.788153535655441e-05, "loss": 0.016, "step": 1284 }, { 
"epoch": 0.9186773905272565, "grad_norm": 0.017526909708976746, "learning_rate": 9.787433835513901e-05, "loss": 0.0165, "step": 1285 }, { "epoch": 0.9186773905272565, "eval_loss": 0.012857168912887573, "eval_runtime": 4.5906, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 1285 }, { "epoch": 0.9193923145665773, "grad_norm": 0.018127351999282837, "learning_rate": 9.786712941485558e-05, "loss": 0.0138, "step": 1286 }, { "epoch": 0.9201072386058982, "grad_norm": 0.01464176271110773, "learning_rate": 9.785990853750193e-05, "loss": 0.0113, "step": 1287 }, { "epoch": 0.920822162645219, "grad_norm": 0.01758602075278759, "learning_rate": 9.785267572487875e-05, "loss": 0.0139, "step": 1288 }, { "epoch": 0.9215370866845398, "grad_norm": 0.014918595552444458, "learning_rate": 9.784543097878977e-05, "loss": 0.0183, "step": 1289 }, { "epoch": 0.9222520107238605, "grad_norm": 0.017024129629135132, "learning_rate": 9.783817430104169e-05, "loss": 0.0175, "step": 1290 }, { "epoch": 0.9222520107238605, "eval_loss": 0.013115543872117996, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 1290 }, { "epoch": 0.9229669347631814, "grad_norm": 0.016920577734708786, "learning_rate": 9.783090569344417e-05, "loss": 0.0112, "step": 1291 }, { "epoch": 0.9236818588025022, "grad_norm": 0.015683166682720184, "learning_rate": 9.782362515780983e-05, "loss": 0.016, "step": 1292 }, { "epoch": 0.924396782841823, "grad_norm": 0.01833806000649929, "learning_rate": 9.781633269595431e-05, "loss": 0.0151, "step": 1293 }, { "epoch": 0.9251117068811439, "grad_norm": 0.021099021658301353, "learning_rate": 9.780902830969619e-05, "loss": 0.0169, "step": 1294 }, { "epoch": 0.9258266309204647, "grad_norm": 0.012118694372475147, "learning_rate": 9.780171200085704e-05, "loss": 0.0184, "step": 1295 }, { "epoch": 0.9258266309204647, "eval_loss": 0.0128885917365551, "eval_runtime": 4.6166, "eval_samples_per_second": 10.83, "eval_steps_per_second": 2.816, "step": 1295 }, { "epoch": 0.9265415549597855, "grad_norm": 0.018263380974531174, "learning_rate": 9.779438377126137e-05, "loss": 0.016, "step": 1296 }, { "epoch": 0.9272564789991063, "grad_norm": 0.01789204403758049, "learning_rate": 9.778704362273673e-05, "loss": 0.0201, "step": 1297 }, { "epoch": 0.9279714030384272, "grad_norm": 0.017316849902272224, "learning_rate": 9.777969155711356e-05, "loss": 0.0115, "step": 1298 }, { "epoch": 0.928686327077748, "grad_norm": 0.019348422065377235, "learning_rate": 9.777232757622534e-05, "loss": 0.027, "step": 1299 }, { "epoch": 0.9294012511170688, "grad_norm": 0.019310900941491127, "learning_rate": 9.776495168190848e-05, "loss": 0.0261, "step": 1300 }, { "epoch": 0.9294012511170688, "eval_loss": 0.012848464772105217, "eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 1300 }, { "epoch": 0.9301161751563897, "grad_norm": 0.012809370644390583, "learning_rate": 9.775756387600239e-05, "loss": 0.013, "step": 1301 }, { "epoch": 0.9308310991957105, "grad_norm": 0.016938401386141777, "learning_rate": 9.775016416034941e-05, "loss": 0.0278, "step": 1302 }, { "epoch": 0.9315460232350313, "grad_norm": 0.020758697763085365, "learning_rate": 9.77427525367949e-05, "loss": 0.017, "step": 1303 }, { "epoch": 0.932260947274352, "grad_norm": 0.01596921868622303, "learning_rate": 9.773532900718716e-05, "loss": 0.0139, "step": 1304 }, { "epoch": 0.9329758713136729, "grad_norm": 0.017425954341888428, "learning_rate": 9.772789357337746e-05, "loss": 0.0198, 
"step": 1305 }, { "epoch": 0.9329758713136729, "eval_loss": 0.012322510592639446, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1305 }, { "epoch": 0.9336907953529937, "grad_norm": 0.015796778723597527, "learning_rate": 9.772044623722006e-05, "loss": 0.0241, "step": 1306 }, { "epoch": 0.9344057193923145, "grad_norm": 0.015051514841616154, "learning_rate": 9.771298700057214e-05, "loss": 0.0233, "step": 1307 }, { "epoch": 0.9351206434316354, "grad_norm": 0.013092206791043282, "learning_rate": 9.770551586529388e-05, "loss": 0.0111, "step": 1308 }, { "epoch": 0.9358355674709562, "grad_norm": 0.016229448840022087, "learning_rate": 9.769803283324846e-05, "loss": 0.0122, "step": 1309 }, { "epoch": 0.936550491510277, "grad_norm": 0.014611151069402695, "learning_rate": 9.769053790630199e-05, "loss": 0.0186, "step": 1310 }, { "epoch": 0.936550491510277, "eval_loss": 0.012424023821949959, "eval_runtime": 4.5854, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1310 }, { "epoch": 0.9372654155495979, "grad_norm": 0.018073545768857002, "learning_rate": 9.76830310863235e-05, "loss": 0.0156, "step": 1311 }, { "epoch": 0.9379803395889187, "grad_norm": 0.016361122950911522, "learning_rate": 9.767551237518509e-05, "loss": 0.0171, "step": 1312 }, { "epoch": 0.9386952636282395, "grad_norm": 0.017078639939427376, "learning_rate": 9.766798177476175e-05, "loss": 0.026, "step": 1313 }, { "epoch": 0.9394101876675603, "grad_norm": 0.016845649108290672, "learning_rate": 9.766043928693145e-05, "loss": 0.0206, "step": 1314 }, { "epoch": 0.9401251117068812, "grad_norm": 0.01470094546675682, "learning_rate": 9.765288491357513e-05, "loss": 0.0173, "step": 1315 }, { "epoch": 0.9401251117068812, "eval_loss": 0.01269129104912281, "eval_runtime": 4.5909, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 1315 }, { "epoch": 0.940840035746202, "grad_norm": 0.018358290195465088, "learning_rate": 9.764531865657672e-05, "loss": 0.0173, "step": 1316 }, { "epoch": 0.9415549597855228, "grad_norm": 0.012079339474439621, "learning_rate": 9.763774051782305e-05, "loss": 0.0134, "step": 1317 }, { "epoch": 0.9422698838248436, "grad_norm": 0.021367639303207397, "learning_rate": 9.763015049920397e-05, "loss": 0.0165, "step": 1318 }, { "epoch": 0.9429848078641644, "grad_norm": 0.015948576852679253, "learning_rate": 9.762254860261228e-05, "loss": 0.0179, "step": 1319 }, { "epoch": 0.9436997319034852, "grad_norm": 0.015145447105169296, "learning_rate": 9.761493482994372e-05, "loss": 0.0248, "step": 1320 }, { "epoch": 0.9436997319034852, "eval_loss": 0.012395847588777542, "eval_runtime": 4.5932, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1320 }, { "epoch": 0.944414655942806, "grad_norm": 0.01723407581448555, "learning_rate": 9.760730918309701e-05, "loss": 0.0165, "step": 1321 }, { "epoch": 0.9451295799821269, "grad_norm": 0.019361617043614388, "learning_rate": 9.759967166397385e-05, "loss": 0.0187, "step": 1322 }, { "epoch": 0.9458445040214477, "grad_norm": 0.01842077076435089, "learning_rate": 9.759202227447886e-05, "loss": 0.0139, "step": 1323 }, { "epoch": 0.9465594280607685, "grad_norm": 0.018709121271967888, "learning_rate": 9.758436101651964e-05, "loss": 0.0193, "step": 1324 }, { "epoch": 0.9472743521000894, "grad_norm": 0.016777481883764267, "learning_rate": 9.757668789200675e-05, "loss": 0.0197, "step": 1325 }, { "epoch": 0.9472743521000894, "eval_loss": 0.01261009369045496, "eval_runtime": 4.5857, 
"eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1325 }, { "epoch": 0.9479892761394102, "grad_norm": 0.015147408470511436, "learning_rate": 9.75690029028537e-05, "loss": 0.0163, "step": 1326 }, { "epoch": 0.948704200178731, "grad_norm": 0.017130548134446144, "learning_rate": 9.756130605097698e-05, "loss": 0.0215, "step": 1327 }, { "epoch": 0.9494191242180519, "grad_norm": 0.020249858498573303, "learning_rate": 9.755359733829603e-05, "loss": 0.0222, "step": 1328 }, { "epoch": 0.9501340482573727, "grad_norm": 0.017197923734784126, "learning_rate": 9.754587676673322e-05, "loss": 0.0196, "step": 1329 }, { "epoch": 0.9508489722966935, "grad_norm": 0.01321079209446907, "learning_rate": 9.753814433821392e-05, "loss": 0.0161, "step": 1330 }, { "epoch": 0.9508489722966935, "eval_loss": 0.012607271783053875, "eval_runtime": 4.5941, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 1330 }, { "epoch": 0.9515638963360143, "grad_norm": 0.013235106132924557, "learning_rate": 9.753040005466643e-05, "loss": 0.012, "step": 1331 }, { "epoch": 0.9522788203753351, "grad_norm": 0.017336487770080566, "learning_rate": 9.752264391802203e-05, "loss": 0.0113, "step": 1332 }, { "epoch": 0.9529937444146559, "grad_norm": 0.012607183307409286, "learning_rate": 9.75148759302149e-05, "loss": 0.0113, "step": 1333 }, { "epoch": 0.9537086684539767, "grad_norm": 0.015608541667461395, "learning_rate": 9.750709609318227e-05, "loss": 0.0184, "step": 1334 }, { "epoch": 0.9544235924932976, "grad_norm": 0.01746016927063465, "learning_rate": 9.749930440886423e-05, "loss": 0.0144, "step": 1335 }, { "epoch": 0.9544235924932976, "eval_loss": 0.012661738321185112, "eval_runtime": 4.6096, "eval_samples_per_second": 10.847, "eval_steps_per_second": 2.82, "step": 1335 }, { "epoch": 0.9551385165326184, "grad_norm": 0.01505228504538536, "learning_rate": 9.749150087920387e-05, "loss": 0.0202, "step": 1336 }, { "epoch": 0.9558534405719392, "grad_norm": 0.014959008432924747, "learning_rate": 9.748368550614724e-05, "loss": 0.0178, "step": 1337 }, { "epoch": 0.95656836461126, "grad_norm": 0.01650477759540081, "learning_rate": 9.747585829164332e-05, "loss": 0.0135, "step": 1338 }, { "epoch": 0.9572832886505809, "grad_norm": 0.016196785494685173, "learning_rate": 9.746801923764408e-05, "loss": 0.0166, "step": 1339 }, { "epoch": 0.9579982126899017, "grad_norm": 0.01849726215004921, "learning_rate": 9.746016834610438e-05, "loss": 0.0189, "step": 1340 }, { "epoch": 0.9579982126899017, "eval_loss": 0.012566771358251572, "eval_runtime": 4.6129, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.818, "step": 1340 }, { "epoch": 0.9587131367292225, "grad_norm": 0.015146259218454361, "learning_rate": 9.74523056189821e-05, "loss": 0.0157, "step": 1341 }, { "epoch": 0.9594280607685434, "grad_norm": 0.013381030410528183, "learning_rate": 9.744443105823801e-05, "loss": 0.0157, "step": 1342 }, { "epoch": 0.9601429848078642, "grad_norm": 0.016266796737909317, "learning_rate": 9.743654466583592e-05, "loss": 0.0182, "step": 1343 }, { "epoch": 0.960857908847185, "grad_norm": 0.0165368914604187, "learning_rate": 9.742864644374247e-05, "loss": 0.0129, "step": 1344 }, { "epoch": 0.9615728328865059, "grad_norm": 0.017238983884453773, "learning_rate": 9.742073639392733e-05, "loss": 0.0206, "step": 1345 }, { "epoch": 0.9615728328865059, "eval_loss": 0.012478298507630825, "eval_runtime": 4.607, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 1345 }, { "epoch": 0.9622877569258266, 
"grad_norm": 0.01487336028367281, "learning_rate": 9.741281451836313e-05, "loss": 0.0254, "step": 1346 }, { "epoch": 0.9630026809651474, "grad_norm": 0.014625550247728825, "learning_rate": 9.740488081902539e-05, "loss": 0.0188, "step": 1347 }, { "epoch": 0.9637176050044682, "grad_norm": 0.016524532809853554, "learning_rate": 9.739693529789263e-05, "loss": 0.0153, "step": 1348 }, { "epoch": 0.9644325290437891, "grad_norm": 0.01775636337697506, "learning_rate": 9.73889779569463e-05, "loss": 0.0173, "step": 1349 }, { "epoch": 0.9651474530831099, "grad_norm": 0.01865781843662262, "learning_rate": 9.738100879817076e-05, "loss": 0.0174, "step": 1350 }, { "epoch": 0.9651474530831099, "eval_loss": 0.01280543114989996, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 1350 }, { "epoch": 0.9658623771224307, "grad_norm": 0.01987210102379322, "learning_rate": 9.73730278235534e-05, "loss": 0.0188, "step": 1351 }, { "epoch": 0.9665773011617516, "grad_norm": 0.01862185448408127, "learning_rate": 9.73650350350845e-05, "loss": 0.0136, "step": 1352 }, { "epoch": 0.9672922252010724, "grad_norm": 0.01471274346113205, "learning_rate": 9.735703043475726e-05, "loss": 0.0154, "step": 1353 }, { "epoch": 0.9680071492403932, "grad_norm": 0.014311583712697029, "learning_rate": 9.73490140245679e-05, "loss": 0.0166, "step": 1354 }, { "epoch": 0.968722073279714, "grad_norm": 0.018130483105778694, "learning_rate": 9.734098580651553e-05, "loss": 0.0163, "step": 1355 }, { "epoch": 0.968722073279714, "eval_loss": 0.012665856629610062, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1355 }, { "epoch": 0.9694369973190349, "grad_norm": 0.01842976175248623, "learning_rate": 9.733294578260224e-05, "loss": 0.0126, "step": 1356 }, { "epoch": 0.9701519213583557, "grad_norm": 0.017097827047109604, "learning_rate": 9.7324893954833e-05, "loss": 0.0181, "step": 1357 }, { "epoch": 0.9708668453976765, "grad_norm": 0.017096178606152534, "learning_rate": 9.731683032521584e-05, "loss": 0.0185, "step": 1358 }, { "epoch": 0.9715817694369974, "grad_norm": 0.017754849046468735, "learning_rate": 9.73087548957616e-05, "loss": 0.0149, "step": 1359 }, { "epoch": 0.9722966934763181, "grad_norm": 0.017811819911003113, "learning_rate": 9.730066766848413e-05, "loss": 0.0174, "step": 1360 }, { "epoch": 0.9722966934763181, "eval_loss": 0.01261552982032299, "eval_runtime": 4.5895, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 1360 }, { "epoch": 0.9730116175156389, "grad_norm": 0.016067972406744957, "learning_rate": 9.729256864540025e-05, "loss": 0.013, "step": 1361 }, { "epoch": 0.9737265415549597, "grad_norm": 0.014152728952467442, "learning_rate": 9.728445782852966e-05, "loss": 0.0148, "step": 1362 }, { "epoch": 0.9744414655942806, "grad_norm": 0.014653116464614868, "learning_rate": 9.727633521989505e-05, "loss": 0.0163, "step": 1363 }, { "epoch": 0.9751563896336014, "grad_norm": 0.01565997302532196, "learning_rate": 9.726820082152204e-05, "loss": 0.0149, "step": 1364 }, { "epoch": 0.9758713136729222, "grad_norm": 0.021191231906414032, "learning_rate": 9.726005463543913e-05, "loss": 0.0151, "step": 1365 }, { "epoch": 0.9758713136729222, "eval_loss": 0.012383759953081608, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1365 }, { "epoch": 0.9765862377122431, "grad_norm": 0.016044190153479576, "learning_rate": 9.725189666367787e-05, "loss": 0.0174, "step": 1366 }, { "epoch": 
0.9773011617515639, "grad_norm": 0.0133839575573802, "learning_rate": 9.724372690827263e-05, "loss": 0.0173, "step": 1367 }, { "epoch": 0.9780160857908847, "grad_norm": 0.015328564681112766, "learning_rate": 9.723554537126083e-05, "loss": 0.0178, "step": 1368 }, { "epoch": 0.9787310098302056, "grad_norm": 0.015755649656057358, "learning_rate": 9.722735205468277e-05, "loss": 0.0161, "step": 1369 }, { "epoch": 0.9794459338695264, "grad_norm": 0.01477291714400053, "learning_rate": 9.721914696058165e-05, "loss": 0.0128, "step": 1370 }, { "epoch": 0.9794459338695264, "eval_loss": 0.012403419241309166, "eval_runtime": 4.5921, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 1370 }, { "epoch": 0.9801608579088472, "grad_norm": 0.014423100277781487, "learning_rate": 9.721093009100369e-05, "loss": 0.0133, "step": 1371 }, { "epoch": 0.980875781948168, "grad_norm": 0.01872345805168152, "learning_rate": 9.720270144799799e-05, "loss": 0.0111, "step": 1372 }, { "epoch": 0.9815907059874889, "grad_norm": 0.021227596327662468, "learning_rate": 9.719446103361662e-05, "loss": 0.0186, "step": 1373 }, { "epoch": 0.9823056300268097, "grad_norm": 0.014445452019572258, "learning_rate": 9.718620884991454e-05, "loss": 0.0212, "step": 1374 }, { "epoch": 0.9830205540661304, "grad_norm": 0.014993205666542053, "learning_rate": 9.71779448989497e-05, "loss": 0.017, "step": 1375 }, { "epoch": 0.9830205540661304, "eval_loss": 0.012145678512752056, "eval_runtime": 4.5875, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1375 }, { "epoch": 0.9837354781054513, "grad_norm": 0.022048041224479675, "learning_rate": 9.716966918278295e-05, "loss": 0.0199, "step": 1376 }, { "epoch": 0.9844504021447721, "grad_norm": 0.014498112723231316, "learning_rate": 9.716138170347808e-05, "loss": 0.0169, "step": 1377 }, { "epoch": 0.9851653261840929, "grad_norm": 0.01477633323520422, "learning_rate": 9.715308246310181e-05, "loss": 0.0111, "step": 1378 }, { "epoch": 0.9858802502234137, "grad_norm": 0.021706100553274155, "learning_rate": 9.714477146372383e-05, "loss": 0.0274, "step": 1379 }, { "epoch": 0.9865951742627346, "grad_norm": 0.014189481735229492, "learning_rate": 9.713644870741668e-05, "loss": 0.0172, "step": 1380 }, { "epoch": 0.9865951742627346, "eval_loss": 0.012036348693072796, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1380 }, { "epoch": 0.9873100983020554, "grad_norm": 0.014897498302161694, "learning_rate": 9.712811419625591e-05, "loss": 0.0172, "step": 1381 }, { "epoch": 0.9880250223413762, "grad_norm": 0.014075960963964462, "learning_rate": 9.711976793231999e-05, "loss": 0.0192, "step": 1382 }, { "epoch": 0.9887399463806971, "grad_norm": 0.016334088519215584, "learning_rate": 9.711140991769027e-05, "loss": 0.0214, "step": 1383 }, { "epoch": 0.9894548704200179, "grad_norm": 0.016760678961873055, "learning_rate": 9.71030401544511e-05, "loss": 0.0183, "step": 1384 }, { "epoch": 0.9901697944593387, "grad_norm": 0.017383402213454247, "learning_rate": 9.709465864468972e-05, "loss": 0.0182, "step": 1385 }, { "epoch": 0.9901697944593387, "eval_loss": 0.01196974702179432, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 1385 }, { "epoch": 0.9908847184986596, "grad_norm": 0.013206923380494118, "learning_rate": 9.708626539049628e-05, "loss": 0.018, "step": 1386 }, { "epoch": 0.9915996425379804, "grad_norm": 0.013950688764452934, "learning_rate": 9.70778603939639e-05, "loss": 0.0149, 
"step": 1387 }, { "epoch": 0.9923145665773012, "grad_norm": 0.014561481773853302, "learning_rate": 9.706944365718861e-05, "loss": 0.0141, "step": 1388 }, { "epoch": 0.9930294906166219, "grad_norm": 0.020603612065315247, "learning_rate": 9.706101518226938e-05, "loss": 0.0204, "step": 1389 }, { "epoch": 0.9937444146559428, "grad_norm": 0.015244817361235619, "learning_rate": 9.705257497130806e-05, "loss": 0.0231, "step": 1390 }, { "epoch": 0.9937444146559428, "eval_loss": 0.01193149946630001, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 1390 }, { "epoch": 0.9944593386952636, "grad_norm": 0.014603791758418083, "learning_rate": 9.704412302640952e-05, "loss": 0.0132, "step": 1391 }, { "epoch": 0.9951742627345844, "grad_norm": 0.018756719306111336, "learning_rate": 9.703565934968146e-05, "loss": 0.022, "step": 1392 }, { "epoch": 0.9958891867739053, "grad_norm": 0.018533237278461456, "learning_rate": 9.702718394323455e-05, "loss": 0.0195, "step": 1393 }, { "epoch": 0.9966041108132261, "grad_norm": 0.017961207777261734, "learning_rate": 9.70186968091824e-05, "loss": 0.0173, "step": 1394 }, { "epoch": 0.9973190348525469, "grad_norm": 0.016270499676465988, "learning_rate": 9.701019794964151e-05, "loss": 0.0167, "step": 1395 }, { "epoch": 0.9973190348525469, "eval_loss": 0.012068296782672405, "eval_runtime": 4.6146, "eval_samples_per_second": 10.835, "eval_steps_per_second": 2.817, "step": 1395 }, { "epoch": 0.9980339588918677, "grad_norm": 0.012491200119256973, "learning_rate": 9.700168736673132e-05, "loss": 0.0099, "step": 1396 }, { "epoch": 0.9987488829311886, "grad_norm": 0.016000665724277496, "learning_rate": 9.69931650625742e-05, "loss": 0.0118, "step": 1397 }, { "epoch": 0.9994638069705094, "grad_norm": 0.01695716753602028, "learning_rate": 9.698463103929542e-05, "loss": 0.0149, "step": 1398 }, { "epoch": 1.0001787310098302, "grad_norm": 0.018545350059866905, "learning_rate": 9.697608529902322e-05, "loss": 0.0161, "step": 1399 }, { "epoch": 1.000893655049151, "grad_norm": 0.01560596376657486, "learning_rate": 9.69675278438887e-05, "loss": 0.0216, "step": 1400 }, { "epoch": 1.000893655049151, "eval_loss": 0.01208866760134697, "eval_runtime": 4.6577, "eval_samples_per_second": 10.735, "eval_steps_per_second": 2.791, "step": 1400 }, { "epoch": 1.001608579088472, "grad_norm": 0.013943064026534557, "learning_rate": 9.695895867602591e-05, "loss": 0.0161, "step": 1401 }, { "epoch": 1.0023235031277926, "grad_norm": 0.01745019108057022, "learning_rate": 9.695037779757184e-05, "loss": 0.0161, "step": 1402 }, { "epoch": 1.0030384271671136, "grad_norm": 0.015333679504692554, "learning_rate": 9.694178521066639e-05, "loss": 0.0261, "step": 1403 }, { "epoch": 1.0037533512064343, "grad_norm": 0.015246083028614521, "learning_rate": 9.693318091745235e-05, "loss": 0.0183, "step": 1404 }, { "epoch": 1.0044682752457552, "grad_norm": 0.015694212168455124, "learning_rate": 9.692456492007548e-05, "loss": 0.0226, "step": 1405 }, { "epoch": 1.0044682752457552, "eval_loss": 0.012034466490149498, "eval_runtime": 4.5896, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 1405 }, { "epoch": 1.005183199285076, "grad_norm": 0.017148736864328384, "learning_rate": 9.69159372206844e-05, "loss": 0.0141, "step": 1406 }, { "epoch": 1.0058981233243969, "grad_norm": 0.014574551954865456, "learning_rate": 9.690729782143071e-05, "loss": 0.0122, "step": 1407 }, { "epoch": 1.0066130473637176, "grad_norm": 0.012545274570584297, "learning_rate": 
9.689864672446887e-05, "loss": 0.0145, "step": 1408 }, { "epoch": 1.0073279714030385, "grad_norm": 0.01747596263885498, "learning_rate": 9.68899839319563e-05, "loss": 0.0204, "step": 1409 }, { "epoch": 1.0080428954423593, "grad_norm": 0.017007114365696907, "learning_rate": 9.688130944605332e-05, "loss": 0.012, "step": 1410 }, { "epoch": 1.0080428954423593, "eval_loss": 0.01196687575429678, "eval_runtime": 4.5929, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1410 }, { "epoch": 1.00875781948168, "grad_norm": 0.014272141270339489, "learning_rate": 9.687262326892317e-05, "loss": 0.0131, "step": 1411 }, { "epoch": 1.009472743521001, "grad_norm": 0.014900906011462212, "learning_rate": 9.686392540273198e-05, "loss": 0.0131, "step": 1412 }, { "epoch": 1.0101876675603216, "grad_norm": 0.013925229199230671, "learning_rate": 9.685521584964884e-05, "loss": 0.0162, "step": 1413 }, { "epoch": 1.0109025915996426, "grad_norm": 0.013790036551654339, "learning_rate": 9.684649461184574e-05, "loss": 0.0163, "step": 1414 }, { "epoch": 1.0116175156389633, "grad_norm": 0.01308388076722622, "learning_rate": 9.683776169149755e-05, "loss": 0.0148, "step": 1415 }, { "epoch": 1.0116175156389633, "eval_loss": 0.012101860716938972, "eval_runtime": 4.6005, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 1415 }, { "epoch": 1.0123324396782842, "grad_norm": 0.015927108004689217, "learning_rate": 9.68290170907821e-05, "loss": 0.0144, "step": 1416 }, { "epoch": 1.013047363717605, "grad_norm": 0.0160032007843256, "learning_rate": 9.682026081188009e-05, "loss": 0.0161, "step": 1417 }, { "epoch": 1.013762287756926, "grad_norm": 0.014655036851763725, "learning_rate": 9.68114928569752e-05, "loss": 0.0278, "step": 1418 }, { "epoch": 1.0144772117962466, "grad_norm": 0.01834024302661419, "learning_rate": 9.680271322825392e-05, "loss": 0.0235, "step": 1419 }, { "epoch": 1.0151921358355676, "grad_norm": 0.016547827050089836, "learning_rate": 9.679392192790573e-05, "loss": 0.0139, "step": 1420 }, { "epoch": 1.0151921358355676, "eval_loss": 0.012046534568071365, "eval_runtime": 4.5992, "eval_samples_per_second": 10.871, "eval_steps_per_second": 2.827, "step": 1420 }, { "epoch": 1.0159070598748883, "grad_norm": 0.016205711290240288, "learning_rate": 9.678511895812302e-05, "loss": 0.0141, "step": 1421 }, { "epoch": 1.0166219839142092, "grad_norm": 0.01671687699854374, "learning_rate": 9.677630432110103e-05, "loss": 0.01, "step": 1422 }, { "epoch": 1.01733690795353, "grad_norm": 0.015802498906850815, "learning_rate": 9.676747801903797e-05, "loss": 0.0108, "step": 1423 }, { "epoch": 1.0180518319928507, "grad_norm": 0.02007474936544895, "learning_rate": 9.675864005413495e-05, "loss": 0.0193, "step": 1424 }, { "epoch": 1.0187667560321716, "grad_norm": 0.01713225618004799, "learning_rate": 9.674979042859593e-05, "loss": 0.0174, "step": 1425 }, { "epoch": 1.0187667560321716, "eval_loss": 0.012188945896923542, "eval_runtime": 4.5897, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 1425 }, { "epoch": 1.0194816800714923, "grad_norm": 0.015377532690763474, "learning_rate": 9.674092914462788e-05, "loss": 0.0267, "step": 1426 }, { "epoch": 1.0201966041108133, "grad_norm": 0.012830967083573341, "learning_rate": 9.673205620444058e-05, "loss": 0.0117, "step": 1427 }, { "epoch": 1.020911528150134, "grad_norm": 0.015536744147539139, "learning_rate": 9.672317161024678e-05, "loss": 0.0256, "step": 1428 }, { "epoch": 1.021626452189455, "grad_norm": 0.019377654418349266, 
"learning_rate": 9.67142753642621e-05, "loss": 0.0145, "step": 1429 }, { "epoch": 1.0223413762287756, "grad_norm": 0.014385178685188293, "learning_rate": 9.670536746870507e-05, "loss": 0.0159, "step": 1430 }, { "epoch": 1.0223413762287756, "eval_loss": 0.01243637129664421, "eval_runtime": 4.6118, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 1430 }, { "epoch": 1.0230563002680966, "grad_norm": 0.01608143374323845, "learning_rate": 9.669644792579716e-05, "loss": 0.018, "step": 1431 }, { "epoch": 1.0237712243074173, "grad_norm": 0.016003984957933426, "learning_rate": 9.66875167377627e-05, "loss": 0.0161, "step": 1432 }, { "epoch": 1.0244861483467382, "grad_norm": 0.017158720642328262, "learning_rate": 9.667857390682896e-05, "loss": 0.0252, "step": 1433 }, { "epoch": 1.025201072386059, "grad_norm": 0.01600365713238716, "learning_rate": 9.666961943522609e-05, "loss": 0.0259, "step": 1434 }, { "epoch": 1.02591599642538, "grad_norm": 0.015557283535599709, "learning_rate": 9.666065332518715e-05, "loss": 0.0209, "step": 1435 }, { "epoch": 1.02591599642538, "eval_loss": 0.012248494662344456, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1435 }, { "epoch": 1.0266309204647006, "grad_norm": 0.01773863472044468, "learning_rate": 9.665167557894807e-05, "loss": 0.0183, "step": 1436 }, { "epoch": 1.0273458445040216, "grad_norm": 0.01630120910704136, "learning_rate": 9.664268619874777e-05, "loss": 0.0138, "step": 1437 }, { "epoch": 1.0280607685433423, "grad_norm": 0.016204847022891045, "learning_rate": 9.663368518682798e-05, "loss": 0.0139, "step": 1438 }, { "epoch": 1.028775692582663, "grad_norm": 0.01192471757531166, "learning_rate": 9.662467254543337e-05, "loss": 0.0163, "step": 1439 }, { "epoch": 1.029490616621984, "grad_norm": 0.013005653396248817, "learning_rate": 9.661564827681153e-05, "loss": 0.0126, "step": 1440 }, { "epoch": 1.029490616621984, "eval_loss": 0.01205227430909872, "eval_runtime": 4.6189, "eval_samples_per_second": 10.825, "eval_steps_per_second": 2.815, "step": 1440 }, { "epoch": 1.0302055406613047, "grad_norm": 0.019784728065133095, "learning_rate": 9.660661238321288e-05, "loss": 0.0238, "step": 1441 }, { "epoch": 1.0309204647006256, "grad_norm": 0.01577785797417164, "learning_rate": 9.659756486689082e-05, "loss": 0.0212, "step": 1442 }, { "epoch": 1.0316353887399463, "grad_norm": 0.012285633012652397, "learning_rate": 9.658850573010161e-05, "loss": 0.0088, "step": 1443 }, { "epoch": 1.0323503127792673, "grad_norm": 0.01634279265999794, "learning_rate": 9.65794349751044e-05, "loss": 0.0154, "step": 1444 }, { "epoch": 1.033065236818588, "grad_norm": 0.021031349897384644, "learning_rate": 9.657035260416125e-05, "loss": 0.0225, "step": 1445 }, { "epoch": 1.033065236818588, "eval_loss": 0.012052839621901512, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 1445 }, { "epoch": 1.033780160857909, "grad_norm": 0.017529835924506187, "learning_rate": 9.656125861953711e-05, "loss": 0.014, "step": 1446 }, { "epoch": 1.0344950848972296, "grad_norm": 0.01729937642812729, "learning_rate": 9.655215302349984e-05, "loss": 0.0159, "step": 1447 }, { "epoch": 1.0352100089365506, "grad_norm": 0.015499744564294815, "learning_rate": 9.654303581832021e-05, "loss": 0.025, "step": 1448 }, { "epoch": 1.0359249329758713, "grad_norm": 0.016597507521510124, "learning_rate": 9.653390700627181e-05, "loss": 0.0197, "step": 1449 }, { "epoch": 1.0366398570151922, "grad_norm": 
0.01471409760415554, "learning_rate": 9.652476658963122e-05, "loss": 0.0161, "step": 1450 }, { "epoch": 1.0366398570151922, "eval_loss": 0.012187792919576168, "eval_runtime": 4.6046, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 1450 }, { "epoch": 1.037354781054513, "grad_norm": 0.015982763841748238, "learning_rate": 9.651561457067784e-05, "loss": 0.0183, "step": 1451 }, { "epoch": 1.0380697050938337, "grad_norm": 0.01433226466178894, "learning_rate": 9.650645095169404e-05, "loss": 0.0169, "step": 1452 }, { "epoch": 1.0387846291331546, "grad_norm": 0.012669232673943043, "learning_rate": 9.649727573496498e-05, "loss": 0.0114, "step": 1453 }, { "epoch": 1.0394995531724753, "grad_norm": 0.01876886747777462, "learning_rate": 9.64880889227788e-05, "loss": 0.0228, "step": 1454 }, { "epoch": 1.0402144772117963, "grad_norm": 0.016278943046927452, "learning_rate": 9.647889051742649e-05, "loss": 0.0207, "step": 1455 }, { "epoch": 1.0402144772117963, "eval_loss": 0.01217654813081026, "eval_runtime": 4.5854, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1455 }, { "epoch": 1.040929401251117, "grad_norm": 0.01443054061383009, "learning_rate": 9.646968052120196e-05, "loss": 0.0158, "step": 1456 }, { "epoch": 1.041644325290438, "grad_norm": 0.018767626956105232, "learning_rate": 9.646045893640196e-05, "loss": 0.0197, "step": 1457 }, { "epoch": 1.0423592493297587, "grad_norm": 0.015319926664233208, "learning_rate": 9.645122576532621e-05, "loss": 0.0209, "step": 1458 }, { "epoch": 1.0430741733690796, "grad_norm": 0.01774417981505394, "learning_rate": 9.644198101027721e-05, "loss": 0.0185, "step": 1459 }, { "epoch": 1.0437890974084003, "grad_norm": 0.019774233922362328, "learning_rate": 9.643272467356047e-05, "loss": 0.0276, "step": 1460 }, { "epoch": 1.0437890974084003, "eval_loss": 0.012439858168363571, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1460 }, { "epoch": 1.0445040214477213, "grad_norm": 0.01902852952480316, "learning_rate": 9.64234567574843e-05, "loss": 0.0214, "step": 1461 }, { "epoch": 1.045218945487042, "grad_norm": 0.015265647321939468, "learning_rate": 9.64141772643599e-05, "loss": 0.0172, "step": 1462 }, { "epoch": 1.045933869526363, "grad_norm": 0.015961283817887306, "learning_rate": 9.640488619650146e-05, "loss": 0.0152, "step": 1463 }, { "epoch": 1.0466487935656836, "grad_norm": 0.01369656901806593, "learning_rate": 9.639558355622589e-05, "loss": 0.0221, "step": 1464 }, { "epoch": 1.0473637176050046, "grad_norm": 0.01831701211631298, "learning_rate": 9.638626934585312e-05, "loss": 0.0126, "step": 1465 }, { "epoch": 1.0473637176050046, "eval_loss": 0.012201813980937004, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 1465 }, { "epoch": 1.0480786416443253, "grad_norm": 0.01871369406580925, "learning_rate": 9.637694356770594e-05, "loss": 0.0217, "step": 1466 }, { "epoch": 1.048793565683646, "grad_norm": 0.015181940980255604, "learning_rate": 9.636760622410997e-05, "loss": 0.0181, "step": 1467 }, { "epoch": 1.049508489722967, "grad_norm": 0.015288752503693104, "learning_rate": 9.635825731739377e-05, "loss": 0.0208, "step": 1468 }, { "epoch": 1.0502234137622877, "grad_norm": 0.012857263907790184, "learning_rate": 9.634889684988875e-05, "loss": 0.0094, "step": 1469 }, { "epoch": 1.0509383378016086, "grad_norm": 0.0171213336288929, "learning_rate": 9.633952482392923e-05, "loss": 0.0168, "step": 1470 }, { "epoch": 1.0509383378016086, 
"eval_loss": 0.012191025540232658, "eval_runtime": 4.5903, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 1470 }, { "epoch": 1.0516532618409293, "grad_norm": 0.014951729215681553, "learning_rate": 9.63301412418524e-05, "loss": 0.0114, "step": 1471 }, { "epoch": 1.0523681858802503, "grad_norm": 0.01716873236000538, "learning_rate": 9.63207461059983e-05, "loss": 0.0268, "step": 1472 }, { "epoch": 1.053083109919571, "grad_norm": 0.014852144755423069, "learning_rate": 9.631133941870993e-05, "loss": 0.0121, "step": 1473 }, { "epoch": 1.053798033958892, "grad_norm": 0.018351174890995026, "learning_rate": 9.630192118233309e-05, "loss": 0.0118, "step": 1474 }, { "epoch": 1.0545129579982127, "grad_norm": 0.013546477071940899, "learning_rate": 9.62924913992165e-05, "loss": 0.0163, "step": 1475 }, { "epoch": 1.0545129579982127, "eval_loss": 0.01218891330063343, "eval_runtime": 4.5898, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 1475 }, { "epoch": 1.0552278820375336, "grad_norm": 0.015062359161674976, "learning_rate": 9.628305007171176e-05, "loss": 0.0229, "step": 1476 }, { "epoch": 1.0559428060768543, "grad_norm": 0.0176682248711586, "learning_rate": 9.627359720217334e-05, "loss": 0.0156, "step": 1477 }, { "epoch": 1.0566577301161753, "grad_norm": 0.015113751403987408, "learning_rate": 9.626413279295858e-05, "loss": 0.015, "step": 1478 }, { "epoch": 1.057372654155496, "grad_norm": 0.01360116247087717, "learning_rate": 9.625465684642772e-05, "loss": 0.0149, "step": 1479 }, { "epoch": 1.058087578194817, "grad_norm": 0.017338382080197334, "learning_rate": 9.624516936494385e-05, "loss": 0.0231, "step": 1480 }, { "epoch": 1.058087578194817, "eval_loss": 0.01217577327042818, "eval_runtime": 4.6086, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 1480 }, { "epoch": 1.0588025022341376, "grad_norm": 0.012014620937407017, "learning_rate": 9.623567035087296e-05, "loss": 0.0145, "step": 1481 }, { "epoch": 1.0595174262734584, "grad_norm": 0.01646295003592968, "learning_rate": 9.622615980658391e-05, "loss": 0.0152, "step": 1482 }, { "epoch": 1.0602323503127793, "grad_norm": 0.01772437058389187, "learning_rate": 9.621663773444842e-05, "loss": 0.0124, "step": 1483 }, { "epoch": 1.0609472743521, "grad_norm": 0.015519833192229271, "learning_rate": 9.620710413684111e-05, "loss": 0.0156, "step": 1484 }, { "epoch": 1.061662198391421, "grad_norm": 0.015078241936862469, "learning_rate": 9.619755901613948e-05, "loss": 0.0205, "step": 1485 }, { "epoch": 1.061662198391421, "eval_loss": 0.012112184427678585, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 1485 }, { "epoch": 1.0623771224307417, "grad_norm": 0.01593601703643799, "learning_rate": 9.618800237472385e-05, "loss": 0.0152, "step": 1486 }, { "epoch": 1.0630920464700626, "grad_norm": 0.015779083594679832, "learning_rate": 9.617843421497745e-05, "loss": 0.0101, "step": 1487 }, { "epoch": 1.0638069705093833, "grad_norm": 0.012951752170920372, "learning_rate": 9.616885453928641e-05, "loss": 0.009, "step": 1488 }, { "epoch": 1.0645218945487043, "grad_norm": 0.016753247007727623, "learning_rate": 9.615926335003969e-05, "loss": 0.0176, "step": 1489 }, { "epoch": 1.065236818588025, "grad_norm": 0.014958434738218784, "learning_rate": 9.614966064962911e-05, "loss": 0.0174, "step": 1490 }, { "epoch": 1.065236818588025, "eval_loss": 0.012151597067713737, "eval_runtime": 4.6081, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, 
"step": 1490 }, { "epoch": 1.065951742627346, "grad_norm": 0.018417365849018097, "learning_rate": 9.614004644044942e-05, "loss": 0.0197, "step": 1491 }, { "epoch": 1.0666666666666667, "grad_norm": 0.018814783543348312, "learning_rate": 9.613042072489819e-05, "loss": 0.0099, "step": 1492 }, { "epoch": 1.0673815907059874, "grad_norm": 0.01447888370603323, "learning_rate": 9.612078350537586e-05, "loss": 0.0109, "step": 1493 }, { "epoch": 1.0680965147453083, "grad_norm": 0.017534315586090088, "learning_rate": 9.611113478428577e-05, "loss": 0.0199, "step": 1494 }, { "epoch": 1.068811438784629, "grad_norm": 0.019298193976283073, "learning_rate": 9.610147456403411e-05, "loss": 0.0186, "step": 1495 }, { "epoch": 1.068811438784629, "eval_loss": 0.01232554018497467, "eval_runtime": 4.5857, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1495 }, { "epoch": 1.06952636282395, "grad_norm": 0.01747329719364643, "learning_rate": 9.609180284702994e-05, "loss": 0.0269, "step": 1496 }, { "epoch": 1.0702412868632707, "grad_norm": 0.01710679568350315, "learning_rate": 9.608211963568518e-05, "loss": 0.0116, "step": 1497 }, { "epoch": 1.0709562109025916, "grad_norm": 0.015533632598817348, "learning_rate": 9.607242493241461e-05, "loss": 0.021, "step": 1498 }, { "epoch": 1.0716711349419124, "grad_norm": 0.016028018668293953, "learning_rate": 9.606271873963591e-05, "loss": 0.0148, "step": 1499 }, { "epoch": 1.0723860589812333, "grad_norm": 0.02269427478313446, "learning_rate": 9.605300105976959e-05, "loss": 0.0161, "step": 1500 }, { "epoch": 1.0723860589812333, "eval_loss": 0.012111329473555088, "eval_runtime": 4.6219, "eval_samples_per_second": 10.818, "eval_steps_per_second": 2.813, "step": 1500 }, { "epoch": 1.073100983020554, "grad_norm": 0.014093977399170399, "learning_rate": 9.604327189523906e-05, "loss": 0.0113, "step": 1501 }, { "epoch": 1.073815907059875, "grad_norm": 0.014205203391611576, "learning_rate": 9.603353124847054e-05, "loss": 0.0191, "step": 1502 }, { "epoch": 1.0745308310991957, "grad_norm": 0.01261715404689312, "learning_rate": 9.602377912189318e-05, "loss": 0.0131, "step": 1503 }, { "epoch": 1.0752457551385166, "grad_norm": 0.014648751355707645, "learning_rate": 9.601401551793891e-05, "loss": 0.0195, "step": 1504 }, { "epoch": 1.0759606791778373, "grad_norm": 0.01895512454211712, "learning_rate": 9.600424043904263e-05, "loss": 0.0201, "step": 1505 }, { "epoch": 1.0759606791778373, "eval_loss": 0.01209766324609518, "eval_runtime": 4.5987, "eval_samples_per_second": 10.873, "eval_steps_per_second": 2.827, "step": 1505 }, { "epoch": 1.0766756032171583, "grad_norm": 0.012491218745708466, "learning_rate": 9.599445388764199e-05, "loss": 0.0126, "step": 1506 }, { "epoch": 1.077390527256479, "grad_norm": 0.017487814649939537, "learning_rate": 9.598465586617758e-05, "loss": 0.0154, "step": 1507 }, { "epoch": 1.0781054512957997, "grad_norm": 0.013109483756124973, "learning_rate": 9.597484637709281e-05, "loss": 0.0109, "step": 1508 }, { "epoch": 1.0788203753351207, "grad_norm": 0.016225546598434448, "learning_rate": 9.596502542283398e-05, "loss": 0.0206, "step": 1509 }, { "epoch": 1.0795352993744414, "grad_norm": 0.01822725310921669, "learning_rate": 9.595519300585025e-05, "loss": 0.0259, "step": 1510 }, { "epoch": 1.0795352993744414, "eval_loss": 0.012293267995119095, "eval_runtime": 4.5909, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 1510 }, { "epoch": 1.0802502234137623, "grad_norm": 0.011270814575254917, "learning_rate": 
9.594534912859357e-05, "loss": 0.0099, "step": 1511 }, { "epoch": 1.080965147453083, "grad_norm": 0.01370084099471569, "learning_rate": 9.59354937935188e-05, "loss": 0.0219, "step": 1512 }, { "epoch": 1.081680071492404, "grad_norm": 0.018225504085421562, "learning_rate": 9.592562700308372e-05, "loss": 0.0184, "step": 1513 }, { "epoch": 1.0823949955317247, "grad_norm": 0.013587875291705132, "learning_rate": 9.591574875974884e-05, "loss": 0.0087, "step": 1514 }, { "epoch": 1.0831099195710456, "grad_norm": 0.016850406304001808, "learning_rate": 9.590585906597763e-05, "loss": 0.0191, "step": 1515 }, { "epoch": 1.0831099195710456, "eval_loss": 0.012477572076022625, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1515 }, { "epoch": 1.0838248436103664, "grad_norm": 0.017979402095079422, "learning_rate": 9.589595792423634e-05, "loss": 0.0099, "step": 1516 }, { "epoch": 1.0845397676496873, "grad_norm": 0.01811646670103073, "learning_rate": 9.588604533699415e-05, "loss": 0.0134, "step": 1517 }, { "epoch": 1.085254691689008, "grad_norm": 0.012201692909002304, "learning_rate": 9.587612130672301e-05, "loss": 0.0177, "step": 1518 }, { "epoch": 1.085969615728329, "grad_norm": 0.01908016763627529, "learning_rate": 9.58661858358978e-05, "loss": 0.0131, "step": 1519 }, { "epoch": 1.0866845397676497, "grad_norm": 0.018176473677158356, "learning_rate": 9.58562389269962e-05, "loss": 0.0182, "step": 1520 }, { "epoch": 1.0866845397676497, "eval_loss": 0.012455731630325317, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 1520 }, { "epoch": 1.0873994638069706, "grad_norm": 0.017395980656147003, "learning_rate": 9.584628058249878e-05, "loss": 0.0154, "step": 1521 }, { "epoch": 1.0881143878462913, "grad_norm": 0.016161702573299408, "learning_rate": 9.583631080488893e-05, "loss": 0.018, "step": 1522 }, { "epoch": 1.088829311885612, "grad_norm": 0.01303598377853632, "learning_rate": 9.582632959665292e-05, "loss": 0.0134, "step": 1523 }, { "epoch": 1.089544235924933, "grad_norm": 0.015012257732450962, "learning_rate": 9.581633696027985e-05, "loss": 0.0124, "step": 1524 }, { "epoch": 1.0902591599642537, "grad_norm": 0.014321183785796165, "learning_rate": 9.580633289826166e-05, "loss": 0.0202, "step": 1525 }, { "epoch": 1.0902591599642537, "eval_loss": 0.012348951771855354, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 1525 }, { "epoch": 1.0909740840035747, "grad_norm": 0.018148109316825867, "learning_rate": 9.57963174130932e-05, "loss": 0.019, "step": 1526 }, { "epoch": 1.0916890080428954, "grad_norm": 0.013260588981211185, "learning_rate": 9.578629050727208e-05, "loss": 0.0101, "step": 1527 }, { "epoch": 1.0924039320822163, "grad_norm": 0.015087808482348919, "learning_rate": 9.577625218329882e-05, "loss": 0.0185, "step": 1528 }, { "epoch": 1.093118856121537, "grad_norm": 0.022630278021097183, "learning_rate": 9.576620244367675e-05, "loss": 0.0126, "step": 1529 }, { "epoch": 1.093833780160858, "grad_norm": 0.01566842570900917, "learning_rate": 9.57561412909121e-05, "loss": 0.0174, "step": 1530 }, { "epoch": 1.093833780160858, "eval_loss": 0.012342050671577454, "eval_runtime": 4.6022, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 1530 }, { "epoch": 1.0945487042001787, "grad_norm": 0.011260483413934708, "learning_rate": 9.574606872751392e-05, "loss": 0.0132, "step": 1531 }, { "epoch": 1.0952636282394996, "grad_norm": 0.017549142241477966, 
"learning_rate": 9.573598475599405e-05, "loss": 0.0196, "step": 1532 }, { "epoch": 1.0959785522788204, "grad_norm": 0.014812183566391468, "learning_rate": 9.572588937886726e-05, "loss": 0.0209, "step": 1533 }, { "epoch": 1.0966934763181413, "grad_norm": 0.012753150425851345, "learning_rate": 9.571578259865111e-05, "loss": 0.0165, "step": 1534 }, { "epoch": 1.097408400357462, "grad_norm": 0.016046080738306046, "learning_rate": 9.570566441786604e-05, "loss": 0.0109, "step": 1535 }, { "epoch": 1.097408400357462, "eval_loss": 0.012489423155784607, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1535 }, { "epoch": 1.098123324396783, "grad_norm": 0.015458988025784492, "learning_rate": 9.569553483903531e-05, "loss": 0.0241, "step": 1536 }, { "epoch": 1.0988382484361037, "grad_norm": 0.017843030393123627, "learning_rate": 9.568539386468501e-05, "loss": 0.021, "step": 1537 }, { "epoch": 1.0995531724754244, "grad_norm": 0.014807204715907574, "learning_rate": 9.56752414973441e-05, "loss": 0.018, "step": 1538 }, { "epoch": 1.1002680965147453, "grad_norm": 0.012612146325409412, "learning_rate": 9.566507773954439e-05, "loss": 0.0121, "step": 1539 }, { "epoch": 1.100983020554066, "grad_norm": 0.016222748905420303, "learning_rate": 9.565490259382046e-05, "loss": 0.0171, "step": 1540 }, { "epoch": 1.100983020554066, "eval_loss": 0.012396078556776047, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 1540 }, { "epoch": 1.101697944593387, "grad_norm": 0.01382988691329956, "learning_rate": 9.564471606270984e-05, "loss": 0.0133, "step": 1541 }, { "epoch": 1.1024128686327077, "grad_norm": 0.011246897280216217, "learning_rate": 9.56345181487528e-05, "loss": 0.0172, "step": 1542 }, { "epoch": 1.1031277926720287, "grad_norm": 0.017425598576664925, "learning_rate": 9.56243088544925e-05, "loss": 0.0184, "step": 1543 }, { "epoch": 1.1038427167113494, "grad_norm": 0.011746547184884548, "learning_rate": 9.561408818247492e-05, "loss": 0.0117, "step": 1544 }, { "epoch": 1.1045576407506703, "grad_norm": 0.01496782898902893, "learning_rate": 9.560385613524889e-05, "loss": 0.0165, "step": 1545 }, { "epoch": 1.1045576407506703, "eval_loss": 0.012030269019305706, "eval_runtime": 4.5872, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 1545 }, { "epoch": 1.105272564789991, "grad_norm": 0.013846750371158123, "learning_rate": 9.559361271536609e-05, "loss": 0.0161, "step": 1546 }, { "epoch": 1.105987488829312, "grad_norm": 0.014123071916401386, "learning_rate": 9.558335792538098e-05, "loss": 0.0176, "step": 1547 }, { "epoch": 1.1067024128686327, "grad_norm": 0.017124686390161514, "learning_rate": 9.557309176785092e-05, "loss": 0.0126, "step": 1548 }, { "epoch": 1.1074173369079536, "grad_norm": 0.011276185512542725, "learning_rate": 9.556281424533606e-05, "loss": 0.0185, "step": 1549 }, { "epoch": 1.1081322609472744, "grad_norm": 0.01819572225213051, "learning_rate": 9.55525253603994e-05, "loss": 0.023, "step": 1550 }, { "epoch": 1.1081322609472744, "eval_loss": 0.011978447437286377, "eval_runtime": 4.5953, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 1550 }, { "epoch": 1.1088471849865953, "grad_norm": 0.01414346694946289, "learning_rate": 9.55422251156068e-05, "loss": 0.0189, "step": 1551 }, { "epoch": 1.109562109025916, "grad_norm": 0.014213754795491695, "learning_rate": 9.55319135135269e-05, "loss": 0.0118, "step": 1552 }, { "epoch": 1.1102770330652367, "grad_norm": 
0.013674567453563213, "learning_rate": 9.552159055673121e-05, "loss": 0.0184, "step": 1553 }, { "epoch": 1.1109919571045577, "grad_norm": 0.013213490135967731, "learning_rate": 9.551125624779406e-05, "loss": 0.0092, "step": 1554 }, { "epoch": 1.1117068811438784, "grad_norm": 0.014367647469043732, "learning_rate": 9.550091058929263e-05, "loss": 0.0131, "step": 1555 }, { "epoch": 1.1117068811438784, "eval_loss": 0.012006821110844612, "eval_runtime": 4.595, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 1555 }, { "epoch": 1.1124218051831993, "grad_norm": 0.014593611471354961, "learning_rate": 9.549055358380687e-05, "loss": 0.0195, "step": 1556 }, { "epoch": 1.11313672922252, "grad_norm": 0.02047288417816162, "learning_rate": 9.548018523391965e-05, "loss": 0.0225, "step": 1557 }, { "epoch": 1.113851653261841, "grad_norm": 0.014159639365971088, "learning_rate": 9.54698055422166e-05, "loss": 0.0155, "step": 1558 }, { "epoch": 1.1145665773011617, "grad_norm": 0.013446444645524025, "learning_rate": 9.54594145112862e-05, "loss": 0.0085, "step": 1559 }, { "epoch": 1.1152815013404827, "grad_norm": 0.01599978655576706, "learning_rate": 9.544901214371975e-05, "loss": 0.0118, "step": 1560 }, { "epoch": 1.1152815013404827, "eval_loss": 0.01169600896537304, "eval_runtime": 4.6115, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 1560 }, { "epoch": 1.1159964253798034, "grad_norm": 0.017309531569480896, "learning_rate": 9.543859844211141e-05, "loss": 0.0177, "step": 1561 }, { "epoch": 1.1167113494191243, "grad_norm": 0.01814572699368, "learning_rate": 9.542817340905809e-05, "loss": 0.0108, "step": 1562 }, { "epoch": 1.117426273458445, "grad_norm": 0.017358817160129547, "learning_rate": 9.541773704715964e-05, "loss": 0.0203, "step": 1563 }, { "epoch": 1.1181411974977657, "grad_norm": 0.017439493909478188, "learning_rate": 9.540728935901866e-05, "loss": 0.0138, "step": 1564 }, { "epoch": 1.1188561215370867, "grad_norm": 0.020946335047483444, "learning_rate": 9.539683034724054e-05, "loss": 0.0212, "step": 1565 }, { "epoch": 1.1188561215370867, "eval_loss": 0.01202012412250042, "eval_runtime": 4.596, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 1565 }, { "epoch": 1.1195710455764074, "grad_norm": 0.018464187160134315, "learning_rate": 9.53863600144336e-05, "loss": 0.0182, "step": 1566 }, { "epoch": 1.1202859696157283, "grad_norm": 0.01706024631857872, "learning_rate": 9.537587836320887e-05, "loss": 0.0192, "step": 1567 }, { "epoch": 1.121000893655049, "grad_norm": 0.018074534833431244, "learning_rate": 9.536538539618031e-05, "loss": 0.0179, "step": 1568 }, { "epoch": 1.12171581769437, "grad_norm": 0.017336366698145866, "learning_rate": 9.53548811159646e-05, "loss": 0.0179, "step": 1569 }, { "epoch": 1.1224307417336907, "grad_norm": 0.018226640298962593, "learning_rate": 9.534436552518133e-05, "loss": 0.014, "step": 1570 }, { "epoch": 1.1224307417336907, "eval_loss": 0.01217772625386715, "eval_runtime": 4.5961, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.828, "step": 1570 }, { "epoch": 1.1231456657730117, "grad_norm": 0.018371954560279846, "learning_rate": 9.533383862645284e-05, "loss": 0.0174, "step": 1571 }, { "epoch": 1.1238605898123324, "grad_norm": 0.011502498760819435, "learning_rate": 9.532330042240434e-05, "loss": 0.0135, "step": 1572 }, { "epoch": 1.1245755138516533, "grad_norm": 0.015541790053248405, "learning_rate": 9.531275091566382e-05, "loss": 0.0145, "step": 1573 }, { "epoch": 1.125290437890974, 
"grad_norm": 0.02032175473868847, "learning_rate": 9.530219010886213e-05, "loss": 0.0187, "step": 1574 }, { "epoch": 1.126005361930295, "grad_norm": 0.013857194222509861, "learning_rate": 9.529161800463291e-05, "loss": 0.017, "step": 1575 }, { "epoch": 1.126005361930295, "eval_loss": 0.012314243242144585, "eval_runtime": 4.5949, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 1575 }, { "epoch": 1.1267202859696157, "grad_norm": 0.02120121568441391, "learning_rate": 9.528103460561262e-05, "loss": 0.0195, "step": 1576 }, { "epoch": 1.1274352100089367, "grad_norm": 0.018549341708421707, "learning_rate": 9.527043991444053e-05, "loss": 0.0158, "step": 1577 }, { "epoch": 1.1281501340482574, "grad_norm": 0.013569529168307781, "learning_rate": 9.525983393375876e-05, "loss": 0.0145, "step": 1578 }, { "epoch": 1.128865058087578, "grad_norm": 0.0186674315482378, "learning_rate": 9.52492166662122e-05, "loss": 0.0186, "step": 1579 }, { "epoch": 1.129579982126899, "grad_norm": 0.015909641981124878, "learning_rate": 9.52385881144486e-05, "loss": 0.0157, "step": 1580 }, { "epoch": 1.129579982126899, "eval_loss": 0.012228206731379032, "eval_runtime": 4.6121, "eval_samples_per_second": 10.841, "eval_steps_per_second": 2.819, "step": 1580 }, { "epoch": 1.1302949061662197, "grad_norm": 0.0190535020083189, "learning_rate": 9.522794828111848e-05, "loss": 0.0198, "step": 1581 }, { "epoch": 1.1310098302055407, "grad_norm": 0.013092410750687122, "learning_rate": 9.52172971688752e-05, "loss": 0.0248, "step": 1582 }, { "epoch": 1.1317247542448614, "grad_norm": 0.015755973756313324, "learning_rate": 9.520663478037492e-05, "loss": 0.0089, "step": 1583 }, { "epoch": 1.1324396782841823, "grad_norm": 0.016492322087287903, "learning_rate": 9.519596111827664e-05, "loss": 0.0175, "step": 1584 }, { "epoch": 1.133154602323503, "grad_norm": 0.013163993135094643, "learning_rate": 9.518527618524213e-05, "loss": 0.0136, "step": 1585 }, { "epoch": 1.133154602323503, "eval_loss": 0.011969818733632565, "eval_runtime": 4.5911, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 1585 }, { "epoch": 1.133869526362824, "grad_norm": 0.014753281138837337, "learning_rate": 9.5174579983936e-05, "loss": 0.013, "step": 1586 }, { "epoch": 1.1345844504021447, "grad_norm": 0.017515774816274643, "learning_rate": 9.516387251702565e-05, "loss": 0.0197, "step": 1587 }, { "epoch": 1.1352993744414657, "grad_norm": 0.01795933209359646, "learning_rate": 9.515315378718133e-05, "loss": 0.0169, "step": 1588 }, { "epoch": 1.1360142984807864, "grad_norm": 0.020088287070393562, "learning_rate": 9.514242379707602e-05, "loss": 0.0131, "step": 1589 }, { "epoch": 1.1367292225201073, "grad_norm": 0.014534253627061844, "learning_rate": 9.51316825493856e-05, "loss": 0.0158, "step": 1590 }, { "epoch": 1.1367292225201073, "eval_loss": 0.011986306868493557, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1590 }, { "epoch": 1.137444146559428, "grad_norm": 0.014347931370139122, "learning_rate": 9.51209300467887e-05, "loss": 0.0161, "step": 1591 }, { "epoch": 1.138159070598749, "grad_norm": 0.013881289400160313, "learning_rate": 9.511016629196676e-05, "loss": 0.0139, "step": 1592 }, { "epoch": 1.1388739946380697, "grad_norm": 0.019408101215958595, "learning_rate": 9.509939128760406e-05, "loss": 0.0177, "step": 1593 }, { "epoch": 1.1395889186773904, "grad_norm": 0.013764169998466969, "learning_rate": 9.508860503638764e-05, "loss": 0.017, "step": 1594 }, { "epoch": 
1.1403038427167114, "grad_norm": 0.013661406002938747, "learning_rate": 9.50778075410074e-05, "loss": 0.0127, "step": 1595 }, { "epoch": 1.1403038427167114, "eval_loss": 0.012039771303534508, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 1595 }, { "epoch": 1.141018766756032, "grad_norm": 0.01669054850935936, "learning_rate": 9.506699880415597e-05, "loss": 0.0187, "step": 1596 }, { "epoch": 1.141733690795353, "grad_norm": 0.015329303219914436, "learning_rate": 9.505617882852884e-05, "loss": 0.0178, "step": 1597 }, { "epoch": 1.1424486148346737, "grad_norm": 0.013113804161548615, "learning_rate": 9.50453476168243e-05, "loss": 0.0157, "step": 1598 }, { "epoch": 1.1431635388739947, "grad_norm": 0.01827033795416355, "learning_rate": 9.503450517174344e-05, "loss": 0.0146, "step": 1599 }, { "epoch": 1.1438784629133154, "grad_norm": 0.014730091206729412, "learning_rate": 9.50236514959901e-05, "loss": 0.0169, "step": 1600 }, { "epoch": 1.1438784629133154, "eval_loss": 0.012252332642674446, "eval_runtime": 4.5912, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.832, "step": 1600 }, { "epoch": 1.1445933869526363, "grad_norm": 0.016247497871518135, "learning_rate": 9.5012786592271e-05, "loss": 0.0176, "step": 1601 }, { "epoch": 1.145308310991957, "grad_norm": 0.020303044468164444, "learning_rate": 9.500191046329561e-05, "loss": 0.022, "step": 1602 }, { "epoch": 1.146023235031278, "grad_norm": 0.014910767786204815, "learning_rate": 9.499102311177622e-05, "loss": 0.0097, "step": 1603 }, { "epoch": 1.1467381590705987, "grad_norm": 0.018635468557476997, "learning_rate": 9.498012454042786e-05, "loss": 0.0214, "step": 1604 }, { "epoch": 1.1474530831099194, "grad_norm": 0.01342886220663786, "learning_rate": 9.496921475196848e-05, "loss": 0.0117, "step": 1605 }, { "epoch": 1.1474530831099194, "eval_loss": 0.012212568894028664, "eval_runtime": 4.5972, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 1605 }, { "epoch": 1.1481680071492404, "grad_norm": 0.01740073785185814, "learning_rate": 9.49582937491187e-05, "loss": 0.0138, "step": 1606 }, { "epoch": 1.1488829311885613, "grad_norm": 0.017415938898921013, "learning_rate": 9.494736153460203e-05, "loss": 0.0171, "step": 1607 }, { "epoch": 1.149597855227882, "grad_norm": 0.012892560102045536, "learning_rate": 9.493641811114472e-05, "loss": 0.011, "step": 1608 }, { "epoch": 1.1503127792672028, "grad_norm": 0.014154138043522835, "learning_rate": 9.492546348147583e-05, "loss": 0.0161, "step": 1609 }, { "epoch": 1.1510277033065237, "grad_norm": 0.01871982216835022, "learning_rate": 9.491449764832723e-05, "loss": 0.0162, "step": 1610 }, { "epoch": 1.1510277033065237, "eval_loss": 0.01211416907608509, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 1610 }, { "epoch": 1.1517426273458444, "grad_norm": 0.013002400286495686, "learning_rate": 9.490352061443354e-05, "loss": 0.0111, "step": 1611 }, { "epoch": 1.1524575513851654, "grad_norm": 0.017501605674624443, "learning_rate": 9.489253238253225e-05, "loss": 0.0141, "step": 1612 }, { "epoch": 1.153172475424486, "grad_norm": 0.02342977747321129, "learning_rate": 9.488153295536357e-05, "loss": 0.0168, "step": 1613 }, { "epoch": 1.153887399463807, "grad_norm": 0.017331084236502647, "learning_rate": 9.487052233567054e-05, "loss": 0.0162, "step": 1614 }, { "epoch": 1.1546023235031277, "grad_norm": 0.015860745683312416, "learning_rate": 9.485950052619897e-05, "loss": 0.0161, "step": 1615 }, 
{ "epoch": 1.1546023235031277, "eval_loss": 0.012241239659488201, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1615 }, { "epoch": 1.1553172475424487, "grad_norm": 0.02026781067252159, "learning_rate": 9.484846752969748e-05, "loss": 0.0132, "step": 1616 }, { "epoch": 1.1560321715817694, "grad_norm": 0.016692565754055977, "learning_rate": 9.483742334891746e-05, "loss": 0.0136, "step": 1617 }, { "epoch": 1.1567470956210903, "grad_norm": 0.015538191422820091, "learning_rate": 9.482636798661311e-05, "loss": 0.0221, "step": 1618 }, { "epoch": 1.157462019660411, "grad_norm": 0.019147446379065514, "learning_rate": 9.48153014455414e-05, "loss": 0.012, "step": 1619 }, { "epoch": 1.1581769436997318, "grad_norm": 0.019121931865811348, "learning_rate": 9.480422372846212e-05, "loss": 0.0162, "step": 1620 }, { "epoch": 1.1581769436997318, "eval_loss": 0.012264718301594257, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 1620 }, { "epoch": 1.1588918677390527, "grad_norm": 0.015839848667383194, "learning_rate": 9.479313483813779e-05, "loss": 0.0186, "step": 1621 }, { "epoch": 1.1596067917783737, "grad_norm": 0.013525371439754963, "learning_rate": 9.478203477733376e-05, "loss": 0.0143, "step": 1622 }, { "epoch": 1.1603217158176944, "grad_norm": 0.017085010185837746, "learning_rate": 9.477092354881818e-05, "loss": 0.0179, "step": 1623 }, { "epoch": 1.161036639857015, "grad_norm": 0.01467293780297041, "learning_rate": 9.475980115536192e-05, "loss": 0.0116, "step": 1624 }, { "epoch": 1.161751563896336, "grad_norm": 0.018744491040706635, "learning_rate": 9.474866759973871e-05, "loss": 0.0144, "step": 1625 }, { "epoch": 1.161751563896336, "eval_loss": 0.012321384623646736, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1625 }, { "epoch": 1.1624664879356568, "grad_norm": 0.01485452800989151, "learning_rate": 9.473752288472499e-05, "loss": 0.0212, "step": 1626 }, { "epoch": 1.1631814119749777, "grad_norm": 0.017830410972237587, "learning_rate": 9.472636701310005e-05, "loss": 0.0169, "step": 1627 }, { "epoch": 1.1638963360142984, "grad_norm": 0.016119813546538353, "learning_rate": 9.471519998764593e-05, "loss": 0.0136, "step": 1628 }, { "epoch": 1.1646112600536194, "grad_norm": 0.014500639401376247, "learning_rate": 9.470402181114746e-05, "loss": 0.0148, "step": 1629 }, { "epoch": 1.16532618409294, "grad_norm": 0.016214875504374504, "learning_rate": 9.469283248639222e-05, "loss": 0.0211, "step": 1630 }, { "epoch": 1.16532618409294, "eval_loss": 0.012037081643939018, "eval_runtime": 4.5913, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 1630 }, { "epoch": 1.166041108132261, "grad_norm": 0.015095865353941917, "learning_rate": 9.468163201617062e-05, "loss": 0.0167, "step": 1631 }, { "epoch": 1.1667560321715817, "grad_norm": 0.012522408738732338, "learning_rate": 9.467042040327582e-05, "loss": 0.0138, "step": 1632 }, { "epoch": 1.1674709562109027, "grad_norm": 0.017134439200162888, "learning_rate": 9.465919765050375e-05, "loss": 0.0194, "step": 1633 }, { "epoch": 1.1681858802502234, "grad_norm": 0.01334018912166357, "learning_rate": 9.464796376065314e-05, "loss": 0.0122, "step": 1634 }, { "epoch": 1.1689008042895441, "grad_norm": 0.014379623346030712, "learning_rate": 9.46367187365255e-05, "loss": 0.0148, "step": 1635 }, { "epoch": 1.1689008042895441, "eval_loss": 0.01228685025125742, "eval_runtime": 4.5836, "eval_samples_per_second": 
10.908, "eval_steps_per_second": 2.836, "step": 1635 }, { "epoch": 1.169615728328865, "grad_norm": 0.01682630181312561, "learning_rate": 9.462546258092511e-05, "loss": 0.0203, "step": 1636 }, { "epoch": 1.1703306523681858, "grad_norm": 0.012595698237419128, "learning_rate": 9.4614195296659e-05, "loss": 0.016, "step": 1637 }, { "epoch": 1.1710455764075067, "grad_norm": 0.015002705156803131, "learning_rate": 9.460291688653702e-05, "loss": 0.0199, "step": 1638 }, { "epoch": 1.1717605004468274, "grad_norm": 0.016054118052124977, "learning_rate": 9.459162735337174e-05, "loss": 0.0176, "step": 1639 }, { "epoch": 1.1724754244861484, "grad_norm": 0.016581131145358086, "learning_rate": 9.458032669997857e-05, "loss": 0.0118, "step": 1640 }, { "epoch": 1.1724754244861484, "eval_loss": 0.012271757237613201, "eval_runtime": 4.5891, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 1640 }, { "epoch": 1.173190348525469, "grad_norm": 0.01608113944530487, "learning_rate": 9.456901492917565e-05, "loss": 0.0138, "step": 1641 }, { "epoch": 1.17390527256479, "grad_norm": 0.0180245041847229, "learning_rate": 9.455769204378387e-05, "loss": 0.019, "step": 1642 }, { "epoch": 1.1746201966041108, "grad_norm": 0.014111057855188847, "learning_rate": 9.454635804662697e-05, "loss": 0.0141, "step": 1643 }, { "epoch": 1.1753351206434317, "grad_norm": 0.033782511949539185, "learning_rate": 9.453501294053138e-05, "loss": 0.0118, "step": 1644 }, { "epoch": 1.1760500446827524, "grad_norm": 0.01635146327316761, "learning_rate": 9.452365672832635e-05, "loss": 0.0126, "step": 1645 }, { "epoch": 1.1760500446827524, "eval_loss": 0.012371747754514217, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 1645 }, { "epoch": 1.1767649687220734, "grad_norm": 0.01693066768348217, "learning_rate": 9.451228941284389e-05, "loss": 0.0182, "step": 1646 }, { "epoch": 1.177479892761394, "grad_norm": 0.04258311912417412, "learning_rate": 9.450091099691875e-05, "loss": 0.0182, "step": 1647 }, { "epoch": 1.178194816800715, "grad_norm": 0.016908830031752586, "learning_rate": 9.44895214833885e-05, "loss": 0.0167, "step": 1648 }, { "epoch": 1.1789097408400357, "grad_norm": 0.01612919755280018, "learning_rate": 9.447812087509342e-05, "loss": 0.023, "step": 1649 }, { "epoch": 1.1796246648793565, "grad_norm": 0.012471341527998447, "learning_rate": 9.44667091748766e-05, "loss": 0.0089, "step": 1650 }, { "epoch": 1.1796246648793565, "eval_loss": 0.012459165416657925, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 1650 }, { "epoch": 1.1803395889186774, "grad_norm": 0.013132600113749504, "learning_rate": 9.445528638558388e-05, "loss": 0.0127, "step": 1651 }, { "epoch": 1.1810545129579981, "grad_norm": 0.01619238778948784, "learning_rate": 9.444385251006388e-05, "loss": 0.0167, "step": 1652 }, { "epoch": 1.181769436997319, "grad_norm": 0.013245820999145508, "learning_rate": 9.443240755116795e-05, "loss": 0.0157, "step": 1653 }, { "epoch": 1.1824843610366398, "grad_norm": 0.016862038522958755, "learning_rate": 9.442095151175023e-05, "loss": 0.0136, "step": 1654 }, { "epoch": 1.1831992850759607, "grad_norm": 0.016700543463230133, "learning_rate": 9.440948439466763e-05, "loss": 0.0161, "step": 1655 }, { "epoch": 1.1831992850759607, "eval_loss": 0.012241752818226814, "eval_runtime": 4.607, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 1655 }, { "epoch": 1.1839142091152814, "grad_norm": 0.01271054521203041, 
"learning_rate": 9.439800620277981e-05, "loss": 0.0117, "step": 1656 }, { "epoch": 1.1846291331546024, "grad_norm": 0.013051116839051247, "learning_rate": 9.43865169389492e-05, "loss": 0.0136, "step": 1657 }, { "epoch": 1.185344057193923, "grad_norm": 0.014811989851295948, "learning_rate": 9.437501660604094e-05, "loss": 0.0168, "step": 1658 }, { "epoch": 1.186058981233244, "grad_norm": 0.015995517373085022, "learning_rate": 9.436350520692303e-05, "loss": 0.0154, "step": 1659 }, { "epoch": 1.1867739052725648, "grad_norm": 0.015877526253461838, "learning_rate": 9.435198274446613e-05, "loss": 0.0254, "step": 1660 }, { "epoch": 1.1867739052725648, "eval_loss": 0.012432483956217766, "eval_runtime": 4.5874, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 1660 }, { "epoch": 1.1874888293118857, "grad_norm": 0.014004581607878208, "learning_rate": 9.434044922154375e-05, "loss": 0.0098, "step": 1661 }, { "epoch": 1.1882037533512064, "grad_norm": 0.022617844864726067, "learning_rate": 9.432890464103208e-05, "loss": 0.0192, "step": 1662 }, { "epoch": 1.1889186773905274, "grad_norm": 0.014598255045711994, "learning_rate": 9.43173490058101e-05, "loss": 0.0128, "step": 1663 }, { "epoch": 1.189633601429848, "grad_norm": 0.015313958749175072, "learning_rate": 9.430578231875955e-05, "loss": 0.0218, "step": 1664 }, { "epoch": 1.1903485254691688, "grad_norm": 0.015525487251579762, "learning_rate": 9.429420458276494e-05, "loss": 0.0116, "step": 1665 }, { "epoch": 1.1903485254691688, "eval_loss": 0.012395775876939297, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 1665 }, { "epoch": 1.1910634495084897, "grad_norm": 0.020737146958708763, "learning_rate": 9.42826158007135e-05, "loss": 0.011, "step": 1666 }, { "epoch": 1.1917783735478105, "grad_norm": 0.017319971695542336, "learning_rate": 9.427101597549521e-05, "loss": 0.0213, "step": 1667 }, { "epoch": 1.1924932975871314, "grad_norm": 0.014774920418858528, "learning_rate": 9.425940511000287e-05, "loss": 0.0141, "step": 1668 }, { "epoch": 1.1932082216264521, "grad_norm": 0.018654536455869675, "learning_rate": 9.424778320713196e-05, "loss": 0.0155, "step": 1669 }, { "epoch": 1.193923145665773, "grad_norm": 0.0210418701171875, "learning_rate": 9.423615026978076e-05, "loss": 0.0254, "step": 1670 }, { "epoch": 1.193923145665773, "eval_loss": 0.012539276853203773, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1670 }, { "epoch": 1.1946380697050938, "grad_norm": 0.012642957270145416, "learning_rate": 9.422450630085026e-05, "loss": 0.0161, "step": 1671 }, { "epoch": 1.1953529937444147, "grad_norm": 0.012553463689982891, "learning_rate": 9.421285130324425e-05, "loss": 0.0157, "step": 1672 }, { "epoch": 1.1960679177837354, "grad_norm": 0.01242122519761324, "learning_rate": 9.420118527986923e-05, "loss": 0.0118, "step": 1673 }, { "epoch": 1.1967828418230564, "grad_norm": 0.017479486763477325, "learning_rate": 9.418950823363446e-05, "loss": 0.0148, "step": 1674 }, { "epoch": 1.197497765862377, "grad_norm": 0.016611894592642784, "learning_rate": 9.417782016745198e-05, "loss": 0.0197, "step": 1675 }, { "epoch": 1.197497765862377, "eval_loss": 0.012734027579426765, "eval_runtime": 4.596, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 1675 }, { "epoch": 1.1982126899016978, "grad_norm": 0.018829112872481346, "learning_rate": 9.416612108423653e-05, "loss": 0.0104, "step": 1676 }, { "epoch": 1.1989276139410188, "grad_norm": 
0.015100840479135513, "learning_rate": 9.415441098690561e-05, "loss": 0.0116, "step": 1677 }, { "epoch": 1.1996425379803397, "grad_norm": 0.01953725703060627, "learning_rate": 9.414268987837949e-05, "loss": 0.0186, "step": 1678 }, { "epoch": 1.2003574620196604, "grad_norm": 0.02392272651195526, "learning_rate": 9.413095776158118e-05, "loss": 0.0143, "step": 1679 }, { "epoch": 1.2010723860589811, "grad_norm": 0.019708853214979172, "learning_rate": 9.41192146394364e-05, "loss": 0.0174, "step": 1680 }, { "epoch": 1.2010723860589811, "eval_loss": 0.012188290245831013, "eval_runtime": 4.601, "eval_samples_per_second": 10.867, "eval_steps_per_second": 2.825, "step": 1680 }, { "epoch": 1.201787310098302, "grad_norm": 0.017809847369790077, "learning_rate": 9.410746051487367e-05, "loss": 0.0201, "step": 1681 }, { "epoch": 1.2025022341376228, "grad_norm": 0.015407967381179333, "learning_rate": 9.409569539082421e-05, "loss": 0.0204, "step": 1682 }, { "epoch": 1.2032171581769437, "grad_norm": 0.017430374398827553, "learning_rate": 9.4083919270222e-05, "loss": 0.0171, "step": 1683 }, { "epoch": 1.2039320822162645, "grad_norm": 0.01640516333281994, "learning_rate": 9.407213215600377e-05, "loss": 0.0171, "step": 1684 }, { "epoch": 1.2046470062555854, "grad_norm": 0.013257295824587345, "learning_rate": 9.406033405110896e-05, "loss": 0.0089, "step": 1685 }, { "epoch": 1.2046470062555854, "eval_loss": 0.011982856318354607, "eval_runtime": 4.5933, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1685 }, { "epoch": 1.2053619302949061, "grad_norm": 0.015456026419997215, "learning_rate": 9.40485249584798e-05, "loss": 0.0182, "step": 1686 }, { "epoch": 1.206076854334227, "grad_norm": 0.013479904271662235, "learning_rate": 9.40367048810612e-05, "loss": 0.0111, "step": 1687 }, { "epoch": 1.2067917783735478, "grad_norm": 0.02088511921465397, "learning_rate": 9.402487382180088e-05, "loss": 0.0256, "step": 1688 }, { "epoch": 1.2075067024128687, "grad_norm": 0.01535392738878727, "learning_rate": 9.401303178364922e-05, "loss": 0.0168, "step": 1689 }, { "epoch": 1.2082216264521894, "grad_norm": 0.015979446470737457, "learning_rate": 9.400117876955943e-05, "loss": 0.0156, "step": 1690 }, { "epoch": 1.2082216264521894, "eval_loss": 0.011895110830664635, "eval_runtime": 4.6048, "eval_samples_per_second": 10.858, "eval_steps_per_second": 2.823, "step": 1690 }, { "epoch": 1.2089365504915102, "grad_norm": 0.016626644879579544, "learning_rate": 9.398931478248736e-05, "loss": 0.0146, "step": 1691 }, { "epoch": 1.209651474530831, "grad_norm": 0.01732015050947666, "learning_rate": 9.397743982539167e-05, "loss": 0.0151, "step": 1692 }, { "epoch": 1.2103663985701518, "grad_norm": 0.013306284323334694, "learning_rate": 9.39655539012337e-05, "loss": 0.0182, "step": 1693 }, { "epoch": 1.2110813226094728, "grad_norm": 0.017807256430387497, "learning_rate": 9.395365701297759e-05, "loss": 0.022, "step": 1694 }, { "epoch": 1.2117962466487935, "grad_norm": 0.016601061448454857, "learning_rate": 9.394174916359016e-05, "loss": 0.0221, "step": 1695 }, { "epoch": 1.2117962466487935, "eval_loss": 0.011736569926142693, "eval_runtime": 4.5872, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 1695 }, { "epoch": 1.2125111706881144, "grad_norm": 0.013202894479036331, "learning_rate": 9.392983035604097e-05, "loss": 0.0129, "step": 1696 }, { "epoch": 1.2132260947274351, "grad_norm": 0.013655259273946285, "learning_rate": 9.391790059330234e-05, "loss": 0.0178, "step": 1697 }, { "epoch": 
1.213941018766756, "grad_norm": 0.013044866733253002, "learning_rate": 9.390595987834929e-05, "loss": 0.0111, "step": 1698 }, { "epoch": 1.2146559428060768, "grad_norm": 0.01642894372344017, "learning_rate": 9.389400821415961e-05, "loss": 0.0185, "step": 1699 }, { "epoch": 1.2153708668453977, "grad_norm": 0.01419941894710064, "learning_rate": 9.388204560371378e-05, "loss": 0.0185, "step": 1700 }, { "epoch": 1.2153708668453977, "eval_loss": 0.011785692535340786, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 1700 }, { "epoch": 1.2160857908847185, "grad_norm": 0.015338912606239319, "learning_rate": 9.387007204999503e-05, "loss": 0.0186, "step": 1701 }, { "epoch": 1.2168007149240394, "grad_norm": 0.01665988564491272, "learning_rate": 9.385808755598931e-05, "loss": 0.0215, "step": 1702 }, { "epoch": 1.2175156389633601, "grad_norm": 0.01707652024924755, "learning_rate": 9.384609212468532e-05, "loss": 0.0156, "step": 1703 }, { "epoch": 1.218230563002681, "grad_norm": 0.013057400472462177, "learning_rate": 9.383408575907447e-05, "loss": 0.0131, "step": 1704 }, { "epoch": 1.2189454870420018, "grad_norm": 0.018720898777246475, "learning_rate": 9.382206846215088e-05, "loss": 0.0159, "step": 1705 }, { "epoch": 1.2189454870420018, "eval_loss": 0.011546487919986248, "eval_runtime": 4.5906, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 1705 }, { "epoch": 1.2196604110813225, "grad_norm": 0.01651415228843689, "learning_rate": 9.381004023691142e-05, "loss": 0.0176, "step": 1706 }, { "epoch": 1.2203753351206434, "grad_norm": 0.013818725012242794, "learning_rate": 9.37980010863557e-05, "loss": 0.0105, "step": 1707 }, { "epoch": 1.2210902591599642, "grad_norm": 0.012934042140841484, "learning_rate": 9.378595101348602e-05, "loss": 0.013, "step": 1708 }, { "epoch": 1.221805183199285, "grad_norm": 0.013141704723238945, "learning_rate": 9.377389002130741e-05, "loss": 0.0133, "step": 1709 }, { "epoch": 1.2225201072386058, "grad_norm": 0.014822929166257381, "learning_rate": 9.376181811282764e-05, "loss": 0.0169, "step": 1710 }, { "epoch": 1.2225201072386058, "eval_loss": 0.011671124957501888, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 1710 }, { "epoch": 1.2232350312779268, "grad_norm": 0.01594484969973564, "learning_rate": 9.374973529105722e-05, "loss": 0.009, "step": 1711 }, { "epoch": 1.2239499553172475, "grad_norm": 0.014256695285439491, "learning_rate": 9.373764155900931e-05, "loss": 0.017, "step": 1712 }, { "epoch": 1.2246648793565684, "grad_norm": 0.017994582653045654, "learning_rate": 9.372553691969988e-05, "loss": 0.0153, "step": 1713 }, { "epoch": 1.2253798033958891, "grad_norm": 0.017398925498127937, "learning_rate": 9.371342137614753e-05, "loss": 0.0161, "step": 1714 }, { "epoch": 1.22609472743521, "grad_norm": 0.014987699687480927, "learning_rate": 9.370129493137366e-05, "loss": 0.0169, "step": 1715 }, { "epoch": 1.22609472743521, "eval_loss": 0.01150783896446228, "eval_runtime": 4.5956, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 1715 }, { "epoch": 1.2268096514745308, "grad_norm": 0.013035807758569717, "learning_rate": 9.368915758840235e-05, "loss": 0.0159, "step": 1716 }, { "epoch": 1.2275245755138517, "grad_norm": 0.019004464149475098, "learning_rate": 9.367700935026038e-05, "loss": 0.0199, "step": 1717 }, { "epoch": 1.2282394995531725, "grad_norm": 0.020046228542923927, "learning_rate": 9.366485021997728e-05, "loss": 0.0147, "step": 
1718 }, { "epoch": 1.2289544235924934, "grad_norm": 0.018093518912792206, "learning_rate": 9.365268020058531e-05, "loss": 0.02, "step": 1719 }, { "epoch": 1.2296693476318141, "grad_norm": 0.012739380821585655, "learning_rate": 9.364049929511939e-05, "loss": 0.0142, "step": 1720 }, { "epoch": 1.2296693476318141, "eval_loss": 0.011621501296758652, "eval_runtime": 4.5931, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 1720 }, { "epoch": 1.2303842716711348, "grad_norm": 0.01802681013941765, "learning_rate": 9.36283075066172e-05, "loss": 0.0281, "step": 1721 }, { "epoch": 1.2310991957104558, "grad_norm": 0.01786441169679165, "learning_rate": 9.36161048381191e-05, "loss": 0.0186, "step": 1722 }, { "epoch": 1.2318141197497765, "grad_norm": 0.017941631376743317, "learning_rate": 9.360389129266822e-05, "loss": 0.0155, "step": 1723 }, { "epoch": 1.2325290437890974, "grad_norm": 0.01798142120242119, "learning_rate": 9.359166687331031e-05, "loss": 0.0126, "step": 1724 }, { "epoch": 1.2332439678284182, "grad_norm": 0.01607399433851242, "learning_rate": 9.357943158309395e-05, "loss": 0.0129, "step": 1725 }, { "epoch": 1.2332439678284182, "eval_loss": 0.011679301038384438, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1725 }, { "epoch": 1.233958891867739, "grad_norm": 0.016704272478818893, "learning_rate": 9.356718542507032e-05, "loss": 0.0212, "step": 1726 }, { "epoch": 1.2346738159070598, "grad_norm": 0.01819399930536747, "learning_rate": 9.355492840229339e-05, "loss": 0.0202, "step": 1727 }, { "epoch": 1.2353887399463808, "grad_norm": 0.013572904281318188, "learning_rate": 9.354266051781978e-05, "loss": 0.0114, "step": 1728 }, { "epoch": 1.2361036639857015, "grad_norm": 0.01774640567600727, "learning_rate": 9.353038177470886e-05, "loss": 0.014, "step": 1729 }, { "epoch": 1.2368185880250224, "grad_norm": 0.016620537266135216, "learning_rate": 9.35180921760227e-05, "loss": 0.0128, "step": 1730 }, { "epoch": 1.2368185880250224, "eval_loss": 0.011705503799021244, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1730 }, { "epoch": 1.2375335120643431, "grad_norm": 0.01967400498688221, "learning_rate": 9.350579172482606e-05, "loss": 0.025, "step": 1731 }, { "epoch": 1.2382484361036639, "grad_norm": 0.019106872379779816, "learning_rate": 9.349348042418642e-05, "loss": 0.0134, "step": 1732 }, { "epoch": 1.2389633601429848, "grad_norm": 0.014358198270201683, "learning_rate": 9.348115827717397e-05, "loss": 0.0102, "step": 1733 }, { "epoch": 1.2396782841823057, "grad_norm": 0.014011198654770851, "learning_rate": 9.346882528686159e-05, "loss": 0.011, "step": 1734 }, { "epoch": 1.2403932082216265, "grad_norm": 0.016511883586645126, "learning_rate": 9.345648145632489e-05, "loss": 0.0141, "step": 1735 }, { "epoch": 1.2403932082216265, "eval_loss": 0.011741181835532188, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1735 }, { "epoch": 1.2411081322609472, "grad_norm": 0.01790427975356579, "learning_rate": 9.344412678864213e-05, "loss": 0.0132, "step": 1736 }, { "epoch": 1.2418230563002681, "grad_norm": 0.018337037414312363, "learning_rate": 9.343176128689434e-05, "loss": 0.0184, "step": 1737 }, { "epoch": 1.2425379803395888, "grad_norm": 0.017898451536893845, "learning_rate": 9.34193849541652e-05, "loss": 0.0205, "step": 1738 }, { "epoch": 1.2432529043789098, "grad_norm": 0.01309875026345253, "learning_rate": 9.340699779354114e-05, "loss": 
0.0121, "step": 1739 }, { "epoch": 1.2439678284182305, "grad_norm": 0.019314752891659737, "learning_rate": 9.339459980811123e-05, "loss": 0.0155, "step": 1740 }, { "epoch": 1.2439678284182305, "eval_loss": 0.011973638087511063, "eval_runtime": 4.5892, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 1740 }, { "epoch": 1.2446827524575514, "grad_norm": 0.018559034913778305, "learning_rate": 9.338219100096727e-05, "loss": 0.0169, "step": 1741 }, { "epoch": 1.2453976764968722, "grad_norm": 0.0151993278414011, "learning_rate": 9.336977137520379e-05, "loss": 0.0181, "step": 1742 }, { "epoch": 1.246112600536193, "grad_norm": 0.014596928842365742, "learning_rate": 9.335734093391796e-05, "loss": 0.0177, "step": 1743 }, { "epoch": 1.2468275245755138, "grad_norm": 0.0134976115077734, "learning_rate": 9.334489968020969e-05, "loss": 0.0162, "step": 1744 }, { "epoch": 1.2475424486148348, "grad_norm": 0.024229146540164948, "learning_rate": 9.333244761718157e-05, "loss": 0.0227, "step": 1745 }, { "epoch": 1.2475424486148348, "eval_loss": 0.011855129152536392, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1745 }, { "epoch": 1.2482573726541555, "grad_norm": 0.01635158061981201, "learning_rate": 9.331998474793886e-05, "loss": 0.0131, "step": 1746 }, { "epoch": 1.2489722966934762, "grad_norm": 0.012657404877245426, "learning_rate": 9.330751107558957e-05, "loss": 0.0207, "step": 1747 }, { "epoch": 1.2496872207327971, "grad_norm": 0.01830303482711315, "learning_rate": 9.329502660324437e-05, "loss": 0.0095, "step": 1748 }, { "epoch": 1.250402144772118, "grad_norm": 0.0198366716504097, "learning_rate": 9.328253133401663e-05, "loss": 0.016, "step": 1749 }, { "epoch": 1.2511170688114388, "grad_norm": 0.018894344568252563, "learning_rate": 9.32700252710224e-05, "loss": 0.0293, "step": 1750 }, { "epoch": 1.2511170688114388, "eval_loss": 0.01190108060836792, "eval_runtime": 4.5918, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 1750 }, { "epoch": 1.2518319928507595, "grad_norm": 0.014944967813789845, "learning_rate": 9.325750841738044e-05, "loss": 0.0185, "step": 1751 }, { "epoch": 1.2525469168900805, "grad_norm": 0.017233451828360558, "learning_rate": 9.32449807762122e-05, "loss": 0.0136, "step": 1752 }, { "epoch": 1.2532618409294012, "grad_norm": 0.0239251796156168, "learning_rate": 9.323244235064181e-05, "loss": 0.0165, "step": 1753 }, { "epoch": 1.2539767649687221, "grad_norm": 0.015247102826833725, "learning_rate": 9.32198931437961e-05, "loss": 0.0115, "step": 1754 }, { "epoch": 1.2546916890080428, "grad_norm": 0.019646815955638885, "learning_rate": 9.320733315880454e-05, "loss": 0.0229, "step": 1755 }, { "epoch": 1.2546916890080428, "eval_loss": 0.012085754424333572, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1755 }, { "epoch": 1.2554066130473638, "grad_norm": 0.02058582566678524, "learning_rate": 9.319476239879941e-05, "loss": 0.0242, "step": 1756 }, { "epoch": 1.2561215370866845, "grad_norm": 0.01927986554801464, "learning_rate": 9.318218086691552e-05, "loss": 0.0158, "step": 1757 }, { "epoch": 1.2568364611260054, "grad_norm": 0.017630359157919884, "learning_rate": 9.316958856629047e-05, "loss": 0.0194, "step": 1758 }, { "epoch": 1.2575513851653262, "grad_norm": 0.014673000201582909, "learning_rate": 9.315698550006456e-05, "loss": 0.0128, "step": 1759 }, { "epoch": 1.258266309204647, "grad_norm": 0.013908402062952518, "learning_rate": 
9.314437167138066e-05, "loss": 0.0141, "step": 1760 }, { "epoch": 1.258266309204647, "eval_loss": 0.011922130361199379, "eval_runtime": 4.5917, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 1760 }, { "epoch": 1.2589812332439678, "grad_norm": 0.015877630561590195, "learning_rate": 9.313174708338445e-05, "loss": 0.0131, "step": 1761 }, { "epoch": 1.2596961572832885, "grad_norm": 0.01482989639043808, "learning_rate": 9.311911173922422e-05, "loss": 0.0144, "step": 1762 }, { "epoch": 1.2604110813226095, "grad_norm": 0.014950626529753208, "learning_rate": 9.310646564205099e-05, "loss": 0.0194, "step": 1763 }, { "epoch": 1.2611260053619304, "grad_norm": 0.021647939458489418, "learning_rate": 9.309380879501838e-05, "loss": 0.0173, "step": 1764 }, { "epoch": 1.2618409294012511, "grad_norm": 0.014894971624016762, "learning_rate": 9.30811412012828e-05, "loss": 0.0106, "step": 1765 }, { "epoch": 1.2618409294012511, "eval_loss": 0.012031502090394497, "eval_runtime": 4.6146, "eval_samples_per_second": 10.835, "eval_steps_per_second": 2.817, "step": 1765 }, { "epoch": 1.2625558534405719, "grad_norm": 0.015666179358959198, "learning_rate": 9.306846286400325e-05, "loss": 0.0212, "step": 1766 }, { "epoch": 1.2632707774798928, "grad_norm": 0.017222287133336067, "learning_rate": 9.305577378634148e-05, "loss": 0.0168, "step": 1767 }, { "epoch": 1.2639857015192135, "grad_norm": 0.014588217251002789, "learning_rate": 9.304307397146183e-05, "loss": 0.0193, "step": 1768 }, { "epoch": 1.2647006255585345, "grad_norm": 0.01631580851972103, "learning_rate": 9.303036342253143e-05, "loss": 0.0136, "step": 1769 }, { "epoch": 1.2654155495978552, "grad_norm": 0.021799737587571144, "learning_rate": 9.301764214272e-05, "loss": 0.0174, "step": 1770 }, { "epoch": 1.2654155495978552, "eval_loss": 0.012063792906701565, "eval_runtime": 4.5888, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 1770 }, { "epoch": 1.2661304736371761, "grad_norm": 0.02087181806564331, "learning_rate": 9.300491013519995e-05, "loss": 0.0202, "step": 1771 }, { "epoch": 1.2668453976764968, "grad_norm": 0.01615147665143013, "learning_rate": 9.299216740314638e-05, "loss": 0.009, "step": 1772 }, { "epoch": 1.2675603217158176, "grad_norm": 0.014048406854271889, "learning_rate": 9.29794139497371e-05, "loss": 0.0221, "step": 1773 }, { "epoch": 1.2682752457551385, "grad_norm": 0.015677455812692642, "learning_rate": 9.296664977815251e-05, "loss": 0.0184, "step": 1774 }, { "epoch": 1.2689901697944594, "grad_norm": 0.014919698238372803, "learning_rate": 9.295387489157576e-05, "loss": 0.0179, "step": 1775 }, { "epoch": 1.2689901697944594, "eval_loss": 0.011966388672590256, "eval_runtime": 4.6123, "eval_samples_per_second": 10.841, "eval_steps_per_second": 2.819, "step": 1775 }, { "epoch": 1.2697050938337802, "grad_norm": 0.016065487638115883, "learning_rate": 9.294108929319265e-05, "loss": 0.0134, "step": 1776 }, { "epoch": 1.2704200178731009, "grad_norm": 0.01877816952764988, "learning_rate": 9.292829298619161e-05, "loss": 0.0155, "step": 1777 }, { "epoch": 1.2711349419124218, "grad_norm": 0.01562882587313652, "learning_rate": 9.291548597376381e-05, "loss": 0.0189, "step": 1778 }, { "epoch": 1.2718498659517428, "grad_norm": 0.012481979094445705, "learning_rate": 9.290266825910301e-05, "loss": 0.0155, "step": 1779 }, { "epoch": 1.2725647899910635, "grad_norm": 0.015270586125552654, "learning_rate": 9.288983984540573e-05, "loss": 0.0113, "step": 1780 }, { "epoch": 1.2725647899910635, "eval_loss": 
0.01167234592139721, "eval_runtime": 4.5899, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 1780 }, { "epoch": 1.2732797140303842, "grad_norm": 0.016960715875029564, "learning_rate": 9.287700073587107e-05, "loss": 0.016, "step": 1781 }, { "epoch": 1.2739946380697051, "grad_norm": 0.01905551739037037, "learning_rate": 9.286415093370086e-05, "loss": 0.0186, "step": 1782 }, { "epoch": 1.2747095621090259, "grad_norm": 0.019002897664904594, "learning_rate": 9.285129044209958e-05, "loss": 0.0114, "step": 1783 }, { "epoch": 1.2754244861483468, "grad_norm": 0.018628934398293495, "learning_rate": 9.283841926427436e-05, "loss": 0.0204, "step": 1784 }, { "epoch": 1.2761394101876675, "grad_norm": 0.014888442121446133, "learning_rate": 9.2825537403435e-05, "loss": 0.0126, "step": 1785 }, { "epoch": 1.2761394101876675, "eval_loss": 0.011813163757324219, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1785 }, { "epoch": 1.2768543342269885, "grad_norm": 0.01622309535741806, "learning_rate": 9.281264486279397e-05, "loss": 0.0286, "step": 1786 }, { "epoch": 1.2775692582663092, "grad_norm": 0.01572924107313156, "learning_rate": 9.279974164556642e-05, "loss": 0.0128, "step": 1787 }, { "epoch": 1.27828418230563, "grad_norm": 0.02424464374780655, "learning_rate": 9.278682775497012e-05, "loss": 0.0226, "step": 1788 }, { "epoch": 1.2789991063449508, "grad_norm": 0.018856225535273552, "learning_rate": 9.277390319422555e-05, "loss": 0.0201, "step": 1789 }, { "epoch": 1.2797140303842718, "grad_norm": 0.013787390664219856, "learning_rate": 9.27609679665558e-05, "loss": 0.0152, "step": 1790 }, { "epoch": 1.2797140303842718, "eval_loss": 0.011821035295724869, "eval_runtime": 4.5973, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 1790 }, { "epoch": 1.2804289544235925, "grad_norm": 0.01729242131114006, "learning_rate": 9.274802207518666e-05, "loss": 0.0159, "step": 1791 }, { "epoch": 1.2811438784629132, "grad_norm": 0.013696386478841305, "learning_rate": 9.27350655233466e-05, "loss": 0.0127, "step": 1792 }, { "epoch": 1.2818588025022342, "grad_norm": 0.018067287281155586, "learning_rate": 9.272209831426666e-05, "loss": 0.0164, "step": 1793 }, { "epoch": 1.2825737265415549, "grad_norm": 0.013784377835690975, "learning_rate": 9.270912045118064e-05, "loss": 0.0176, "step": 1794 }, { "epoch": 1.2832886505808758, "grad_norm": 0.012141223065555096, "learning_rate": 9.269613193732492e-05, "loss": 0.0104, "step": 1795 }, { "epoch": 1.2832886505808758, "eval_loss": 0.011751257814466953, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 1795 }, { "epoch": 1.2840035746201965, "grad_norm": 0.011855410411953926, "learning_rate": 9.268313277593858e-05, "loss": 0.0091, "step": 1796 }, { "epoch": 1.2847184986595175, "grad_norm": 0.01384701207280159, "learning_rate": 9.267012297026334e-05, "loss": 0.013, "step": 1797 }, { "epoch": 1.2854334226988382, "grad_norm": 0.015168161131441593, "learning_rate": 9.265710252354359e-05, "loss": 0.0111, "step": 1798 }, { "epoch": 1.2861483467381591, "grad_norm": 0.014260455034673214, "learning_rate": 9.264407143902631e-05, "loss": 0.0183, "step": 1799 }, { "epoch": 1.2868632707774799, "grad_norm": 0.01335203368216753, "learning_rate": 9.263102971996122e-05, "loss": 0.0132, "step": 1800 }, { "epoch": 1.2868632707774799, "eval_loss": 0.01164223812520504, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 
1800 }, { "epoch": 1.2875781948168008, "grad_norm": 0.014335998333990574, "learning_rate": 9.261797736960067e-05, "loss": 0.0247, "step": 1801 }, { "epoch": 1.2882931188561215, "grad_norm": 0.020886652171611786, "learning_rate": 9.260491439119962e-05, "loss": 0.018, "step": 1802 }, { "epoch": 1.2890080428954422, "grad_norm": 0.014875601045787334, "learning_rate": 9.259184078801572e-05, "loss": 0.0224, "step": 1803 }, { "epoch": 1.2897229669347632, "grad_norm": 0.016404112800955772, "learning_rate": 9.257875656330923e-05, "loss": 0.0147, "step": 1804 }, { "epoch": 1.2904378909740841, "grad_norm": 0.018204104155302048, "learning_rate": 9.256566172034312e-05, "loss": 0.0181, "step": 1805 }, { "epoch": 1.2904378909740841, "eval_loss": 0.011937999166548252, "eval_runtime": 4.5903, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 1805 }, { "epoch": 1.2911528150134048, "grad_norm": 0.014005713164806366, "learning_rate": 9.255255626238293e-05, "loss": 0.0075, "step": 1806 }, { "epoch": 1.2918677390527256, "grad_norm": 0.01768062636256218, "learning_rate": 9.253944019269694e-05, "loss": 0.0157, "step": 1807 }, { "epoch": 1.2925826630920465, "grad_norm": 0.018946794793009758, "learning_rate": 9.252631351455599e-05, "loss": 0.016, "step": 1808 }, { "epoch": 1.2932975871313672, "grad_norm": 0.014932164922356606, "learning_rate": 9.251317623123363e-05, "loss": 0.0178, "step": 1809 }, { "epoch": 1.2940125111706882, "grad_norm": 0.015573672950267792, "learning_rate": 9.250002834600599e-05, "loss": 0.0184, "step": 1810 }, { "epoch": 1.2940125111706882, "eval_loss": 0.011880642734467983, "eval_runtime": 4.5857, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 1810 }, { "epoch": 1.2947274352100089, "grad_norm": 0.017628870904445648, "learning_rate": 9.24868698621519e-05, "loss": 0.0197, "step": 1811 }, { "epoch": 1.2954423592493298, "grad_norm": 0.017341692000627518, "learning_rate": 9.24737007829528e-05, "loss": 0.0171, "step": 1812 }, { "epoch": 1.2961572832886505, "grad_norm": 0.014346321113407612, "learning_rate": 9.246052111169284e-05, "loss": 0.009, "step": 1813 }, { "epoch": 1.2968722073279715, "grad_norm": 0.013895530253648758, "learning_rate": 9.244733085165867e-05, "loss": 0.0167, "step": 1814 }, { "epoch": 1.2975871313672922, "grad_norm": 0.01628001220524311, "learning_rate": 9.243413000613974e-05, "loss": 0.01, "step": 1815 }, { "epoch": 1.2975871313672922, "eval_loss": 0.011738999746739864, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1815 }, { "epoch": 1.2983020554066131, "grad_norm": 0.013662748038768768, "learning_rate": 9.242091857842803e-05, "loss": 0.0116, "step": 1816 }, { "epoch": 1.2990169794459339, "grad_norm": 0.020450618118047714, "learning_rate": 9.240769657181821e-05, "loss": 0.0196, "step": 1817 }, { "epoch": 1.2997319034852546, "grad_norm": 0.019125403836369514, "learning_rate": 9.239446398960756e-05, "loss": 0.0196, "step": 1818 }, { "epoch": 1.3004468275245755, "grad_norm": 0.016843922436237335, "learning_rate": 9.238122083509601e-05, "loss": 0.0169, "step": 1819 }, { "epoch": 1.3011617515638965, "grad_norm": 0.01072420459240675, "learning_rate": 9.236796711158617e-05, "loss": 0.0133, "step": 1820 }, { "epoch": 1.3011617515638965, "eval_loss": 0.011567522771656513, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1820 }, { "epoch": 1.3018766756032172, "grad_norm": 0.013626400381326675, "learning_rate": 
9.235470282238321e-05, "loss": 0.0194, "step": 1821 }, { "epoch": 1.302591599642538, "grad_norm": 0.015122946351766586, "learning_rate": 9.234142797079495e-05, "loss": 0.0137, "step": 1822 }, { "epoch": 1.3033065236818588, "grad_norm": 0.018230149522423744, "learning_rate": 9.232814256013191e-05, "loss": 0.0119, "step": 1823 }, { "epoch": 1.3040214477211796, "grad_norm": 0.017279069870710373, "learning_rate": 9.231484659370716e-05, "loss": 0.0196, "step": 1824 }, { "epoch": 1.3047363717605005, "grad_norm": 0.01647294871509075, "learning_rate": 9.230154007483645e-05, "loss": 0.0127, "step": 1825 }, { "epoch": 1.3047363717605005, "eval_loss": 0.011497761122882366, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 1825 }, { "epoch": 1.3054512957998212, "grad_norm": 0.012661582790315151, "learning_rate": 9.228822300683816e-05, "loss": 0.0151, "step": 1826 }, { "epoch": 1.3061662198391422, "grad_norm": 0.015625691041350365, "learning_rate": 9.227489539303328e-05, "loss": 0.0173, "step": 1827 }, { "epoch": 1.3068811438784629, "grad_norm": 0.01583274081349373, "learning_rate": 9.226155723674544e-05, "loss": 0.0147, "step": 1828 }, { "epoch": 1.3075960679177836, "grad_norm": 0.01221760455518961, "learning_rate": 9.22482085413009e-05, "loss": 0.0151, "step": 1829 }, { "epoch": 1.3083109919571045, "grad_norm": 0.015242958441376686, "learning_rate": 9.223484931002855e-05, "loss": 0.0199, "step": 1830 }, { "epoch": 1.3083109919571045, "eval_loss": 0.011421307921409607, "eval_runtime": 4.6118, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 1830 }, { "epoch": 1.3090259159964255, "grad_norm": 0.01667707785964012, "learning_rate": 9.222147954625992e-05, "loss": 0.0235, "step": 1831 }, { "epoch": 1.3097408400357462, "grad_norm": 0.017203181982040405, "learning_rate": 9.220809925332912e-05, "loss": 0.0323, "step": 1832 }, { "epoch": 1.310455764075067, "grad_norm": 0.0178048238158226, "learning_rate": 9.219470843457295e-05, "loss": 0.0121, "step": 1833 }, { "epoch": 1.3111706881143879, "grad_norm": 0.013856746256351471, "learning_rate": 9.218130709333078e-05, "loss": 0.0172, "step": 1834 }, { "epoch": 1.3118856121537088, "grad_norm": 0.01233347225934267, "learning_rate": 9.216789523294462e-05, "loss": 0.0143, "step": 1835 }, { "epoch": 1.3118856121537088, "eval_loss": 0.011548562906682491, "eval_runtime": 4.5997, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 1835 }, { "epoch": 1.3126005361930295, "grad_norm": 0.013912701047956944, "learning_rate": 9.215447285675916e-05, "loss": 0.0136, "step": 1836 }, { "epoch": 1.3133154602323502, "grad_norm": 0.01882079802453518, "learning_rate": 9.21410399681216e-05, "loss": 0.0159, "step": 1837 }, { "epoch": 1.3140303842716712, "grad_norm": 0.014149632304906845, "learning_rate": 9.212759657038186e-05, "loss": 0.0209, "step": 1838 }, { "epoch": 1.314745308310992, "grad_norm": 0.014017676003277302, "learning_rate": 9.211414266689244e-05, "loss": 0.0162, "step": 1839 }, { "epoch": 1.3154602323503128, "grad_norm": 0.021279355511069298, "learning_rate": 9.210067826100845e-05, "loss": 0.0179, "step": 1840 }, { "epoch": 1.3154602323503128, "eval_loss": 0.011814175173640251, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 1840 }, { "epoch": 1.3161751563896336, "grad_norm": 0.01902267336845398, "learning_rate": 9.208720335608767e-05, "loss": 0.0192, "step": 1841 }, { "epoch": 1.3168900804289545, "grad_norm": 
0.013809367083013058, "learning_rate": 9.207371795549043e-05, "loss": 0.0141, "step": 1842 }, { "epoch": 1.3176050044682752, "grad_norm": 0.017548970878124237, "learning_rate": 9.206022206257969e-05, "loss": 0.0156, "step": 1843 }, { "epoch": 1.318319928507596, "grad_norm": 0.014615927822887897, "learning_rate": 9.20467156807211e-05, "loss": 0.0175, "step": 1844 }, { "epoch": 1.3190348525469169, "grad_norm": 0.012144117616117, "learning_rate": 9.203319881328283e-05, "loss": 0.0199, "step": 1845 }, { "epoch": 1.3190348525469169, "eval_loss": 0.011700903065502644, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 1845 }, { "epoch": 1.3197497765862378, "grad_norm": 0.014470825903117657, "learning_rate": 9.201967146363572e-05, "loss": 0.0139, "step": 1846 }, { "epoch": 1.3204647006255585, "grad_norm": 0.014527374878525734, "learning_rate": 9.200613363515324e-05, "loss": 0.0184, "step": 1847 }, { "epoch": 1.3211796246648793, "grad_norm": 0.015544300898909569, "learning_rate": 9.199258533121141e-05, "loss": 0.0115, "step": 1848 }, { "epoch": 1.3218945487042002, "grad_norm": 0.018379027023911476, "learning_rate": 9.197902655518887e-05, "loss": 0.0174, "step": 1849 }, { "epoch": 1.322609472743521, "grad_norm": 0.015981441363692284, "learning_rate": 9.196545731046695e-05, "loss": 0.0157, "step": 1850 }, { "epoch": 1.322609472743521, "eval_loss": 0.011521095409989357, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 1850 }, { "epoch": 1.3233243967828419, "grad_norm": 0.016257118433713913, "learning_rate": 9.195187760042952e-05, "loss": 0.0138, "step": 1851 }, { "epoch": 1.3240393208221626, "grad_norm": 0.013657630421221256, "learning_rate": 9.193828742846307e-05, "loss": 0.017, "step": 1852 }, { "epoch": 1.3247542448614835, "grad_norm": 0.01355596724897623, "learning_rate": 9.192468679795672e-05, "loss": 0.0152, "step": 1853 }, { "epoch": 1.3254691689008042, "grad_norm": 0.014832817018032074, "learning_rate": 9.191107571230217e-05, "loss": 0.0126, "step": 1854 }, { "epoch": 1.3261840929401252, "grad_norm": 0.015886304900050163, "learning_rate": 9.189745417489377e-05, "loss": 0.0187, "step": 1855 }, { "epoch": 1.3261840929401252, "eval_loss": 0.011821768246591091, "eval_runtime": 4.5792, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 1855 }, { "epoch": 1.326899016979446, "grad_norm": 0.014256157912313938, "learning_rate": 9.188382218912843e-05, "loss": 0.0116, "step": 1856 }, { "epoch": 1.3276139410187668, "grad_norm": 0.015055247582495213, "learning_rate": 9.187017975840568e-05, "loss": 0.0104, "step": 1857 }, { "epoch": 1.3283288650580876, "grad_norm": 0.013340744189918041, "learning_rate": 9.185652688612766e-05, "loss": 0.019, "step": 1858 }, { "epoch": 1.3290437890974083, "grad_norm": 0.019200781360268593, "learning_rate": 9.184286357569913e-05, "loss": 0.014, "step": 1859 }, { "epoch": 1.3297587131367292, "grad_norm": 0.014194277115166187, "learning_rate": 9.182918983052741e-05, "loss": 0.0137, "step": 1860 }, { "epoch": 1.3297587131367292, "eval_loss": 0.011690263636410236, "eval_runtime": 4.5915, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 1860 }, { "epoch": 1.3304736371760502, "grad_norm": 0.013297360390424728, "learning_rate": 9.181550565402248e-05, "loss": 0.0118, "step": 1861 }, { "epoch": 1.3311885612153709, "grad_norm": 0.014585437253117561, "learning_rate": 9.180181104959686e-05, "loss": 0.0174, "step": 1862 }, { "epoch": 
1.3319034852546916, "grad_norm": 0.01787499338388443, "learning_rate": 9.178810602066574e-05, "loss": 0.0134, "step": 1863 }, { "epoch": 1.3326184092940125, "grad_norm": 0.017528170719742775, "learning_rate": 9.177439057064683e-05, "loss": 0.0134, "step": 1864 }, { "epoch": 1.3333333333333333, "grad_norm": 0.016679806634783745, "learning_rate": 9.17606647029605e-05, "loss": 0.0121, "step": 1865 }, { "epoch": 1.3333333333333333, "eval_loss": 0.011608355678617954, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 1865 }, { "epoch": 1.3340482573726542, "grad_norm": 0.014923786744475365, "learning_rate": 9.174692842102967e-05, "loss": 0.018, "step": 1866 }, { "epoch": 1.334763181411975, "grad_norm": 0.013092340901494026, "learning_rate": 9.173318172827994e-05, "loss": 0.0121, "step": 1867 }, { "epoch": 1.3354781054512959, "grad_norm": 0.017735406756401062, "learning_rate": 9.171942462813938e-05, "loss": 0.0174, "step": 1868 }, { "epoch": 1.3361930294906166, "grad_norm": 0.015352952294051647, "learning_rate": 9.17056571240388e-05, "loss": 0.0115, "step": 1869 }, { "epoch": 1.3369079535299375, "grad_norm": 0.01527327112853527, "learning_rate": 9.169187921941147e-05, "loss": 0.022, "step": 1870 }, { "epoch": 1.3369079535299375, "eval_loss": 0.01156847644597292, "eval_runtime": 4.5964, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 1870 }, { "epoch": 1.3376228775692582, "grad_norm": 0.01310166995972395, "learning_rate": 9.167809091769332e-05, "loss": 0.0117, "step": 1871 }, { "epoch": 1.3383378016085792, "grad_norm": 0.01689414493739605, "learning_rate": 9.166429222232291e-05, "loss": 0.0267, "step": 1872 }, { "epoch": 1.3390527256479, "grad_norm": 0.015577950514853, "learning_rate": 9.165048313674131e-05, "loss": 0.0134, "step": 1873 }, { "epoch": 1.3397676496872206, "grad_norm": 0.019599517807364464, "learning_rate": 9.163666366439223e-05, "loss": 0.022, "step": 1874 }, { "epoch": 1.3404825737265416, "grad_norm": 0.013409187085926533, "learning_rate": 9.162283380872196e-05, "loss": 0.0115, "step": 1875 }, { "epoch": 1.3404825737265416, "eval_loss": 0.01150159165263176, "eval_runtime": 4.5897, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 1875 }, { "epoch": 1.3411974977658625, "grad_norm": 0.016599660739302635, "learning_rate": 9.160899357317939e-05, "loss": 0.0206, "step": 1876 }, { "epoch": 1.3419124218051832, "grad_norm": 0.013915888033807278, "learning_rate": 9.159514296121598e-05, "loss": 0.016, "step": 1877 }, { "epoch": 1.342627345844504, "grad_norm": 0.013962941244244576, "learning_rate": 9.158128197628578e-05, "loss": 0.0165, "step": 1878 }, { "epoch": 1.3433422698838249, "grad_norm": 0.014668694697320461, "learning_rate": 9.156741062184543e-05, "loss": 0.0156, "step": 1879 }, { "epoch": 1.3440571939231456, "grad_norm": 0.016311436891555786, "learning_rate": 9.155352890135417e-05, "loss": 0.0169, "step": 1880 }, { "epoch": 1.3440571939231456, "eval_loss": 0.011227770708501339, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 1880 }, { "epoch": 1.3447721179624665, "grad_norm": 0.015355485491454601, "learning_rate": 9.15396368182738e-05, "loss": 0.0132, "step": 1881 }, { "epoch": 1.3454870420017873, "grad_norm": 0.016225818544626236, "learning_rate": 9.152573437606873e-05, "loss": 0.0138, "step": 1882 }, { "epoch": 1.3462019660411082, "grad_norm": 0.020786665380001068, "learning_rate": 9.151182157820595e-05, "loss": 0.0103, "step": 1883 }, 
{ "epoch": 1.346916890080429, "grad_norm": 0.013367709703743458, "learning_rate": 9.1497898428155e-05, "loss": 0.0119, "step": 1884 }, { "epoch": 1.3476318141197496, "grad_norm": 0.017688676714897156, "learning_rate": 9.148396492938804e-05, "loss": 0.0148, "step": 1885 }, { "epoch": 1.3476318141197496, "eval_loss": 0.0114684933796525, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 1885 }, { "epoch": 1.3483467381590706, "grad_norm": 0.014089971780776978, "learning_rate": 9.147002108537978e-05, "loss": 0.0153, "step": 1886 }, { "epoch": 1.3490616621983915, "grad_norm": 0.012943820096552372, "learning_rate": 9.145606689960756e-05, "loss": 0.0091, "step": 1887 }, { "epoch": 1.3497765862377122, "grad_norm": 0.015198432840406895, "learning_rate": 9.144210237555123e-05, "loss": 0.0109, "step": 1888 }, { "epoch": 1.350491510277033, "grad_norm": 0.013590089976787567, "learning_rate": 9.142812751669327e-05, "loss": 0.0122, "step": 1889 }, { "epoch": 1.351206434316354, "grad_norm": 0.01936030387878418, "learning_rate": 9.141414232651871e-05, "loss": 0.0186, "step": 1890 }, { "epoch": 1.351206434316354, "eval_loss": 0.011307029984891415, "eval_runtime": 4.6153, "eval_samples_per_second": 10.833, "eval_steps_per_second": 2.817, "step": 1890 }, { "epoch": 1.3519213583556748, "grad_norm": 0.012915239669382572, "learning_rate": 9.140014680851516e-05, "loss": 0.0119, "step": 1891 }, { "epoch": 1.3526362823949956, "grad_norm": 0.012782033532857895, "learning_rate": 9.138614096617284e-05, "loss": 0.0183, "step": 1892 }, { "epoch": 1.3533512064343163, "grad_norm": 0.01408474426716566, "learning_rate": 9.137212480298451e-05, "loss": 0.0214, "step": 1893 }, { "epoch": 1.3540661304736372, "grad_norm": 0.012427760288119316, "learning_rate": 9.135809832244548e-05, "loss": 0.0146, "step": 1894 }, { "epoch": 1.354781054512958, "grad_norm": 0.01665169559419155, "learning_rate": 9.13440615280537e-05, "loss": 0.0146, "step": 1895 }, { "epoch": 1.354781054512958, "eval_loss": 0.011184586212038994, "eval_runtime": 4.5975, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 1895 }, { "epoch": 1.3554959785522789, "grad_norm": 0.016715485602617264, "learning_rate": 9.133001442330964e-05, "loss": 0.0161, "step": 1896 }, { "epoch": 1.3562109025915996, "grad_norm": 0.02028093859553337, "learning_rate": 9.131595701171636e-05, "loss": 0.0185, "step": 1897 }, { "epoch": 1.3569258266309205, "grad_norm": 0.016979552805423737, "learning_rate": 9.130188929677947e-05, "loss": 0.0129, "step": 1898 }, { "epoch": 1.3576407506702413, "grad_norm": 0.01528275478631258, "learning_rate": 9.12878112820072e-05, "loss": 0.0208, "step": 1899 }, { "epoch": 1.358355674709562, "grad_norm": 0.01477435790002346, "learning_rate": 9.127372297091028e-05, "loss": 0.0137, "step": 1900 }, { "epoch": 1.358355674709562, "eval_loss": 0.011302005499601364, "eval_runtime": 4.5895, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 1900 }, { "epoch": 1.359070598748883, "grad_norm": 0.015357387252151966, "learning_rate": 9.125962436700207e-05, "loss": 0.0193, "step": 1901 }, { "epoch": 1.3597855227882039, "grad_norm": 0.015108560211956501, "learning_rate": 9.124551547379846e-05, "loss": 0.0119, "step": 1902 }, { "epoch": 1.3605004468275246, "grad_norm": 0.015022053383290768, "learning_rate": 9.123139629481792e-05, "loss": 0.0162, "step": 1903 }, { "epoch": 1.3612153708668453, "grad_norm": 0.014919208362698555, "learning_rate": 9.121726683358147e-05, "loss": 0.0218, 
"step": 1904 }, { "epoch": 1.3619302949061662, "grad_norm": 0.013417830690741539, "learning_rate": 9.120312709361271e-05, "loss": 0.0155, "step": 1905 }, { "epoch": 1.3619302949061662, "eval_loss": 0.01127215102314949, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 1905 }, { "epoch": 1.3626452189454872, "grad_norm": 0.01633331924676895, "learning_rate": 9.118897707843779e-05, "loss": 0.0171, "step": 1906 }, { "epoch": 1.363360142984808, "grad_norm": 0.014892606995999813, "learning_rate": 9.117481679158546e-05, "loss": 0.0264, "step": 1907 }, { "epoch": 1.3640750670241286, "grad_norm": 0.017210904508829117, "learning_rate": 9.116064623658695e-05, "loss": 0.0107, "step": 1908 }, { "epoch": 1.3647899910634496, "grad_norm": 0.016317181289196014, "learning_rate": 9.114646541697617e-05, "loss": 0.0184, "step": 1909 }, { "epoch": 1.3655049151027703, "grad_norm": 0.016119543462991714, "learning_rate": 9.113227433628947e-05, "loss": 0.0167, "step": 1910 }, { "epoch": 1.3655049151027703, "eval_loss": 0.011392368003726006, "eval_runtime": 4.6083, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 1910 }, { "epoch": 1.3662198391420912, "grad_norm": 0.013834879733622074, "learning_rate": 9.111807299806583e-05, "loss": 0.0163, "step": 1911 }, { "epoch": 1.366934763181412, "grad_norm": 0.020246438682079315, "learning_rate": 9.110386140584677e-05, "loss": 0.0131, "step": 1912 }, { "epoch": 1.3676496872207329, "grad_norm": 0.01670546643435955, "learning_rate": 9.108963956317635e-05, "loss": 0.0162, "step": 1913 }, { "epoch": 1.3683646112600536, "grad_norm": 0.011867700144648552, "learning_rate": 9.107540747360124e-05, "loss": 0.0101, "step": 1914 }, { "epoch": 1.3690795352993743, "grad_norm": 0.014327184297144413, "learning_rate": 9.10611651406706e-05, "loss": 0.0172, "step": 1915 }, { "epoch": 1.3690795352993743, "eval_loss": 0.01119151059538126, "eval_runtime": 4.5871, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 1915 }, { "epoch": 1.3697944593386953, "grad_norm": 0.02841680496931076, "learning_rate": 9.104691256793617e-05, "loss": 0.0263, "step": 1916 }, { "epoch": 1.3705093833780162, "grad_norm": 0.014524148777127266, "learning_rate": 9.103264975895225e-05, "loss": 0.0129, "step": 1917 }, { "epoch": 1.371224307417337, "grad_norm": 0.016533024609088898, "learning_rate": 9.101837671727572e-05, "loss": 0.016, "step": 1918 }, { "epoch": 1.3719392314566576, "grad_norm": 0.01556132361292839, "learning_rate": 9.100409344646594e-05, "loss": 0.015, "step": 1919 }, { "epoch": 1.3726541554959786, "grad_norm": 0.018488813191652298, "learning_rate": 9.098979995008486e-05, "loss": 0.0192, "step": 1920 }, { "epoch": 1.3726541554959786, "eval_loss": 0.01108295377343893, "eval_runtime": 4.6226, "eval_samples_per_second": 10.816, "eval_steps_per_second": 2.812, "step": 1920 }, { "epoch": 1.3733690795352993, "grad_norm": 0.014207213185727596, "learning_rate": 9.097549623169701e-05, "loss": 0.0108, "step": 1921 }, { "epoch": 1.3740840035746202, "grad_norm": 0.01374024897813797, "learning_rate": 9.096118229486944e-05, "loss": 0.0159, "step": 1922 }, { "epoch": 1.374798927613941, "grad_norm": 0.0166142750531435, "learning_rate": 9.094685814317174e-05, "loss": 0.0118, "step": 1923 }, { "epoch": 1.375513851653262, "grad_norm": 0.015591590665280819, "learning_rate": 9.093252378017604e-05, "loss": 0.0206, "step": 1924 }, { "epoch": 1.3762287756925826, "grad_norm": 0.014636357314884663, "learning_rate": 9.091817920945705e-05, 
"loss": 0.021, "step": 1925 }, { "epoch": 1.3762287756925826, "eval_loss": 0.010993588715791702, "eval_runtime": 4.5948, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 1925 }, { "epoch": 1.3769436997319036, "grad_norm": 0.016621237620711327, "learning_rate": 9.090382443459201e-05, "loss": 0.0163, "step": 1926 }, { "epoch": 1.3776586237712243, "grad_norm": 0.016196174547076225, "learning_rate": 9.08894594591607e-05, "loss": 0.0161, "step": 1927 }, { "epoch": 1.3783735478105452, "grad_norm": 0.01956063136458397, "learning_rate": 9.087508428674544e-05, "loss": 0.029, "step": 1928 }, { "epoch": 1.379088471849866, "grad_norm": 0.016074247658252716, "learning_rate": 9.086069892093113e-05, "loss": 0.0172, "step": 1929 }, { "epoch": 1.3798033958891867, "grad_norm": 0.01350831612944603, "learning_rate": 9.084630336530516e-05, "loss": 0.0112, "step": 1930 }, { "epoch": 1.3798033958891867, "eval_loss": 0.011115353554487228, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1930 }, { "epoch": 1.3805183199285076, "grad_norm": 0.01469758152961731, "learning_rate": 9.083189762345745e-05, "loss": 0.0223, "step": 1931 }, { "epoch": 1.3812332439678285, "grad_norm": 0.017386112362146378, "learning_rate": 9.081748169898054e-05, "loss": 0.0185, "step": 1932 }, { "epoch": 1.3819481680071493, "grad_norm": 0.016051054000854492, "learning_rate": 9.080305559546946e-05, "loss": 0.0142, "step": 1933 }, { "epoch": 1.38266309204647, "grad_norm": 0.017745232209563255, "learning_rate": 9.078861931652177e-05, "loss": 0.0206, "step": 1934 }, { "epoch": 1.383378016085791, "grad_norm": 0.014346342533826828, "learning_rate": 9.077417286573759e-05, "loss": 0.0095, "step": 1935 }, { "epoch": 1.383378016085791, "eval_loss": 0.01104192528873682, "eval_runtime": 4.5917, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 1935 }, { "epoch": 1.3840929401251116, "grad_norm": 0.014919226057827473, "learning_rate": 9.075971624671953e-05, "loss": 0.0195, "step": 1936 }, { "epoch": 1.3848078641644326, "grad_norm": 0.013273178599774837, "learning_rate": 9.074524946307281e-05, "loss": 0.0157, "step": 1937 }, { "epoch": 1.3855227882037533, "grad_norm": 0.014806984923779964, "learning_rate": 9.073077251840513e-05, "loss": 0.0156, "step": 1938 }, { "epoch": 1.3862377122430742, "grad_norm": 0.018936289474368095, "learning_rate": 9.071628541632675e-05, "loss": 0.0172, "step": 1939 }, { "epoch": 1.386952636282395, "grad_norm": 0.01563047245144844, "learning_rate": 9.070178816045044e-05, "loss": 0.0132, "step": 1940 }, { "epoch": 1.386952636282395, "eval_loss": 0.01113426685333252, "eval_runtime": 4.6104, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 1940 }, { "epoch": 1.387667560321716, "grad_norm": 0.015089408494532108, "learning_rate": 9.068728075439152e-05, "loss": 0.0132, "step": 1941 }, { "epoch": 1.3883824843610366, "grad_norm": 0.01350607629865408, "learning_rate": 9.067276320176783e-05, "loss": 0.0112, "step": 1942 }, { "epoch": 1.3890974084003576, "grad_norm": 0.013798077590763569, "learning_rate": 9.065823550619976e-05, "loss": 0.0117, "step": 1943 }, { "epoch": 1.3898123324396783, "grad_norm": 0.02177920937538147, "learning_rate": 9.064369767131022e-05, "loss": 0.0258, "step": 1944 }, { "epoch": 1.390527256478999, "grad_norm": 0.014415907673537731, "learning_rate": 9.062914970072462e-05, "loss": 0.0124, "step": 1945 }, { "epoch": 1.390527256478999, "eval_loss": 0.011014740914106369, "eval_runtime": 4.5845, 
"eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 1945 }, { "epoch": 1.39124218051832, "grad_norm": 0.01951115019619465, "learning_rate": 9.061459159807096e-05, "loss": 0.014, "step": 1946 }, { "epoch": 1.3919571045576409, "grad_norm": 0.020295822992920876, "learning_rate": 9.060002336697968e-05, "loss": 0.0162, "step": 1947 }, { "epoch": 1.3926720285969616, "grad_norm": 0.01774553954601288, "learning_rate": 9.058544501108384e-05, "loss": 0.0153, "step": 1948 }, { "epoch": 1.3933869526362823, "grad_norm": 0.016587335616350174, "learning_rate": 9.057085653401896e-05, "loss": 0.024, "step": 1949 }, { "epoch": 1.3941018766756033, "grad_norm": 0.022024407982826233, "learning_rate": 9.055625793942308e-05, "loss": 0.0131, "step": 1950 }, { "epoch": 1.3941018766756033, "eval_loss": 0.011188303120434284, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 1950 }, { "epoch": 1.394816800714924, "grad_norm": 0.01570919342339039, "learning_rate": 9.054164923093685e-05, "loss": 0.0138, "step": 1951 }, { "epoch": 1.395531724754245, "grad_norm": 0.01745607517659664, "learning_rate": 9.052703041220332e-05, "loss": 0.023, "step": 1952 }, { "epoch": 1.3962466487935656, "grad_norm": 0.019000640138983727, "learning_rate": 9.051240148686814e-05, "loss": 0.0163, "step": 1953 }, { "epoch": 1.3969615728328866, "grad_norm": 0.016498252749443054, "learning_rate": 9.049776245857947e-05, "loss": 0.0133, "step": 1954 }, { "epoch": 1.3976764968722073, "grad_norm": 0.013326888903975487, "learning_rate": 9.048311333098798e-05, "loss": 0.0154, "step": 1955 }, { "epoch": 1.3976764968722073, "eval_loss": 0.011164122261106968, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 1955 }, { "epoch": 1.398391420911528, "grad_norm": 0.014406205154955387, "learning_rate": 9.046845410774685e-05, "loss": 0.015, "step": 1956 }, { "epoch": 1.399106344950849, "grad_norm": 0.019207030534744263, "learning_rate": 9.045378479251179e-05, "loss": 0.0157, "step": 1957 }, { "epoch": 1.39982126899017, "grad_norm": 0.017531711608171463, "learning_rate": 9.0439105388941e-05, "loss": 0.0115, "step": 1958 }, { "epoch": 1.4005361930294906, "grad_norm": 0.018893053755164146, "learning_rate": 9.042441590069526e-05, "loss": 0.015, "step": 1959 }, { "epoch": 1.4012511170688113, "grad_norm": 0.014852351509034634, "learning_rate": 9.04097163314378e-05, "loss": 0.0104, "step": 1960 }, { "epoch": 1.4012511170688113, "eval_loss": 0.011135311797261238, "eval_runtime": 4.5888, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 1960 }, { "epoch": 1.4019660411081323, "grad_norm": 0.013398561626672745, "learning_rate": 9.03950066848344e-05, "loss": 0.0148, "step": 1961 }, { "epoch": 1.4026809651474532, "grad_norm": 0.015741785988211632, "learning_rate": 9.038028696455334e-05, "loss": 0.0196, "step": 1962 }, { "epoch": 1.403395889186774, "grad_norm": 0.014799479395151138, "learning_rate": 9.03655571742654e-05, "loss": 0.017, "step": 1963 }, { "epoch": 1.4041108132260947, "grad_norm": 0.014513036236166954, "learning_rate": 9.035081731764389e-05, "loss": 0.0151, "step": 1964 }, { "epoch": 1.4048257372654156, "grad_norm": 0.016050880774855614, "learning_rate": 9.033606739836462e-05, "loss": 0.012, "step": 1965 }, { "epoch": 1.4048257372654156, "eval_loss": 0.01112150214612484, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1965 }, { "epoch": 1.4055406613047363, "grad_norm": 
0.014065328054130077, "learning_rate": 9.032130742010594e-05, "loss": 0.014, "step": 1966 }, { "epoch": 1.4062555853440573, "grad_norm": 0.013815117999911308, "learning_rate": 9.030653738654864e-05, "loss": 0.0134, "step": 1967 }, { "epoch": 1.406970509383378, "grad_norm": 0.016044605523347855, "learning_rate": 9.02917573013761e-05, "loss": 0.0194, "step": 1968 }, { "epoch": 1.407685433422699, "grad_norm": 0.019221637398004532, "learning_rate": 9.027696716827415e-05, "loss": 0.0214, "step": 1969 }, { "epoch": 1.4084003574620196, "grad_norm": 0.017899304628372192, "learning_rate": 9.026216699093114e-05, "loss": 0.0159, "step": 1970 }, { "epoch": 1.4084003574620196, "eval_loss": 0.011174379847943783, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 1970 }, { "epoch": 1.4091152815013404, "grad_norm": 0.011758523061871529, "learning_rate": 9.024735677303793e-05, "loss": 0.0089, "step": 1971 }, { "epoch": 1.4098302055406613, "grad_norm": 0.015721654519438744, "learning_rate": 9.023253651828789e-05, "loss": 0.0165, "step": 1972 }, { "epoch": 1.4105451295799822, "grad_norm": 0.012500890530645847, "learning_rate": 9.021770623037688e-05, "loss": 0.0094, "step": 1973 }, { "epoch": 1.411260053619303, "grad_norm": 0.015550049021840096, "learning_rate": 9.020286591300325e-05, "loss": 0.0185, "step": 1974 }, { "epoch": 1.4119749776586237, "grad_norm": 0.01582292653620243, "learning_rate": 9.018801556986789e-05, "loss": 0.0205, "step": 1975 }, { "epoch": 1.4119749776586237, "eval_loss": 0.011432874947786331, "eval_runtime": 4.5882, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 1975 }, { "epoch": 1.4126899016979446, "grad_norm": 0.015909584239125252, "learning_rate": 9.017315520467415e-05, "loss": 0.0136, "step": 1976 }, { "epoch": 1.4134048257372653, "grad_norm": 0.015354026108980179, "learning_rate": 9.015828482112792e-05, "loss": 0.0152, "step": 1977 }, { "epoch": 1.4141197497765863, "grad_norm": 0.013039936311542988, "learning_rate": 9.014340442293756e-05, "loss": 0.0164, "step": 1978 }, { "epoch": 1.414834673815907, "grad_norm": 0.013835371471941471, "learning_rate": 9.012851401381391e-05, "loss": 0.0127, "step": 1979 }, { "epoch": 1.415549597855228, "grad_norm": 0.012757007032632828, "learning_rate": 9.011361359747034e-05, "loss": 0.0111, "step": 1980 }, { "epoch": 1.415549597855228, "eval_loss": 0.011672696098685265, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 1980 }, { "epoch": 1.4162645218945487, "grad_norm": 0.01627998799085617, "learning_rate": 9.009870317762273e-05, "loss": 0.0167, "step": 1981 }, { "epoch": 1.4169794459338696, "grad_norm": 0.013908687978982925, "learning_rate": 9.008378275798938e-05, "loss": 0.012, "step": 1982 }, { "epoch": 1.4176943699731903, "grad_norm": 0.014941319823265076, "learning_rate": 9.006885234229118e-05, "loss": 0.0125, "step": 1983 }, { "epoch": 1.4184092940125113, "grad_norm": 0.016626819968223572, "learning_rate": 9.005391193425145e-05, "loss": 0.0123, "step": 1984 }, { "epoch": 1.419124218051832, "grad_norm": 0.015945926308631897, "learning_rate": 9.003896153759601e-05, "loss": 0.0147, "step": 1985 }, { "epoch": 1.419124218051832, "eval_loss": 0.011643361300230026, "eval_runtime": 4.5961, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 1985 }, { "epoch": 1.4198391420911527, "grad_norm": 0.01790180616080761, "learning_rate": 9.002400115605319e-05, "loss": 0.0153, "step": 1986 }, { "epoch": 
1.4205540661304736, "grad_norm": 0.018518466502428055, "learning_rate": 9.00090307933538e-05, "loss": 0.0149, "step": 1987 }, { "epoch": 1.4212689901697946, "grad_norm": 0.017709026113152504, "learning_rate": 8.999405045323113e-05, "loss": 0.0161, "step": 1988 }, { "epoch": 1.4219839142091153, "grad_norm": 0.01828734204173088, "learning_rate": 8.997906013942096e-05, "loss": 0.0175, "step": 1989 }, { "epoch": 1.422698838248436, "grad_norm": 0.015297790057957172, "learning_rate": 8.99640598556616e-05, "loss": 0.011, "step": 1990 }, { "epoch": 1.422698838248436, "eval_loss": 0.011845052242279053, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 1990 }, { "epoch": 1.423413762287757, "grad_norm": 0.018549680709838867, "learning_rate": 8.994904960569377e-05, "loss": 0.0219, "step": 1991 }, { "epoch": 1.4241286863270777, "grad_norm": 0.018171481788158417, "learning_rate": 8.993402939326072e-05, "loss": 0.0228, "step": 1992 }, { "epoch": 1.4248436103663986, "grad_norm": 0.018749883398413658, "learning_rate": 8.99189992221082e-05, "loss": 0.0145, "step": 1993 }, { "epoch": 1.4255585344057193, "grad_norm": 0.016247782856225967, "learning_rate": 8.990395909598442e-05, "loss": 0.019, "step": 1994 }, { "epoch": 1.4262734584450403, "grad_norm": 0.021898038685321808, "learning_rate": 8.988890901864005e-05, "loss": 0.0217, "step": 1995 }, { "epoch": 1.4262734584450403, "eval_loss": 0.011782094836235046, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 1995 }, { "epoch": 1.426988382484361, "grad_norm": 0.016189316287636757, "learning_rate": 8.987384899382831e-05, "loss": 0.0153, "step": 1996 }, { "epoch": 1.427703306523682, "grad_norm": 0.01589430309832096, "learning_rate": 8.985877902530481e-05, "loss": 0.0114, "step": 1997 }, { "epoch": 1.4284182305630027, "grad_norm": 0.013527177274227142, "learning_rate": 8.984369911682773e-05, "loss": 0.015, "step": 1998 }, { "epoch": 1.4291331546023236, "grad_norm": 0.020180394873023033, "learning_rate": 8.982860927215765e-05, "loss": 0.0182, "step": 1999 }, { "epoch": 1.4298480786416443, "grad_norm": 0.016330575570464134, "learning_rate": 8.981350949505769e-05, "loss": 0.0241, "step": 2000 }, { "epoch": 1.4298480786416443, "eval_loss": 0.011393582448363304, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 2000 }, { "epoch": 1.430563002680965, "grad_norm": 0.018953390419483185, "learning_rate": 8.979839978929342e-05, "loss": 0.0153, "step": 2001 }, { "epoch": 1.431277926720286, "grad_norm": 0.014812394045293331, "learning_rate": 8.978328015863288e-05, "loss": 0.0102, "step": 2002 }, { "epoch": 1.431992850759607, "grad_norm": 0.014362107031047344, "learning_rate": 8.976815060684659e-05, "loss": 0.0109, "step": 2003 }, { "epoch": 1.4327077747989276, "grad_norm": 0.01440686546266079, "learning_rate": 8.975301113770756e-05, "loss": 0.0141, "step": 2004 }, { "epoch": 1.4334226988382484, "grad_norm": 0.016222048550844193, "learning_rate": 8.973786175499123e-05, "loss": 0.011, "step": 2005 }, { "epoch": 1.4334226988382484, "eval_loss": 0.011298870667815208, "eval_runtime": 4.5951, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 2005 }, { "epoch": 1.4341376228775693, "grad_norm": 0.012930365279316902, "learning_rate": 8.972270246247558e-05, "loss": 0.0151, "step": 2006 }, { "epoch": 1.43485254691689, "grad_norm": 0.016847429797053337, "learning_rate": 8.970753326394101e-05, "loss": 0.013, "step": 2007 
}, { "epoch": 1.435567470956211, "grad_norm": 0.012469938024878502, "learning_rate": 8.96923541631704e-05, "loss": 0.0112, "step": 2008 }, { "epoch": 1.4362823949955317, "grad_norm": 0.0203084796667099, "learning_rate": 8.96771651639491e-05, "loss": 0.0173, "step": 2009 }, { "epoch": 1.4369973190348526, "grad_norm": 0.019588468596339226, "learning_rate": 8.966196627006493e-05, "loss": 0.0157, "step": 2010 }, { "epoch": 1.4369973190348526, "eval_loss": 0.011188830249011517, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 2010 }, { "epoch": 1.4377122430741733, "grad_norm": 0.025144187733530998, "learning_rate": 8.964675748530819e-05, "loss": 0.0134, "step": 2011 }, { "epoch": 1.438427167113494, "grad_norm": 0.017597293481230736, "learning_rate": 8.963153881347163e-05, "loss": 0.0207, "step": 2012 }, { "epoch": 1.439142091152815, "grad_norm": 0.014313186518847942, "learning_rate": 8.96163102583505e-05, "loss": 0.0105, "step": 2013 }, { "epoch": 1.439857015192136, "grad_norm": 0.01563636213541031, "learning_rate": 8.960107182374243e-05, "loss": 0.0149, "step": 2014 }, { "epoch": 1.4405719392314567, "grad_norm": 0.013917957432568073, "learning_rate": 8.958582351344758e-05, "loss": 0.0077, "step": 2015 }, { "epoch": 1.4405719392314567, "eval_loss": 0.011352426372468472, "eval_runtime": 4.5948, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 2015 }, { "epoch": 1.4412868632707774, "grad_norm": 0.01584140956401825, "learning_rate": 8.957056533126861e-05, "loss": 0.0176, "step": 2016 }, { "epoch": 1.4420017873100983, "grad_norm": 0.018148059025406837, "learning_rate": 8.955529728101055e-05, "loss": 0.0144, "step": 2017 }, { "epoch": 1.4427167113494193, "grad_norm": 0.012504956685006618, "learning_rate": 8.954001936648095e-05, "loss": 0.0229, "step": 2018 }, { "epoch": 1.44343163538874, "grad_norm": 0.016239404678344727, "learning_rate": 8.952473159148981e-05, "loss": 0.0125, "step": 2019 }, { "epoch": 1.4441465594280607, "grad_norm": 0.015680650249123573, "learning_rate": 8.950943395984958e-05, "loss": 0.0132, "step": 2020 }, { "epoch": 1.4441465594280607, "eval_loss": 0.011219622567296028, "eval_runtime": 4.6032, "eval_samples_per_second": 10.862, "eval_steps_per_second": 2.824, "step": 2020 }, { "epoch": 1.4448614834673816, "grad_norm": 0.018904339522123337, "learning_rate": 8.949412647537518e-05, "loss": 0.0203, "step": 2021 }, { "epoch": 1.4455764075067024, "grad_norm": 0.01736549288034439, "learning_rate": 8.947880914188397e-05, "loss": 0.0228, "step": 2022 }, { "epoch": 1.4462913315460233, "grad_norm": 0.011641125194728374, "learning_rate": 8.946348196319579e-05, "loss": 0.0139, "step": 2023 }, { "epoch": 1.447006255585344, "grad_norm": 0.015802456066012383, "learning_rate": 8.944814494313288e-05, "loss": 0.0202, "step": 2024 }, { "epoch": 1.447721179624665, "grad_norm": 0.014809551648795605, "learning_rate": 8.943279808552001e-05, "loss": 0.0133, "step": 2025 }, { "epoch": 1.447721179624665, "eval_loss": 0.011161540634930134, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 2025 }, { "epoch": 1.4484361036639857, "grad_norm": 0.01775025576353073, "learning_rate": 8.941744139418437e-05, "loss": 0.0193, "step": 2026 }, { "epoch": 1.4491510277033064, "grad_norm": 0.014593123458325863, "learning_rate": 8.940207487295558e-05, "loss": 0.0154, "step": 2027 }, { "epoch": 1.4498659517426273, "grad_norm": 0.016986563801765442, "learning_rate": 8.938669852566576e-05, "loss": 
0.0136, "step": 2028 }, { "epoch": 1.4505808757819483, "grad_norm": 0.016237590461969376, "learning_rate": 8.937131235614945e-05, "loss": 0.0205, "step": 2029 }, { "epoch": 1.451295799821269, "grad_norm": 0.015030661597847939, "learning_rate": 8.935591636824359e-05, "loss": 0.0166, "step": 2030 }, { "epoch": 1.451295799821269, "eval_loss": 0.011290321126580238, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2030 }, { "epoch": 1.4520107238605897, "grad_norm": 0.018569348379969597, "learning_rate": 8.934051056578767e-05, "loss": 0.0135, "step": 2031 }, { "epoch": 1.4527256478999107, "grad_norm": 0.018324583768844604, "learning_rate": 8.932509495262358e-05, "loss": 0.0139, "step": 2032 }, { "epoch": 1.4534405719392314, "grad_norm": 0.01405333075672388, "learning_rate": 8.930966953259563e-05, "loss": 0.0159, "step": 2033 }, { "epoch": 1.4541554959785523, "grad_norm": 0.01565111242234707, "learning_rate": 8.929423430955062e-05, "loss": 0.0139, "step": 2034 }, { "epoch": 1.454870420017873, "grad_norm": 0.016095956787467003, "learning_rate": 8.927878928733777e-05, "loss": 0.0242, "step": 2035 }, { "epoch": 1.454870420017873, "eval_loss": 0.011118859052658081, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 2035 }, { "epoch": 1.455585344057194, "grad_norm": 0.015415162779390812, "learning_rate": 8.926333446980873e-05, "loss": 0.0126, "step": 2036 }, { "epoch": 1.4563002680965147, "grad_norm": 0.018628031015396118, "learning_rate": 8.924786986081763e-05, "loss": 0.0197, "step": 2037 }, { "epoch": 1.4570151921358356, "grad_norm": 0.015446518547832966, "learning_rate": 8.923239546422102e-05, "loss": 0.0144, "step": 2038 }, { "epoch": 1.4577301161751564, "grad_norm": 0.013255229219794273, "learning_rate": 8.92169112838779e-05, "loss": 0.0141, "step": 2039 }, { "epoch": 1.4584450402144773, "grad_norm": 0.015534991398453712, "learning_rate": 8.92014173236497e-05, "loss": 0.0256, "step": 2040 }, { "epoch": 1.4584450402144773, "eval_loss": 0.01135935913771391, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 2040 }, { "epoch": 1.459159964253798, "grad_norm": 0.018016254529356956, "learning_rate": 8.918591358740028e-05, "loss": 0.0205, "step": 2041 }, { "epoch": 1.4598748882931187, "grad_norm": 0.016343848779797554, "learning_rate": 8.917040007899595e-05, "loss": 0.0214, "step": 2042 }, { "epoch": 1.4605898123324397, "grad_norm": 0.011532915756106377, "learning_rate": 8.915487680230549e-05, "loss": 0.0095, "step": 2043 }, { "epoch": 1.4613047363717606, "grad_norm": 0.01625422015786171, "learning_rate": 8.913934376120005e-05, "loss": 0.0114, "step": 2044 }, { "epoch": 1.4620196604110813, "grad_norm": 0.024719446897506714, "learning_rate": 8.912380095955326e-05, "loss": 0.0168, "step": 2045 }, { "epoch": 1.4620196604110813, "eval_loss": 0.011177478358149529, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 2045 }, { "epoch": 1.462734584450402, "grad_norm": 0.01632869988679886, "learning_rate": 8.910824840124117e-05, "loss": 0.0105, "step": 2046 }, { "epoch": 1.463449508489723, "grad_norm": 0.01735582761466503, "learning_rate": 8.909268609014228e-05, "loss": 0.0171, "step": 2047 }, { "epoch": 1.4641644325290437, "grad_norm": 0.015812167897820473, "learning_rate": 8.907711403013748e-05, "loss": 0.0102, "step": 2048 }, { "epoch": 1.4648793565683647, "grad_norm": 0.01824556663632393, "learning_rate": 
8.906153222511013e-05, "loss": 0.0218, "step": 2049 }, { "epoch": 1.4655942806076854, "grad_norm": 0.01574079506099224, "learning_rate": 8.904594067894603e-05, "loss": 0.0187, "step": 2050 }, { "epoch": 1.4655942806076854, "eval_loss": 0.011089160107076168, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 2050 }, { "epoch": 1.4663092046470063, "grad_norm": 0.013484695926308632, "learning_rate": 8.903033939553336e-05, "loss": 0.0125, "step": 2051 }, { "epoch": 1.467024128686327, "grad_norm": 0.015412974171340466, "learning_rate": 8.90147283787628e-05, "loss": 0.0254, "step": 2052 }, { "epoch": 1.467739052725648, "grad_norm": 0.015785623341798782, "learning_rate": 8.899910763252734e-05, "loss": 0.012, "step": 2053 }, { "epoch": 1.4684539767649687, "grad_norm": 0.01592998206615448, "learning_rate": 8.898347716072254e-05, "loss": 0.0101, "step": 2054 }, { "epoch": 1.4691689008042896, "grad_norm": 0.012085553258657455, "learning_rate": 8.896783696724629e-05, "loss": 0.0143, "step": 2055 }, { "epoch": 1.4691689008042896, "eval_loss": 0.011240259744226933, "eval_runtime": 4.5784, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 2055 }, { "epoch": 1.4698838248436104, "grad_norm": 0.02348512038588524, "learning_rate": 8.895218705599893e-05, "loss": 0.0139, "step": 2056 }, { "epoch": 1.470598748882931, "grad_norm": 0.018552422523498535, "learning_rate": 8.89365274308832e-05, "loss": 0.0193, "step": 2057 }, { "epoch": 1.471313672922252, "grad_norm": 0.013909941539168358, "learning_rate": 8.892085809580436e-05, "loss": 0.0095, "step": 2058 }, { "epoch": 1.472028596961573, "grad_norm": 0.01815125346183777, "learning_rate": 8.890517905466991e-05, "loss": 0.0134, "step": 2059 }, { "epoch": 1.4727435210008937, "grad_norm": 0.017157597467303276, "learning_rate": 8.888949031138997e-05, "loss": 0.0129, "step": 2060 }, { "epoch": 1.4727435210008937, "eval_loss": 0.01107516698539257, "eval_runtime": 4.5819, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 2060 }, { "epoch": 1.4734584450402144, "grad_norm": 0.017310673370957375, "learning_rate": 8.887379186987695e-05, "loss": 0.0172, "step": 2061 }, { "epoch": 1.4741733690795353, "grad_norm": 0.01359073631465435, "learning_rate": 8.885808373404572e-05, "loss": 0.0136, "step": 2062 }, { "epoch": 1.474888293118856, "grad_norm": 0.018681153655052185, "learning_rate": 8.884236590781354e-05, "loss": 0.0184, "step": 2063 }, { "epoch": 1.475603217158177, "grad_norm": 0.015652533620595932, "learning_rate": 8.882663839510016e-05, "loss": 0.0211, "step": 2064 }, { "epoch": 1.4763181411974977, "grad_norm": 0.015462543815374374, "learning_rate": 8.881090119982764e-05, "loss": 0.0153, "step": 2065 }, { "epoch": 1.4763181411974977, "eval_loss": 0.011037221178412437, "eval_runtime": 4.6028, "eval_samples_per_second": 10.863, "eval_steps_per_second": 2.824, "step": 2065 }, { "epoch": 1.4770330652368187, "grad_norm": 0.01687895879149437, "learning_rate": 8.879515432592057e-05, "loss": 0.015, "step": 2066 }, { "epoch": 1.4777479892761394, "grad_norm": 0.01516218576580286, "learning_rate": 8.877939777730586e-05, "loss": 0.0118, "step": 2067 }, { "epoch": 1.4784629133154603, "grad_norm": 0.018488092347979546, "learning_rate": 8.876363155791285e-05, "loss": 0.014, "step": 2068 }, { "epoch": 1.479177837354781, "grad_norm": 0.01637318544089794, "learning_rate": 8.874785567167334e-05, "loss": 0.0152, "step": 2069 }, { "epoch": 1.479892761394102, "grad_norm": 0.01810058206319809, 
"learning_rate": 8.87320701225215e-05, "loss": 0.0265, "step": 2070 }, { "epoch": 1.479892761394102, "eval_loss": 0.011073418892920017, "eval_runtime": 4.5976, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.828, "step": 2070 }, { "epoch": 1.4806076854334227, "grad_norm": 0.016232984140515327, "learning_rate": 8.87162749143939e-05, "loss": 0.011, "step": 2071 }, { "epoch": 1.4813226094727434, "grad_norm": 0.017848651856184006, "learning_rate": 8.870047005122956e-05, "loss": 0.0138, "step": 2072 }, { "epoch": 1.4820375335120644, "grad_norm": 0.012457217089831829, "learning_rate": 8.86846555369699e-05, "loss": 0.0124, "step": 2073 }, { "epoch": 1.4827524575513853, "grad_norm": 0.019290799275040627, "learning_rate": 8.866883137555869e-05, "loss": 0.0163, "step": 2074 }, { "epoch": 1.483467381590706, "grad_norm": 0.01679968647658825, "learning_rate": 8.865299757094217e-05, "loss": 0.0128, "step": 2075 }, { "epoch": 1.483467381590706, "eval_loss": 0.011187970638275146, "eval_runtime": 4.6166, "eval_samples_per_second": 10.83, "eval_steps_per_second": 2.816, "step": 2075 }, { "epoch": 1.4841823056300267, "grad_norm": 0.01721375249326229, "learning_rate": 8.863715412706896e-05, "loss": 0.0139, "step": 2076 }, { "epoch": 1.4848972296693477, "grad_norm": 0.017243042588233948, "learning_rate": 8.86213010478901e-05, "loss": 0.014, "step": 2077 }, { "epoch": 1.4856121537086684, "grad_norm": 0.019469909369945526, "learning_rate": 8.860543833735901e-05, "loss": 0.0145, "step": 2078 }, { "epoch": 1.4863270777479893, "grad_norm": 0.017280030995607376, "learning_rate": 8.858956599943152e-05, "loss": 0.0132, "step": 2079 }, { "epoch": 1.48704200178731, "grad_norm": 0.015561066567897797, "learning_rate": 8.857368403806585e-05, "loss": 0.0137, "step": 2080 }, { "epoch": 1.48704200178731, "eval_loss": 0.01143869198858738, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 2080 }, { "epoch": 1.487756925826631, "grad_norm": 0.017241334542632103, "learning_rate": 8.855779245722264e-05, "loss": 0.0119, "step": 2081 }, { "epoch": 1.4884718498659517, "grad_norm": 0.017340892925858498, "learning_rate": 8.854189126086493e-05, "loss": 0.0174, "step": 2082 }, { "epoch": 1.4891867739052724, "grad_norm": 0.014535880647599697, "learning_rate": 8.852598045295816e-05, "loss": 0.0145, "step": 2083 }, { "epoch": 1.4899016979445934, "grad_norm": 0.019007856026291847, "learning_rate": 8.851006003747012e-05, "loss": 0.019, "step": 2084 }, { "epoch": 1.4906166219839143, "grad_norm": 0.015267900191247463, "learning_rate": 8.849413001837105e-05, "loss": 0.0108, "step": 2085 }, { "epoch": 1.4906166219839143, "eval_loss": 0.011251415126025677, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 2085 }, { "epoch": 1.491331546023235, "grad_norm": 0.016497738659381866, "learning_rate": 8.847819039963359e-05, "loss": 0.0216, "step": 2086 }, { "epoch": 1.4920464700625558, "grad_norm": 0.016203636303544044, "learning_rate": 8.846224118523271e-05, "loss": 0.012, "step": 2087 }, { "epoch": 1.4927613941018767, "grad_norm": 0.016652943566441536, "learning_rate": 8.844628237914584e-05, "loss": 0.0252, "step": 2088 }, { "epoch": 1.4934763181411976, "grad_norm": 0.012643840163946152, "learning_rate": 8.843031398535277e-05, "loss": 0.0119, "step": 2089 }, { "epoch": 1.4941912421805184, "grad_norm": 0.015806615352630615, "learning_rate": 8.841433600783567e-05, "loss": 0.0161, "step": 2090 }, { "epoch": 1.4941912421805184, "eval_loss": 
0.011383803561329842, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 2090 }, { "epoch": 1.494906166219839, "grad_norm": 0.015620295889675617, "learning_rate": 8.839834845057914e-05, "loss": 0.0152, "step": 2091 }, { "epoch": 1.49562109025916, "grad_norm": 0.018028326332569122, "learning_rate": 8.838235131757014e-05, "loss": 0.0216, "step": 2092 }, { "epoch": 1.4963360142984807, "grad_norm": 0.013831286691129208, "learning_rate": 8.8366344612798e-05, "loss": 0.0143, "step": 2093 }, { "epoch": 1.4970509383378017, "grad_norm": 0.01104454044252634, "learning_rate": 8.83503283402545e-05, "loss": 0.0087, "step": 2094 }, { "epoch": 1.4977658623771224, "grad_norm": 0.01538182981312275, "learning_rate": 8.833430250393375e-05, "loss": 0.0208, "step": 2095 }, { "epoch": 1.4977658623771224, "eval_loss": 0.01132628321647644, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 2095 }, { "epoch": 1.4984807864164433, "grad_norm": 0.016718409955501556, "learning_rate": 8.831826710783225e-05, "loss": 0.0178, "step": 2096 }, { "epoch": 1.499195710455764, "grad_norm": 0.014881463721394539, "learning_rate": 8.83022221559489e-05, "loss": 0.0162, "step": 2097 }, { "epoch": 1.4999106344950848, "grad_norm": 0.018583783879876137, "learning_rate": 8.828616765228499e-05, "loss": 0.0133, "step": 2098 }, { "epoch": 1.5006255585344057, "grad_norm": 0.017061976715922356, "learning_rate": 8.827010360084418e-05, "loss": 0.0191, "step": 2099 }, { "epoch": 1.5013404825737267, "grad_norm": 0.014572933316230774, "learning_rate": 8.825403000563249e-05, "loss": 0.0171, "step": 2100 }, { "epoch": 1.5013404825737267, "eval_loss": 0.011309598572552204, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 2100 }, { "epoch": 1.5020554066130474, "grad_norm": 0.017424410209059715, "learning_rate": 8.823794687065837e-05, "loss": 0.0168, "step": 2101 }, { "epoch": 1.502770330652368, "grad_norm": 0.014239796437323093, "learning_rate": 8.822185419993258e-05, "loss": 0.0135, "step": 2102 }, { "epoch": 1.503485254691689, "grad_norm": 0.014319462701678276, "learning_rate": 8.820575199746835e-05, "loss": 0.0122, "step": 2103 }, { "epoch": 1.50420017873101, "grad_norm": 0.017703980207443237, "learning_rate": 8.81896402672812e-05, "loss": 0.0121, "step": 2104 }, { "epoch": 1.5049151027703307, "grad_norm": 0.017482534050941467, "learning_rate": 8.817351901338908e-05, "loss": 0.0138, "step": 2105 }, { "epoch": 1.5049151027703307, "eval_loss": 0.01128739770501852, "eval_runtime": 4.5732, "eval_samples_per_second": 10.933, "eval_steps_per_second": 2.843, "step": 2105 }, { "epoch": 1.5056300268096514, "grad_norm": 0.01776953414082527, "learning_rate": 8.815738823981229e-05, "loss": 0.0124, "step": 2106 }, { "epoch": 1.5063449508489724, "grad_norm": 0.01451632846146822, "learning_rate": 8.814124795057351e-05, "loss": 0.0113, "step": 2107 }, { "epoch": 1.507059874888293, "grad_norm": 0.018393440172076225, "learning_rate": 8.812509814969779e-05, "loss": 0.0223, "step": 2108 }, { "epoch": 1.5077747989276138, "grad_norm": 0.014507952146232128, "learning_rate": 8.810893884121255e-05, "loss": 0.0112, "step": 2109 }, { "epoch": 1.5084897229669347, "grad_norm": 0.015335222706198692, "learning_rate": 8.80927700291476e-05, "loss": 0.0137, "step": 2110 }, { "epoch": 1.5084897229669347, "eval_loss": 0.011121072806417942, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 
2110 }, { "epoch": 1.5092046470062557, "grad_norm": 0.011526180431246758, "learning_rate": 8.807659171753512e-05, "loss": 0.0134, "step": 2111 }, { "epoch": 1.5099195710455764, "grad_norm": 0.015054570510983467, "learning_rate": 8.806040391040962e-05, "loss": 0.0154, "step": 2112 }, { "epoch": 1.510634495084897, "grad_norm": 0.013827070593833923, "learning_rate": 8.804420661180801e-05, "loss": 0.021, "step": 2113 }, { "epoch": 1.511349419124218, "grad_norm": 0.01210605725646019, "learning_rate": 8.802799982576956e-05, "loss": 0.0115, "step": 2114 }, { "epoch": 1.512064343163539, "grad_norm": 0.01490862388163805, "learning_rate": 8.80117835563359e-05, "loss": 0.0109, "step": 2115 }, { "epoch": 1.512064343163539, "eval_loss": 0.011132428422570229, "eval_runtime": 4.5796, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 2115 }, { "epoch": 1.5127792672028597, "grad_norm": 0.0166309867054224, "learning_rate": 8.799555780755107e-05, "loss": 0.021, "step": 2116 }, { "epoch": 1.5134941912421804, "grad_norm": 0.009527546353638172, "learning_rate": 8.797932258346137e-05, "loss": 0.0115, "step": 2117 }, { "epoch": 1.5142091152815014, "grad_norm": 0.01623200811445713, "learning_rate": 8.796307788811559e-05, "loss": 0.0171, "step": 2118 }, { "epoch": 1.5149240393208223, "grad_norm": 0.01603994332253933, "learning_rate": 8.79468237255648e-05, "loss": 0.0165, "step": 2119 }, { "epoch": 1.515638963360143, "grad_norm": 0.01681458204984665, "learning_rate": 8.793056009986243e-05, "loss": 0.0154, "step": 2120 }, { "epoch": 1.515638963360143, "eval_loss": 0.010936970822513103, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 2120 }, { "epoch": 1.5163538873994638, "grad_norm": 0.0113649507984519, "learning_rate": 8.791428701506433e-05, "loss": 0.0084, "step": 2121 }, { "epoch": 1.5170688114387847, "grad_norm": 0.015973057597875595, "learning_rate": 8.789800447522862e-05, "loss": 0.0125, "step": 2122 }, { "epoch": 1.5177837354781054, "grad_norm": 0.01941191591322422, "learning_rate": 8.788171248441587e-05, "loss": 0.0234, "step": 2123 }, { "epoch": 1.5184986595174261, "grad_norm": 0.014182965271174908, "learning_rate": 8.786541104668895e-05, "loss": 0.0095, "step": 2124 }, { "epoch": 1.519213583556747, "grad_norm": 0.015106436796486378, "learning_rate": 8.784910016611311e-05, "loss": 0.0181, "step": 2125 }, { "epoch": 1.519213583556747, "eval_loss": 0.011062691919505596, "eval_runtime": 4.5714, "eval_samples_per_second": 10.937, "eval_steps_per_second": 2.844, "step": 2125 }, { "epoch": 1.519928507596068, "grad_norm": 0.013728204183280468, "learning_rate": 8.783277984675593e-05, "loss": 0.0182, "step": 2126 }, { "epoch": 1.5206434316353887, "grad_norm": 0.014506184495985508, "learning_rate": 8.781645009268738e-05, "loss": 0.0133, "step": 2127 }, { "epoch": 1.5213583556747095, "grad_norm": 0.016507098451256752, "learning_rate": 8.780011090797973e-05, "loss": 0.0165, "step": 2128 }, { "epoch": 1.5220732797140304, "grad_norm": 0.01834407076239586, "learning_rate": 8.778376229670766e-05, "loss": 0.0222, "step": 2129 }, { "epoch": 1.5227882037533513, "grad_norm": 0.014034483581781387, "learning_rate": 8.776740426294818e-05, "loss": 0.0158, "step": 2130 }, { "epoch": 1.5227882037533513, "eval_loss": 0.011049783788621426, "eval_runtime": 4.6009, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 2130 }, { "epoch": 1.523503127792672, "grad_norm": 0.018872834742069244, "learning_rate": 8.775103681078061e-05, "loss": 
0.018, "step": 2131 }, { "epoch": 1.5242180518319928, "grad_norm": 0.012368648312985897, "learning_rate": 8.773465994428669e-05, "loss": 0.0117, "step": 2132 }, { "epoch": 1.5249329758713137, "grad_norm": 0.016282133758068085, "learning_rate": 8.771827366755046e-05, "loss": 0.0186, "step": 2133 }, { "epoch": 1.5256478999106347, "grad_norm": 0.013166324235498905, "learning_rate": 8.770187798465832e-05, "loss": 0.0142, "step": 2134 }, { "epoch": 1.5263628239499554, "grad_norm": 0.017073651775717735, "learning_rate": 8.7685472899699e-05, "loss": 0.0108, "step": 2135 }, { "epoch": 1.5263628239499554, "eval_loss": 0.011303042992949486, "eval_runtime": 4.6231, "eval_samples_per_second": 10.815, "eval_steps_per_second": 2.812, "step": 2135 }, { "epoch": 1.527077747989276, "grad_norm": 0.017461396753787994, "learning_rate": 8.76690584167636e-05, "loss": 0.017, "step": 2136 }, { "epoch": 1.527792672028597, "grad_norm": 0.0156569704413414, "learning_rate": 8.765263453994555e-05, "loss": 0.0138, "step": 2137 }, { "epoch": 1.5285075960679178, "grad_norm": 0.013259371742606163, "learning_rate": 8.763620127334062e-05, "loss": 0.0129, "step": 2138 }, { "epoch": 1.5292225201072385, "grad_norm": 0.010460292920470238, "learning_rate": 8.761975862104694e-05, "loss": 0.0121, "step": 2139 }, { "epoch": 1.5299374441465594, "grad_norm": 0.014692713506519794, "learning_rate": 8.760330658716495e-05, "loss": 0.0181, "step": 2140 }, { "epoch": 1.5299374441465594, "eval_loss": 0.011102823540568352, "eval_runtime": 4.5965, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 2140 }, { "epoch": 1.5306523681858804, "grad_norm": 0.018839672207832336, "learning_rate": 8.758684517579746e-05, "loss": 0.0163, "step": 2141 }, { "epoch": 1.531367292225201, "grad_norm": 0.012741087935864925, "learning_rate": 8.75703743910496e-05, "loss": 0.0121, "step": 2142 }, { "epoch": 1.5320822162645218, "grad_norm": 0.019107239320874214, "learning_rate": 8.755389423702883e-05, "loss": 0.0214, "step": 2143 }, { "epoch": 1.5327971403038427, "grad_norm": 0.017463315278291702, "learning_rate": 8.753740471784497e-05, "loss": 0.0227, "step": 2144 }, { "epoch": 1.5335120643431637, "grad_norm": 0.022921977564692497, "learning_rate": 8.752090583761016e-05, "loss": 0.0163, "step": 2145 }, { "epoch": 1.5335120643431637, "eval_loss": 0.011210961267352104, "eval_runtime": 4.5916, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 2145 }, { "epoch": 1.5342269883824844, "grad_norm": 0.019285595044493675, "learning_rate": 8.750439760043891e-05, "loss": 0.0117, "step": 2146 }, { "epoch": 1.534941912421805, "grad_norm": 0.016406329348683357, "learning_rate": 8.748788001044799e-05, "loss": 0.0129, "step": 2147 }, { "epoch": 1.535656836461126, "grad_norm": 0.01684889942407608, "learning_rate": 8.747135307175655e-05, "loss": 0.0127, "step": 2148 }, { "epoch": 1.536371760500447, "grad_norm": 0.01809617690742016, "learning_rate": 8.745481678848608e-05, "loss": 0.0166, "step": 2149 }, { "epoch": 1.5370866845397675, "grad_norm": 0.018005164340138435, "learning_rate": 8.743827116476039e-05, "loss": 0.0302, "step": 2150 }, { "epoch": 1.5370866845397675, "eval_loss": 0.011038394644856453, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 2150 }, { "epoch": 1.5378016085790884, "grad_norm": 0.015012509189546108, "learning_rate": 8.742171620470561e-05, "loss": 0.0091, "step": 2151 }, { "epoch": 1.5385165326184094, "grad_norm": 0.013617989607155323, "learning_rate": 
8.740515191245018e-05, "loss": 0.0176, "step": 2152 }, { "epoch": 1.53923145665773, "grad_norm": 0.014652554877102375, "learning_rate": 8.738857829212495e-05, "loss": 0.0115, "step": 2153 }, { "epoch": 1.5399463806970508, "grad_norm": 0.016884468495845795, "learning_rate": 8.737199534786297e-05, "loss": 0.0145, "step": 2154 }, { "epoch": 1.5406613047363718, "grad_norm": 0.016813309863209724, "learning_rate": 8.735540308379973e-05, "loss": 0.0153, "step": 2155 }, { "epoch": 1.5406613047363718, "eval_loss": 0.010987560264766216, "eval_runtime": 4.6279, "eval_samples_per_second": 10.804, "eval_steps_per_second": 2.809, "step": 2155 }, { "epoch": 1.5413762287756927, "grad_norm": 0.02016187645494938, "learning_rate": 8.733880150407296e-05, "loss": 0.0134, "step": 2156 }, { "epoch": 1.5420911528150134, "grad_norm": 0.016174061223864555, "learning_rate": 8.732219061282278e-05, "loss": 0.0212, "step": 2157 }, { "epoch": 1.5428060768543341, "grad_norm": 0.016328910365700722, "learning_rate": 8.730557041419161e-05, "loss": 0.0157, "step": 2158 }, { "epoch": 1.543521000893655, "grad_norm": 0.018548205494880676, "learning_rate": 8.728894091232415e-05, "loss": 0.013, "step": 2159 }, { "epoch": 1.544235924932976, "grad_norm": 0.016343817114830017, "learning_rate": 8.727230211136747e-05, "loss": 0.0199, "step": 2160 }, { "epoch": 1.544235924932976, "eval_loss": 0.011064184829592705, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 2160 }, { "epoch": 1.5449508489722967, "grad_norm": 0.01488068699836731, "learning_rate": 8.725565401547096e-05, "loss": 0.0189, "step": 2161 }, { "epoch": 1.5456657730116174, "grad_norm": 0.01522751897573471, "learning_rate": 8.723899662878626e-05, "loss": 0.0148, "step": 2162 }, { "epoch": 1.5463806970509384, "grad_norm": 0.017571838572621346, "learning_rate": 8.722232995546742e-05, "loss": 0.0161, "step": 2163 }, { "epoch": 1.547095621090259, "grad_norm": 0.017333317548036575, "learning_rate": 8.720565399967076e-05, "loss": 0.0141, "step": 2164 }, { "epoch": 1.5478105451295798, "grad_norm": 0.017239192500710487, "learning_rate": 8.71889687655549e-05, "loss": 0.0219, "step": 2165 }, { "epoch": 1.5478105451295798, "eval_loss": 0.011164084076881409, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 2165 }, { "epoch": 1.5485254691689008, "grad_norm": 0.02210170403122902, "learning_rate": 8.717227425728081e-05, "loss": 0.0251, "step": 2166 }, { "epoch": 1.5492403932082217, "grad_norm": 0.014820207841694355, "learning_rate": 8.715557047901174e-05, "loss": 0.0155, "step": 2167 }, { "epoch": 1.5499553172475424, "grad_norm": 0.014987374655902386, "learning_rate": 8.713885743491326e-05, "loss": 0.0128, "step": 2168 }, { "epoch": 1.5506702412868631, "grad_norm": 0.018574830144643784, "learning_rate": 8.712213512915328e-05, "loss": 0.0203, "step": 2169 }, { "epoch": 1.551385165326184, "grad_norm": 0.013559440150856972, "learning_rate": 8.710540356590198e-05, "loss": 0.0134, "step": 2170 }, { "epoch": 1.551385165326184, "eval_loss": 0.01133913453668356, "eval_runtime": 4.5849, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 2170 }, { "epoch": 1.552100089365505, "grad_norm": 0.012237926945090294, "learning_rate": 8.70886627493319e-05, "loss": 0.011, "step": 2171 }, { "epoch": 1.5528150134048258, "grad_norm": 0.015585258603096008, "learning_rate": 8.707191268361779e-05, "loss": 0.015, "step": 2172 }, { "epoch": 1.5535299374441465, "grad_norm": 0.013422444462776184, 
"learning_rate": 8.705515337293681e-05, "loss": 0.0152, "step": 2173 }, { "epoch": 1.5542448614834674, "grad_norm": 0.018750762566924095, "learning_rate": 8.703838482146838e-05, "loss": 0.0183, "step": 2174 }, { "epoch": 1.5549597855227884, "grad_norm": 0.01591338962316513, "learning_rate": 8.702160703339422e-05, "loss": 0.0179, "step": 2175 }, { "epoch": 1.5549597855227884, "eval_loss": 0.011232437565922737, "eval_runtime": 4.5861, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 2175 }, { "epoch": 1.555674709562109, "grad_norm": 0.0144593371078372, "learning_rate": 8.700482001289837e-05, "loss": 0.0164, "step": 2176 }, { "epoch": 1.5563896336014298, "grad_norm": 0.017804060131311417, "learning_rate": 8.698802376416718e-05, "loss": 0.0213, "step": 2177 }, { "epoch": 1.5571045576407507, "grad_norm": 0.018191704526543617, "learning_rate": 8.697121829138924e-05, "loss": 0.0247, "step": 2178 }, { "epoch": 1.5578194816800714, "grad_norm": 0.018138576298952103, "learning_rate": 8.695440359875554e-05, "loss": 0.0153, "step": 2179 }, { "epoch": 1.5585344057193922, "grad_norm": 0.016402635723352432, "learning_rate": 8.693757969045927e-05, "loss": 0.0164, "step": 2180 }, { "epoch": 1.5585344057193922, "eval_loss": 0.010903984308242798, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 2180 }, { "epoch": 1.559249329758713, "grad_norm": 0.014967083930969238, "learning_rate": 8.692074657069603e-05, "loss": 0.0189, "step": 2181 }, { "epoch": 1.559964253798034, "grad_norm": 0.01576434075832367, "learning_rate": 8.690390424366357e-05, "loss": 0.0171, "step": 2182 }, { "epoch": 1.5606791778373548, "grad_norm": 0.01491315197199583, "learning_rate": 8.688705271356208e-05, "loss": 0.017, "step": 2183 }, { "epoch": 1.5613941018766755, "grad_norm": 0.019803037866950035, "learning_rate": 8.687019198459394e-05, "loss": 0.012, "step": 2184 }, { "epoch": 1.5621090259159964, "grad_norm": 0.012430020608007908, "learning_rate": 8.68533220609639e-05, "loss": 0.0091, "step": 2185 }, { "epoch": 1.5621090259159964, "eval_loss": 0.010769435204565525, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2185 }, { "epoch": 1.5628239499553174, "grad_norm": 0.014351729303598404, "learning_rate": 8.683644294687894e-05, "loss": 0.0131, "step": 2186 }, { "epoch": 1.563538873994638, "grad_norm": 0.017979100346565247, "learning_rate": 8.681955464654839e-05, "loss": 0.0176, "step": 2187 }, { "epoch": 1.5642537980339588, "grad_norm": 0.013340883888304234, "learning_rate": 8.680265716418382e-05, "loss": 0.0128, "step": 2188 }, { "epoch": 1.5649687220732797, "grad_norm": 0.013527237810194492, "learning_rate": 8.678575050399912e-05, "loss": 0.0136, "step": 2189 }, { "epoch": 1.5656836461126007, "grad_norm": 0.017776992172002792, "learning_rate": 8.676883467021046e-05, "loss": 0.0129, "step": 2190 }, { "epoch": 1.5656836461126007, "eval_loss": 0.010873227380216122, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 2190 }, { "epoch": 1.5663985701519214, "grad_norm": 0.01147503312677145, "learning_rate": 8.67519096670363e-05, "loss": 0.0132, "step": 2191 }, { "epoch": 1.5671134941912421, "grad_norm": 0.01296315249055624, "learning_rate": 8.673497549869738e-05, "loss": 0.0161, "step": 2192 }, { "epoch": 1.567828418230563, "grad_norm": 0.017218630760908127, "learning_rate": 8.671803216941674e-05, "loss": 0.0263, "step": 2193 }, { "epoch": 1.5685433422698838, "grad_norm": 
0.019950630143284798, "learning_rate": 8.670107968341969e-05, "loss": 0.0159, "step": 2194 }, { "epoch": 1.5692582663092045, "grad_norm": 0.015324492938816547, "learning_rate": 8.668411804493384e-05, "loss": 0.0149, "step": 2195 }, { "epoch": 1.5692582663092045, "eval_loss": 0.01114950142800808, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2195 }, { "epoch": 1.5699731903485254, "grad_norm": 0.01685805432498455, "learning_rate": 8.666714725818903e-05, "loss": 0.0118, "step": 2196 }, { "epoch": 1.5706881143878464, "grad_norm": 0.014036549255251884, "learning_rate": 8.665016732741747e-05, "loss": 0.0163, "step": 2197 }, { "epoch": 1.571403038427167, "grad_norm": 0.019297273829579353, "learning_rate": 8.663317825685358e-05, "loss": 0.0111, "step": 2198 }, { "epoch": 1.5721179624664878, "grad_norm": 0.014156295917928219, "learning_rate": 8.661618005073411e-05, "loss": 0.0105, "step": 2199 }, { "epoch": 1.5728328865058088, "grad_norm": 0.0141457449644804, "learning_rate": 8.659917271329801e-05, "loss": 0.0144, "step": 2200 }, { "epoch": 1.5728328865058088, "eval_loss": 0.011396141722798347, "eval_runtime": 4.5911, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 2200 }, { "epoch": 1.5735478105451297, "grad_norm": 0.01621358096599579, "learning_rate": 8.65821562487866e-05, "loss": 0.0145, "step": 2201 }, { "epoch": 1.5742627345844504, "grad_norm": 0.017537429928779602, "learning_rate": 8.656513066144341e-05, "loss": 0.0121, "step": 2202 }, { "epoch": 1.5749776586237711, "grad_norm": 0.02146010473370552, "learning_rate": 8.654809595551429e-05, "loss": 0.0168, "step": 2203 }, { "epoch": 1.575692582663092, "grad_norm": 0.015277513302862644, "learning_rate": 8.653105213524732e-05, "loss": 0.0173, "step": 2204 }, { "epoch": 1.576407506702413, "grad_norm": 0.0168246291577816, "learning_rate": 8.651399920489289e-05, "loss": 0.0158, "step": 2205 }, { "epoch": 1.576407506702413, "eval_loss": 0.011276151053607464, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 2205 }, { "epoch": 1.5771224307417335, "grad_norm": 0.019784091040492058, "learning_rate": 8.649693716870364e-05, "loss": 0.0111, "step": 2206 }, { "epoch": 1.5778373547810545, "grad_norm": 0.016722366213798523, "learning_rate": 8.64798660309345e-05, "loss": 0.0128, "step": 2207 }, { "epoch": 1.5785522788203754, "grad_norm": 0.017475847154855728, "learning_rate": 8.646278579584265e-05, "loss": 0.0177, "step": 2208 }, { "epoch": 1.5792672028596961, "grad_norm": 0.021793821826577187, "learning_rate": 8.644569646768755e-05, "loss": 0.0183, "step": 2209 }, { "epoch": 1.5799821268990168, "grad_norm": 0.011795331723988056, "learning_rate": 8.642859805073089e-05, "loss": 0.0161, "step": 2210 }, { "epoch": 1.5799821268990168, "eval_loss": 0.011057569645345211, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2210 }, { "epoch": 1.5806970509383378, "grad_norm": 0.01701347902417183, "learning_rate": 8.641149054923673e-05, "loss": 0.0129, "step": 2211 }, { "epoch": 1.5814119749776587, "grad_norm": 0.018863892182707787, "learning_rate": 8.639437396747127e-05, "loss": 0.0122, "step": 2212 }, { "epoch": 1.5821268990169794, "grad_norm": 0.01389476377516985, "learning_rate": 8.637724830970306e-05, "loss": 0.0179, "step": 2213 }, { "epoch": 1.5828418230563002, "grad_norm": 0.016220679506659508, "learning_rate": 8.636011358020287e-05, "loss": 0.0167, "step": 2214 }, { "epoch": 
1.583556747095621, "grad_norm": 0.019674545153975487, "learning_rate": 8.634296978324373e-05, "loss": 0.0126, "step": 2215 }, { "epoch": 1.583556747095621, "eval_loss": 0.01114159356802702, "eval_runtime": 4.5943, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.83, "step": 2215 }, { "epoch": 1.584271671134942, "grad_norm": 0.014163116924464703, "learning_rate": 8.6325816923101e-05, "loss": 0.0162, "step": 2216 }, { "epoch": 1.5849865951742628, "grad_norm": 0.01769358478486538, "learning_rate": 8.630865500405219e-05, "loss": 0.0175, "step": 2217 }, { "epoch": 1.5857015192135835, "grad_norm": 0.014712394215166569, "learning_rate": 8.629148403037716e-05, "loss": 0.0113, "step": 2218 }, { "epoch": 1.5864164432529044, "grad_norm": 0.016157696023583412, "learning_rate": 8.627430400635799e-05, "loss": 0.0122, "step": 2219 }, { "epoch": 1.5871313672922251, "grad_norm": 0.01583644188940525, "learning_rate": 8.625711493627901e-05, "loss": 0.0086, "step": 2220 }, { "epoch": 1.5871313672922251, "eval_loss": 0.01104170735925436, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2220 }, { "epoch": 1.5878462913315459, "grad_norm": 0.015536102466285229, "learning_rate": 8.623991682442684e-05, "loss": 0.0178, "step": 2221 }, { "epoch": 1.5885612153708668, "grad_norm": 0.017385797575116158, "learning_rate": 8.622270967509032e-05, "loss": 0.0144, "step": 2222 }, { "epoch": 1.5892761394101877, "grad_norm": 0.02080448530614376, "learning_rate": 8.620549349256056e-05, "loss": 0.0214, "step": 2223 }, { "epoch": 1.5899910634495085, "grad_norm": 0.026200320571660995, "learning_rate": 8.618826828113091e-05, "loss": 0.0127, "step": 2224 }, { "epoch": 1.5907059874888292, "grad_norm": 0.014563824981451035, "learning_rate": 8.617103404509698e-05, "loss": 0.0157, "step": 2225 }, { "epoch": 1.5907059874888292, "eval_loss": 0.011184638366103172, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 2225 }, { "epoch": 1.5914209115281501, "grad_norm": 0.01507546752691269, "learning_rate": 8.615379078875663e-05, "loss": 0.0108, "step": 2226 }, { "epoch": 1.592135835567471, "grad_norm": 0.01899881474673748, "learning_rate": 8.613653851641002e-05, "loss": 0.0167, "step": 2227 }, { "epoch": 1.5928507596067918, "grad_norm": 0.019300954416394234, "learning_rate": 8.611927723235943e-05, "loss": 0.0176, "step": 2228 }, { "epoch": 1.5935656836461125, "grad_norm": 0.013776577077805996, "learning_rate": 8.610200694090951e-05, "loss": 0.0172, "step": 2229 }, { "epoch": 1.5942806076854334, "grad_norm": 0.01794934831559658, "learning_rate": 8.608472764636712e-05, "loss": 0.0165, "step": 2230 }, { "epoch": 1.5942806076854334, "eval_loss": 0.011386082507669926, "eval_runtime": 4.5781, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 2230 }, { "epoch": 1.5949955317247544, "grad_norm": 0.018001167103648186, "learning_rate": 8.606743935304134e-05, "loss": 0.0117, "step": 2231 }, { "epoch": 1.595710455764075, "grad_norm": 0.017877545207738876, "learning_rate": 8.605014206524351e-05, "loss": 0.0112, "step": 2232 }, { "epoch": 1.5964253798033958, "grad_norm": 0.011906291358172894, "learning_rate": 8.603283578728723e-05, "loss": 0.0151, "step": 2233 }, { "epoch": 1.5971403038427168, "grad_norm": 0.013241138309240341, "learning_rate": 8.601552052348832e-05, "loss": 0.011, "step": 2234 }, { "epoch": 1.5978552278820375, "grad_norm": 0.012979471124708652, "learning_rate": 8.599819627816485e-05, "loss": 0.0146, "step": 
2235 }, { "epoch": 1.5978552278820375, "eval_loss": 0.011457348242402077, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 2235 }, { "epoch": 1.5985701519213582, "grad_norm": 0.018410420045256615, "learning_rate": 8.598086305563713e-05, "loss": 0.02, "step": 2236 }, { "epoch": 1.5992850759606791, "grad_norm": 0.016119299456477165, "learning_rate": 8.59635208602277e-05, "loss": 0.0106, "step": 2237 }, { "epoch": 1.6, "grad_norm": 0.018729232251644135, "learning_rate": 8.594616969626134e-05, "loss": 0.0147, "step": 2238 }, { "epoch": 1.6007149240393208, "grad_norm": 0.01666508987545967, "learning_rate": 8.592880956806509e-05, "loss": 0.0128, "step": 2239 }, { "epoch": 1.6014298480786415, "grad_norm": 0.014201982878148556, "learning_rate": 8.591144047996817e-05, "loss": 0.0152, "step": 2240 }, { "epoch": 1.6014298480786415, "eval_loss": 0.011347577907145023, "eval_runtime": 4.5928, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.83, "step": 2240 }, { "epoch": 1.6021447721179625, "grad_norm": 0.014753506518900394, "learning_rate": 8.589406243630211e-05, "loss": 0.0121, "step": 2241 }, { "epoch": 1.6028596961572834, "grad_norm": 0.015352652408182621, "learning_rate": 8.587667544140062e-05, "loss": 0.0139, "step": 2242 }, { "epoch": 1.6035746201966041, "grad_norm": 0.014410875737667084, "learning_rate": 8.585927949959966e-05, "loss": 0.0125, "step": 2243 }, { "epoch": 1.6042895442359248, "grad_norm": 0.017414821311831474, "learning_rate": 8.58418746152374e-05, "loss": 0.0129, "step": 2244 }, { "epoch": 1.6050044682752458, "grad_norm": 0.017056630924344063, "learning_rate": 8.582446079265427e-05, "loss": 0.0117, "step": 2245 }, { "epoch": 1.6050044682752458, "eval_loss": 0.011278598569333553, "eval_runtime": 4.5893, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 2245 }, { "epoch": 1.6057193923145667, "grad_norm": 0.013695457950234413, "learning_rate": 8.580703803619292e-05, "loss": 0.012, "step": 2246 }, { "epoch": 1.6064343163538874, "grad_norm": 0.015647750347852707, "learning_rate": 8.578960635019823e-05, "loss": 0.0112, "step": 2247 }, { "epoch": 1.6071492403932082, "grad_norm": 0.014027469791471958, "learning_rate": 8.577216573901727e-05, "loss": 0.0155, "step": 2248 }, { "epoch": 1.607864164432529, "grad_norm": 0.018976852297782898, "learning_rate": 8.575471620699941e-05, "loss": 0.0107, "step": 2249 }, { "epoch": 1.6085790884718498, "grad_norm": 0.0173083133995533, "learning_rate": 8.573725775849617e-05, "loss": 0.0177, "step": 2250 }, { "epoch": 1.6085790884718498, "eval_loss": 0.011185950599610806, "eval_runtime": 4.5912, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.832, "step": 2250 }, { "epoch": 1.6092940125111705, "grad_norm": 0.011129425838589668, "learning_rate": 8.571979039786135e-05, "loss": 0.0131, "step": 2251 }, { "epoch": 1.6100089365504915, "grad_norm": 0.013244668021798134, "learning_rate": 8.570231412945091e-05, "loss": 0.017, "step": 2252 }, { "epoch": 1.6107238605898124, "grad_norm": 0.013348257169127464, "learning_rate": 8.568482895762311e-05, "loss": 0.0168, "step": 2253 }, { "epoch": 1.6114387846291331, "grad_norm": 0.01503976434469223, "learning_rate": 8.566733488673837e-05, "loss": 0.0122, "step": 2254 }, { "epoch": 1.6121537086684539, "grad_norm": 0.011789894662797451, "learning_rate": 8.564983192115934e-05, "loss": 0.0135, "step": 2255 }, { "epoch": 1.6121537086684539, "eval_loss": 0.011295663192868233, "eval_runtime": 4.5833, "eval_samples_per_second": 
10.909, "eval_steps_per_second": 2.836, "step": 2255 }, { "epoch": 1.6128686327077748, "grad_norm": 0.017724066972732544, "learning_rate": 8.563232006525092e-05, "loss": 0.0216, "step": 2256 }, { "epoch": 1.6135835567470957, "grad_norm": 0.012789353728294373, "learning_rate": 8.561479932338019e-05, "loss": 0.0121, "step": 2257 }, { "epoch": 1.6142984807864165, "grad_norm": 0.014462786726653576, "learning_rate": 8.559726969991647e-05, "loss": 0.0155, "step": 2258 }, { "epoch": 1.6150134048257372, "grad_norm": 0.015642331913113594, "learning_rate": 8.557973119923126e-05, "loss": 0.011, "step": 2259 }, { "epoch": 1.6157283288650581, "grad_norm": 0.017616719007492065, "learning_rate": 8.556218382569833e-05, "loss": 0.0148, "step": 2260 }, { "epoch": 1.6157283288650581, "eval_loss": 0.011293599382042885, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 2260 }, { "epoch": 1.616443252904379, "grad_norm": 0.01163551863282919, "learning_rate": 8.554462758369361e-05, "loss": 0.0107, "step": 2261 }, { "epoch": 1.6171581769436996, "grad_norm": 0.017752226442098618, "learning_rate": 8.552706247759528e-05, "loss": 0.0201, "step": 2262 }, { "epoch": 1.6178731009830205, "grad_norm": 0.014781419187784195, "learning_rate": 8.550948851178369e-05, "loss": 0.0143, "step": 2263 }, { "epoch": 1.6185880250223414, "grad_norm": 0.015706805512309074, "learning_rate": 8.549190569064143e-05, "loss": 0.0134, "step": 2264 }, { "epoch": 1.6193029490616622, "grad_norm": 0.019430579617619514, "learning_rate": 8.547431401855332e-05, "loss": 0.0152, "step": 2265 }, { "epoch": 1.6193029490616622, "eval_loss": 0.011436736211180687, "eval_runtime": 4.626, "eval_samples_per_second": 10.808, "eval_steps_per_second": 2.81, "step": 2265 }, { "epoch": 1.6200178731009829, "grad_norm": 0.015216940082609653, "learning_rate": 8.545671349990634e-05, "loss": 0.0111, "step": 2266 }, { "epoch": 1.6207327971403038, "grad_norm": 0.016067253425717354, "learning_rate": 8.543910413908967e-05, "loss": 0.0156, "step": 2267 }, { "epoch": 1.6214477211796248, "grad_norm": 0.01605910435318947, "learning_rate": 8.542148594049475e-05, "loss": 0.012, "step": 2268 }, { "epoch": 1.6221626452189455, "grad_norm": 0.01721440628170967, "learning_rate": 8.540385890851519e-05, "loss": 0.0179, "step": 2269 }, { "epoch": 1.6228775692582662, "grad_norm": 0.014148073270916939, "learning_rate": 8.53862230475468e-05, "loss": 0.0152, "step": 2270 }, { "epoch": 1.6228775692582662, "eval_loss": 0.01130815502256155, "eval_runtime": 4.5944, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.83, "step": 2270 }, { "epoch": 1.6235924932975871, "grad_norm": 0.012006672099232674, "learning_rate": 8.536857836198759e-05, "loss": 0.0173, "step": 2271 }, { "epoch": 1.624307417336908, "grad_norm": 0.015298847109079361, "learning_rate": 8.535092485623779e-05, "loss": 0.0129, "step": 2272 }, { "epoch": 1.6250223413762288, "grad_norm": 0.013278691098093987, "learning_rate": 8.533326253469982e-05, "loss": 0.0138, "step": 2273 }, { "epoch": 1.6257372654155495, "grad_norm": 0.01795518398284912, "learning_rate": 8.531559140177828e-05, "loss": 0.0161, "step": 2274 }, { "epoch": 1.6264521894548705, "grad_norm": 0.015622321516275406, "learning_rate": 8.529791146188002e-05, "loss": 0.0239, "step": 2275 }, { "epoch": 1.6264521894548705, "eval_loss": 0.011181721463799477, "eval_runtime": 4.5928, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 2275 }, { "epoch": 1.6271671134941914, "grad_norm": 
0.017816556617617607, "learning_rate": 8.528022271941402e-05, "loss": 0.0122, "step": 2276 }, { "epoch": 1.627882037533512, "grad_norm": 0.016512850299477577, "learning_rate": 8.52625251787915e-05, "loss": 0.0194, "step": 2277 }, { "epoch": 1.6285969615728328, "grad_norm": 0.014507658779621124, "learning_rate": 8.524481884442583e-05, "loss": 0.0099, "step": 2278 }, { "epoch": 1.6293118856121538, "grad_norm": 0.015613497234880924, "learning_rate": 8.522710372073266e-05, "loss": 0.0121, "step": 2279 }, { "epoch": 1.6300268096514745, "grad_norm": 0.013795871287584305, "learning_rate": 8.520937981212974e-05, "loss": 0.0124, "step": 2280 }, { "epoch": 1.6300268096514745, "eval_loss": 0.011111993342638016, "eval_runtime": 4.5891, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 2280 }, { "epoch": 1.6307417336907952, "grad_norm": 0.016726132482290268, "learning_rate": 8.519164712303702e-05, "loss": 0.0129, "step": 2281 }, { "epoch": 1.6314566577301162, "grad_norm": 0.017614290118217468, "learning_rate": 8.51739056578767e-05, "loss": 0.0151, "step": 2282 }, { "epoch": 1.632171581769437, "grad_norm": 0.021106600761413574, "learning_rate": 8.515615542107316e-05, "loss": 0.0185, "step": 2283 }, { "epoch": 1.6328865058087578, "grad_norm": 0.01868121139705181, "learning_rate": 8.513839641705288e-05, "loss": 0.0162, "step": 2284 }, { "epoch": 1.6336014298480785, "grad_norm": 0.018156731501221657, "learning_rate": 8.512062865024462e-05, "loss": 0.0156, "step": 2285 }, { "epoch": 1.6336014298480785, "eval_loss": 0.010856620036065578, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 2285 }, { "epoch": 1.6343163538873995, "grad_norm": 0.013736449182033539, "learning_rate": 8.510285212507929e-05, "loss": 0.0152, "step": 2286 }, { "epoch": 1.6350312779267204, "grad_norm": 0.012741763144731522, "learning_rate": 8.508506684599e-05, "loss": 0.014, "step": 2287 }, { "epoch": 1.6357462019660411, "grad_norm": 0.01842685602605343, "learning_rate": 8.5067272817412e-05, "loss": 0.0158, "step": 2288 }, { "epoch": 1.6364611260053619, "grad_norm": 0.017714593559503555, "learning_rate": 8.504947004378276e-05, "loss": 0.0167, "step": 2289 }, { "epoch": 1.6371760500446828, "grad_norm": 0.018763793632388115, "learning_rate": 8.503165852954194e-05, "loss": 0.0161, "step": 2290 }, { "epoch": 1.6371760500446828, "eval_loss": 0.010794928297400475, "eval_runtime": 4.5936, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 2290 }, { "epoch": 1.6378909740840035, "grad_norm": 0.014810653403401375, "learning_rate": 8.501383827913137e-05, "loss": 0.0135, "step": 2291 }, { "epoch": 1.6386058981233242, "grad_norm": 0.011924704536795616, "learning_rate": 8.4996009296995e-05, "loss": 0.0165, "step": 2292 }, { "epoch": 1.6393208221626452, "grad_norm": 0.013361934572458267, "learning_rate": 8.497817158757906e-05, "loss": 0.0141, "step": 2293 }, { "epoch": 1.6400357462019661, "grad_norm": 0.020856572315096855, "learning_rate": 8.49603251553319e-05, "loss": 0.0191, "step": 2294 }, { "epoch": 1.6407506702412868, "grad_norm": 0.01596495509147644, "learning_rate": 8.494247000470403e-05, "loss": 0.0143, "step": 2295 }, { "epoch": 1.6407506702412868, "eval_loss": 0.010852164588868618, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 2295 }, { "epoch": 1.6414655942806076, "grad_norm": 0.01863791234791279, "learning_rate": 8.492460614014817e-05, "loss": 0.0191, "step": 2296 }, { "epoch": 
1.6421805183199285, "grad_norm": 0.011735532432794571, "learning_rate": 8.490673356611917e-05, "loss": 0.0082, "step": 2297 }, { "epoch": 1.6428954423592494, "grad_norm": 0.017362071201205254, "learning_rate": 8.488885228707413e-05, "loss": 0.021, "step": 2298 }, { "epoch": 1.6436103663985702, "grad_norm": 0.021329142153263092, "learning_rate": 8.487096230747223e-05, "loss": 0.0224, "step": 2299 }, { "epoch": 1.6443252904378909, "grad_norm": 0.015016310848295689, "learning_rate": 8.485306363177485e-05, "loss": 0.0113, "step": 2300 }, { "epoch": 1.6443252904378909, "eval_loss": 0.010834638960659504, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 2300 }, { "epoch": 1.6450402144772118, "grad_norm": 0.015702463686466217, "learning_rate": 8.48351562644456e-05, "loss": 0.0149, "step": 2301 }, { "epoch": 1.6457551385165328, "grad_norm": 0.017181186005473137, "learning_rate": 8.481724020995017e-05, "loss": 0.0151, "step": 2302 }, { "epoch": 1.6464700625558535, "grad_norm": 0.015920452773571014, "learning_rate": 8.479931547275644e-05, "loss": 0.0156, "step": 2303 }, { "epoch": 1.6471849865951742, "grad_norm": 0.01652446947991848, "learning_rate": 8.47813820573345e-05, "loss": 0.0182, "step": 2304 }, { "epoch": 1.6478999106344951, "grad_norm": 0.01925748959183693, "learning_rate": 8.476343996815657e-05, "loss": 0.0176, "step": 2305 }, { "epoch": 1.6478999106344951, "eval_loss": 0.010928979143500328, "eval_runtime": 4.6392, "eval_samples_per_second": 10.778, "eval_steps_per_second": 2.802, "step": 2305 }, { "epoch": 1.6486148346738159, "grad_norm": 0.013659097254276276, "learning_rate": 8.4745489209697e-05, "loss": 0.0145, "step": 2306 }, { "epoch": 1.6493297587131366, "grad_norm": 0.012992543168365955, "learning_rate": 8.472752978643239e-05, "loss": 0.0109, "step": 2307 }, { "epoch": 1.6500446827524575, "grad_norm": 0.014803266152739525, "learning_rate": 8.470956170284142e-05, "loss": 0.0186, "step": 2308 }, { "epoch": 1.6507596067917785, "grad_norm": 0.01911507546901703, "learning_rate": 8.469158496340495e-05, "loss": 0.0114, "step": 2309 }, { "epoch": 1.6514745308310992, "grad_norm": 0.017780568450689316, "learning_rate": 8.4673599572606e-05, "loss": 0.0142, "step": 2310 }, { "epoch": 1.6514745308310992, "eval_loss": 0.010989010334014893, "eval_runtime": 4.5962, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 2310 }, { "epoch": 1.65218945487042, "grad_norm": 0.015416515991091728, "learning_rate": 8.465560553492977e-05, "loss": 0.0136, "step": 2311 }, { "epoch": 1.6529043789097408, "grad_norm": 0.013474310748279095, "learning_rate": 8.463760285486361e-05, "loss": 0.0139, "step": 2312 }, { "epoch": 1.6536193029490618, "grad_norm": 0.022851906716823578, "learning_rate": 8.4619591536897e-05, "loss": 0.0143, "step": 2313 }, { "epoch": 1.6543342269883825, "grad_norm": 0.012791934423148632, "learning_rate": 8.46015715855216e-05, "loss": 0.0112, "step": 2314 }, { "epoch": 1.6550491510277032, "grad_norm": 0.01867012120783329, "learning_rate": 8.45835430052312e-05, "loss": 0.0158, "step": 2315 }, { "epoch": 1.6550491510277032, "eval_loss": 0.01095092948526144, "eval_runtime": 4.5867, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 2315 }, { "epoch": 1.6557640750670242, "grad_norm": 0.015033258125185966, "learning_rate": 8.456550580052176e-05, "loss": 0.015, "step": 2316 }, { "epoch": 1.656478999106345, "grad_norm": 0.01281016319990158, "learning_rate": 8.454745997589139e-05, "loss": 0.0113, "step": 2317 
}, { "epoch": 1.6571939231456658, "grad_norm": 0.014308668673038483, "learning_rate": 8.452940553584032e-05, "loss": 0.0162, "step": 2318 }, { "epoch": 1.6579088471849865, "grad_norm": 0.010697108693420887, "learning_rate": 8.451134248487101e-05, "loss": 0.0084, "step": 2319 }, { "epoch": 1.6586237712243075, "grad_norm": 0.012463596649467945, "learning_rate": 8.449327082748795e-05, "loss": 0.0163, "step": 2320 }, { "epoch": 1.6586237712243075, "eval_loss": 0.010748540051281452, "eval_runtime": 4.6169, "eval_samples_per_second": 10.83, "eval_steps_per_second": 2.816, "step": 2320 }, { "epoch": 1.6593386952636282, "grad_norm": 0.013778341002762318, "learning_rate": 8.447519056819785e-05, "loss": 0.0182, "step": 2321 }, { "epoch": 1.660053619302949, "grad_norm": 0.017383981496095657, "learning_rate": 8.445710171150958e-05, "loss": 0.0194, "step": 2322 }, { "epoch": 1.6607685433422699, "grad_norm": 0.015592120587825775, "learning_rate": 8.443900426193409e-05, "loss": 0.0192, "step": 2323 }, { "epoch": 1.6614834673815908, "grad_norm": 0.016134629026055336, "learning_rate": 8.442089822398455e-05, "loss": 0.0149, "step": 2324 }, { "epoch": 1.6621983914209115, "grad_norm": 0.015863843262195587, "learning_rate": 8.44027836021762e-05, "loss": 0.0127, "step": 2325 }, { "epoch": 1.6621983914209115, "eval_loss": 0.01060405746102333, "eval_runtime": 4.5914, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 2325 }, { "epoch": 1.6629133154602322, "grad_norm": 0.019991569221019745, "learning_rate": 8.438466040102646e-05, "loss": 0.0186, "step": 2326 }, { "epoch": 1.6636282394995532, "grad_norm": 0.015145243145525455, "learning_rate": 8.436652862505487e-05, "loss": 0.018, "step": 2327 }, { "epoch": 1.6643431635388741, "grad_norm": 0.017267033457756042, "learning_rate": 8.434838827878315e-05, "loss": 0.0169, "step": 2328 }, { "epoch": 1.6650580875781948, "grad_norm": 0.013400926254689693, "learning_rate": 8.433023936673509e-05, "loss": 0.0092, "step": 2329 }, { "epoch": 1.6657730116175156, "grad_norm": 0.014779197983443737, "learning_rate": 8.43120818934367e-05, "loss": 0.0175, "step": 2330 }, { "epoch": 1.6657730116175156, "eval_loss": 0.01079830527305603, "eval_runtime": 4.6119, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 2330 }, { "epoch": 1.6664879356568365, "grad_norm": 0.019061364233493805, "learning_rate": 8.4293915863416e-05, "loss": 0.0197, "step": 2331 }, { "epoch": 1.6672028596961574, "grad_norm": 0.028157461434602737, "learning_rate": 8.42757412812033e-05, "loss": 0.0189, "step": 2332 }, { "epoch": 1.667917783735478, "grad_norm": 0.017874307930469513, "learning_rate": 8.425755815133092e-05, "loss": 0.0176, "step": 2333 }, { "epoch": 1.6686327077747989, "grad_norm": 0.014968289993703365, "learning_rate": 8.423936647833338e-05, "loss": 0.0109, "step": 2334 }, { "epoch": 1.6693476318141198, "grad_norm": 0.028066394850611687, "learning_rate": 8.422116626674727e-05, "loss": 0.0334, "step": 2335 }, { "epoch": 1.6693476318141198, "eval_loss": 0.010645866394042969, "eval_runtime": 4.5892, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 2335 }, { "epoch": 1.6700625558534405, "grad_norm": 0.015175661072134972, "learning_rate": 8.420295752111138e-05, "loss": 0.0115, "step": 2336 }, { "epoch": 1.6707774798927613, "grad_norm": 0.016600806266069412, "learning_rate": 8.418474024596658e-05, "loss": 0.0167, "step": 2337 }, { "epoch": 1.6714924039320822, "grad_norm": 0.016870975494384766, "learning_rate": 8.41665144458559e-05, 
"loss": 0.0159, "step": 2338 }, { "epoch": 1.6722073279714031, "grad_norm": 0.017026536166667938, "learning_rate": 8.414828012532446e-05, "loss": 0.0176, "step": 2339 }, { "epoch": 1.6729222520107239, "grad_norm": 0.017571929842233658, "learning_rate": 8.413003728891953e-05, "loss": 0.0134, "step": 2340 }, { "epoch": 1.6729222520107239, "eval_loss": 0.010611234232783318, "eval_runtime": 4.5917, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 2340 }, { "epoch": 1.6736371760500446, "grad_norm": 0.018536221235990524, "learning_rate": 8.411178594119047e-05, "loss": 0.0164, "step": 2341 }, { "epoch": 1.6743521000893655, "grad_norm": 0.024907827377319336, "learning_rate": 8.409352608668882e-05, "loss": 0.0202, "step": 2342 }, { "epoch": 1.6750670241286865, "grad_norm": 0.022625645622611046, "learning_rate": 8.407525772996818e-05, "loss": 0.0214, "step": 2343 }, { "epoch": 1.6757819481680072, "grad_norm": 0.015815047547221184, "learning_rate": 8.405698087558432e-05, "loss": 0.0109, "step": 2344 }, { "epoch": 1.676496872207328, "grad_norm": 0.019424064084887505, "learning_rate": 8.403869552809512e-05, "loss": 0.0245, "step": 2345 }, { "epoch": 1.676496872207328, "eval_loss": 0.010615291073918343, "eval_runtime": 4.5758, "eval_samples_per_second": 10.927, "eval_steps_per_second": 2.841, "step": 2345 }, { "epoch": 1.6772117962466488, "grad_norm": 0.0128397848457098, "learning_rate": 8.402040169206054e-05, "loss": 0.0169, "step": 2346 }, { "epoch": 1.6779267202859696, "grad_norm": 0.01749984733760357, "learning_rate": 8.400209937204269e-05, "loss": 0.0127, "step": 2347 }, { "epoch": 1.6786416443252903, "grad_norm": 0.015596360899508, "learning_rate": 8.39837885726058e-05, "loss": 0.0219, "step": 2348 }, { "epoch": 1.6793565683646112, "grad_norm": 0.017561854794621468, "learning_rate": 8.39654692983162e-05, "loss": 0.011, "step": 2349 }, { "epoch": 1.6800714924039322, "grad_norm": 0.016544105485081673, "learning_rate": 8.394714155374233e-05, "loss": 0.0168, "step": 2350 }, { "epoch": 1.6800714924039322, "eval_loss": 0.010596240870654583, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 2350 }, { "epoch": 1.6807864164432529, "grad_norm": 0.014516635797917843, "learning_rate": 8.392880534345477e-05, "loss": 0.0142, "step": 2351 }, { "epoch": 1.6815013404825736, "grad_norm": 0.013985374011099339, "learning_rate": 8.391046067202618e-05, "loss": 0.0117, "step": 2352 }, { "epoch": 1.6822162645218945, "grad_norm": 0.014563758857548237, "learning_rate": 8.38921075440313e-05, "loss": 0.01, "step": 2353 }, { "epoch": 1.6829311885612155, "grad_norm": 0.011800700798630714, "learning_rate": 8.387374596404709e-05, "loss": 0.0127, "step": 2354 }, { "epoch": 1.6836461126005362, "grad_norm": 0.013189309276640415, "learning_rate": 8.38553759366525e-05, "loss": 0.0106, "step": 2355 }, { "epoch": 1.6836461126005362, "eval_loss": 0.010548148304224014, "eval_runtime": 4.5899, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 2355 }, { "epoch": 1.684361036639857, "grad_norm": 0.01527728047221899, "learning_rate": 8.383699746642866e-05, "loss": 0.0159, "step": 2356 }, { "epoch": 1.6850759606791779, "grad_norm": 0.015293664298951626, "learning_rate": 8.381861055795877e-05, "loss": 0.0129, "step": 2357 }, { "epoch": 1.6857908847184988, "grad_norm": 0.013507182709872723, "learning_rate": 8.380021521582813e-05, "loss": 0.0184, "step": 2358 }, { "epoch": 1.6865058087578195, "grad_norm": 0.01690330170094967, "learning_rate": 
8.378181144462417e-05, "loss": 0.0163, "step": 2359 }, { "epoch": 1.6872207327971402, "grad_norm": 0.01645725592970848, "learning_rate": 8.376339924893642e-05, "loss": 0.0092, "step": 2360 }, { "epoch": 1.6872207327971402, "eval_loss": 0.01070411130785942, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 2360 }, { "epoch": 1.6879356568364612, "grad_norm": 0.01390966959297657, "learning_rate": 8.374497863335648e-05, "loss": 0.011, "step": 2361 }, { "epoch": 1.688650580875782, "grad_norm": 0.015353269875049591, "learning_rate": 8.372654960247809e-05, "loss": 0.0131, "step": 2362 }, { "epoch": 1.6893655049151026, "grad_norm": 0.016273200511932373, "learning_rate": 8.370811216089704e-05, "loss": 0.0145, "step": 2363 }, { "epoch": 1.6900804289544236, "grad_norm": 0.014327640645205975, "learning_rate": 8.368966631321129e-05, "loss": 0.016, "step": 2364 }, { "epoch": 1.6907953529937445, "grad_norm": 0.014490857720375061, "learning_rate": 8.36712120640208e-05, "loss": 0.0219, "step": 2365 }, { "epoch": 1.6907953529937445, "eval_loss": 0.010735493153333664, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 2365 }, { "epoch": 1.6915102770330652, "grad_norm": 0.017634496092796326, "learning_rate": 8.365274941792772e-05, "loss": 0.0182, "step": 2366 }, { "epoch": 1.692225201072386, "grad_norm": 0.018182173371315002, "learning_rate": 8.363427837953621e-05, "loss": 0.0165, "step": 2367 }, { "epoch": 1.6929401251117069, "grad_norm": 0.019619299098849297, "learning_rate": 8.361579895345262e-05, "loss": 0.0196, "step": 2368 }, { "epoch": 1.6936550491510278, "grad_norm": 0.01923658885061741, "learning_rate": 8.359731114428528e-05, "loss": 0.0161, "step": 2369 }, { "epoch": 1.6943699731903485, "grad_norm": 0.014645435847342014, "learning_rate": 8.357881495664471e-05, "loss": 0.0175, "step": 2370 }, { "epoch": 1.6943699731903485, "eval_loss": 0.010536582209169865, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 2370 }, { "epoch": 1.6950848972296693, "grad_norm": 0.017734341323375702, "learning_rate": 8.356031039514343e-05, "loss": 0.0195, "step": 2371 }, { "epoch": 1.6957998212689902, "grad_norm": 0.013714492321014404, "learning_rate": 8.354179746439615e-05, "loss": 0.0189, "step": 2372 }, { "epoch": 1.6965147453083111, "grad_norm": 0.01999996043741703, "learning_rate": 8.352327616901956e-05, "loss": 0.0108, "step": 2373 }, { "epoch": 1.6972296693476319, "grad_norm": 0.016270382329821587, "learning_rate": 8.350474651363253e-05, "loss": 0.015, "step": 2374 }, { "epoch": 1.6979445933869526, "grad_norm": 0.013674041256308556, "learning_rate": 8.348620850285594e-05, "loss": 0.0114, "step": 2375 }, { "epoch": 1.6979445933869526, "eval_loss": 0.010650665499269962, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 2375 }, { "epoch": 1.6986595174262735, "grad_norm": 0.016592592000961304, "learning_rate": 8.346766214131281e-05, "loss": 0.0169, "step": 2376 }, { "epoch": 1.6993744414655942, "grad_norm": 0.015528705902397633, "learning_rate": 8.344910743362818e-05, "loss": 0.0165, "step": 2377 }, { "epoch": 1.700089365504915, "grad_norm": 0.01692471094429493, "learning_rate": 8.343054438442925e-05, "loss": 0.0155, "step": 2378 }, { "epoch": 1.700804289544236, "grad_norm": 0.019517678767442703, "learning_rate": 8.341197299834524e-05, "loss": 0.0099, "step": 2379 }, { "epoch": 1.7015192135835568, "grad_norm": 
0.015281673520803452, "learning_rate": 8.33933932800075e-05, "loss": 0.0124, "step": 2380 }, { "epoch": 1.7015192135835568, "eval_loss": 0.010568181984126568, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 2380 }, { "epoch": 1.7022341376228776, "grad_norm": 0.017123227939009666, "learning_rate": 8.337480523404937e-05, "loss": 0.0175, "step": 2381 }, { "epoch": 1.7029490616621983, "grad_norm": 0.014478391967713833, "learning_rate": 8.335620886510638e-05, "loss": 0.0122, "step": 2382 }, { "epoch": 1.7036639857015192, "grad_norm": 0.020471064373850822, "learning_rate": 8.333760417781605e-05, "loss": 0.0103, "step": 2383 }, { "epoch": 1.7043789097408402, "grad_norm": 0.017370913177728653, "learning_rate": 8.331899117681798e-05, "loss": 0.0219, "step": 2384 }, { "epoch": 1.7050938337801609, "grad_norm": 0.015578879043459892, "learning_rate": 8.330036986675392e-05, "loss": 0.0188, "step": 2385 }, { "epoch": 1.7050938337801609, "eval_loss": 0.01056011114269495, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 2385 }, { "epoch": 1.7058087578194816, "grad_norm": 0.016884764656424522, "learning_rate": 8.328174025226761e-05, "loss": 0.0127, "step": 2386 }, { "epoch": 1.7065236818588025, "grad_norm": 0.014704334549605846, "learning_rate": 8.326310233800487e-05, "loss": 0.0096, "step": 2387 }, { "epoch": 1.7072386058981235, "grad_norm": 0.020703019574284554, "learning_rate": 8.324445612861367e-05, "loss": 0.014, "step": 2388 }, { "epoch": 1.707953529937444, "grad_norm": 0.01798408105969429, "learning_rate": 8.322580162874391e-05, "loss": 0.0212, "step": 2389 }, { "epoch": 1.708668453976765, "grad_norm": 0.015238984487950802, "learning_rate": 8.32071388430477e-05, "loss": 0.0165, "step": 2390 }, { "epoch": 1.708668453976765, "eval_loss": 0.010572151280939579, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2390 }, { "epoch": 1.7093833780160859, "grad_norm": 0.01793661154806614, "learning_rate": 8.318846777617912e-05, "loss": 0.0248, "step": 2391 }, { "epoch": 1.7100983020554066, "grad_norm": 0.016769960522651672, "learning_rate": 8.316978843279437e-05, "loss": 0.0152, "step": 2392 }, { "epoch": 1.7108132260947273, "grad_norm": 0.016094427555799484, "learning_rate": 8.315110081755166e-05, "loss": 0.0152, "step": 2393 }, { "epoch": 1.7115281501340482, "grad_norm": 0.01874617487192154, "learning_rate": 8.313240493511132e-05, "loss": 0.0095, "step": 2394 }, { "epoch": 1.7122430741733692, "grad_norm": 0.01548713631927967, "learning_rate": 8.311370079013571e-05, "loss": 0.0298, "step": 2395 }, { "epoch": 1.7122430741733692, "eval_loss": 0.01048271730542183, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 2395 }, { "epoch": 1.71295799821269, "grad_norm": 0.014373309910297394, "learning_rate": 8.309498838728924e-05, "loss": 0.0089, "step": 2396 }, { "epoch": 1.7136729222520106, "grad_norm": 0.01605580560863018, "learning_rate": 8.307626773123843e-05, "loss": 0.0151, "step": 2397 }, { "epoch": 1.7143878462913316, "grad_norm": 0.015028044581413269, "learning_rate": 8.305753882665178e-05, "loss": 0.0194, "step": 2398 }, { "epoch": 1.7151027703306525, "grad_norm": 0.01690712571144104, "learning_rate": 8.303880167819993e-05, "loss": 0.0217, "step": 2399 }, { "epoch": 1.7158176943699732, "grad_norm": 0.013279139064252377, "learning_rate": 8.30200562905555e-05, "loss": 0.0123, "step": 2400 }, { "epoch": 
1.7158176943699732, "eval_loss": 0.01053064875304699, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 2400 }, { "epoch": 1.716532618409294, "grad_norm": 0.015278024598956108, "learning_rate": 8.300130266839322e-05, "loss": 0.0135, "step": 2401 }, { "epoch": 1.7172475424486149, "grad_norm": 0.013730419799685478, "learning_rate": 8.298254081638988e-05, "loss": 0.0168, "step": 2402 }, { "epoch": 1.7179624664879356, "grad_norm": 0.01855659857392311, "learning_rate": 8.296377073922426e-05, "loss": 0.0144, "step": 2403 }, { "epoch": 1.7186773905272563, "grad_norm": 0.019448667764663696, "learning_rate": 8.294499244157723e-05, "loss": 0.0158, "step": 2404 }, { "epoch": 1.7193923145665773, "grad_norm": 0.01923747919499874, "learning_rate": 8.292620592813172e-05, "loss": 0.0293, "step": 2405 }, { "epoch": 1.7193923145665773, "eval_loss": 0.010681663639843464, "eval_runtime": 4.6597, "eval_samples_per_second": 10.73, "eval_steps_per_second": 2.79, "step": 2405 }, { "epoch": 1.7201072386058982, "grad_norm": 0.015131620690226555, "learning_rate": 8.290741120357271e-05, "loss": 0.0132, "step": 2406 }, { "epoch": 1.720822162645219, "grad_norm": 0.01752234809100628, "learning_rate": 8.28886082725872e-05, "loss": 0.0175, "step": 2407 }, { "epoch": 1.7215370866845396, "grad_norm": 0.015065865591168404, "learning_rate": 8.286979713986424e-05, "loss": 0.0118, "step": 2408 }, { "epoch": 1.7222520107238606, "grad_norm": 0.01328202523291111, "learning_rate": 8.285097781009496e-05, "loss": 0.0099, "step": 2409 }, { "epoch": 1.7229669347631815, "grad_norm": 0.014840678311884403, "learning_rate": 8.28321502879725e-05, "loss": 0.0121, "step": 2410 }, { "epoch": 1.7229669347631815, "eval_loss": 0.010679380036890507, "eval_runtime": 4.5957, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 2410 }, { "epoch": 1.7236818588025022, "grad_norm": 0.011180159635841846, "learning_rate": 8.281331457819204e-05, "loss": 0.0192, "step": 2411 }, { "epoch": 1.724396782841823, "grad_norm": 0.014560377225279808, "learning_rate": 8.279447068545085e-05, "loss": 0.0191, "step": 2412 }, { "epoch": 1.725111706881144, "grad_norm": 0.014304379001259804, "learning_rate": 8.277561861444818e-05, "loss": 0.0215, "step": 2413 }, { "epoch": 1.7258266309204648, "grad_norm": 0.015156413428485394, "learning_rate": 8.275675836988534e-05, "loss": 0.0139, "step": 2414 }, { "epoch": 1.7265415549597856, "grad_norm": 0.014253096655011177, "learning_rate": 8.27378899564657e-05, "loss": 0.0134, "step": 2415 }, { "epoch": 1.7265415549597856, "eval_loss": 0.010622923262417316, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2415 }, { "epoch": 1.7272564789991063, "grad_norm": 0.014871723018586636, "learning_rate": 8.271901337889468e-05, "loss": 0.0094, "step": 2416 }, { "epoch": 1.7279714030384272, "grad_norm": 0.015243291854858398, "learning_rate": 8.270012864187964e-05, "loss": 0.0173, "step": 2417 }, { "epoch": 1.728686327077748, "grad_norm": 0.013299490325152874, "learning_rate": 8.268123575013009e-05, "loss": 0.0099, "step": 2418 }, { "epoch": 1.7294012511170687, "grad_norm": 0.015643253922462463, "learning_rate": 8.26623347083575e-05, "loss": 0.0112, "step": 2419 }, { "epoch": 1.7301161751563896, "grad_norm": 0.013032330200076103, "learning_rate": 8.264342552127542e-05, "loss": 0.0103, "step": 2420 }, { "epoch": 1.7301161751563896, "eval_loss": 0.010539591312408447, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, 
"eval_steps_per_second": 2.835, "step": 2420 }, { "epoch": 1.7308310991957105, "grad_norm": 0.0178415197879076, "learning_rate": 8.26245081935994e-05, "loss": 0.0158, "step": 2421 }, { "epoch": 1.7315460232350313, "grad_norm": 0.01070758793503046, "learning_rate": 8.260558273004702e-05, "loss": 0.0109, "step": 2422 }, { "epoch": 1.732260947274352, "grad_norm": 0.014068301767110825, "learning_rate": 8.258664913533792e-05, "loss": 0.0098, "step": 2423 }, { "epoch": 1.732975871313673, "grad_norm": 0.0130384536460042, "learning_rate": 8.256770741419374e-05, "loss": 0.0094, "step": 2424 }, { "epoch": 1.7336907953529939, "grad_norm": 0.01957371085882187, "learning_rate": 8.254875757133814e-05, "loss": 0.0132, "step": 2425 }, { "epoch": 1.7336907953529939, "eval_loss": 0.010598016902804375, "eval_runtime": 4.593, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 2425 }, { "epoch": 1.7344057193923146, "grad_norm": 0.01948191039264202, "learning_rate": 8.252979961149683e-05, "loss": 0.0119, "step": 2426 }, { "epoch": 1.7351206434316353, "grad_norm": 0.014985932037234306, "learning_rate": 8.251083353939752e-05, "loss": 0.0141, "step": 2427 }, { "epoch": 1.7358355674709562, "grad_norm": 0.01617990992963314, "learning_rate": 8.249185935976997e-05, "loss": 0.0097, "step": 2428 }, { "epoch": 1.7365504915102772, "grad_norm": 0.017531557008624077, "learning_rate": 8.247287707734594e-05, "loss": 0.0101, "step": 2429 }, { "epoch": 1.737265415549598, "grad_norm": 0.016714660450816154, "learning_rate": 8.245388669685921e-05, "loss": 0.0133, "step": 2430 }, { "epoch": 1.737265415549598, "eval_loss": 0.010521520860493183, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 2430 }, { "epoch": 1.7379803395889186, "grad_norm": 0.014625170268118382, "learning_rate": 8.24348882230456e-05, "loss": 0.0143, "step": 2431 }, { "epoch": 1.7386952636282396, "grad_norm": 0.01555924117565155, "learning_rate": 8.241588166064293e-05, "loss": 0.0175, "step": 2432 }, { "epoch": 1.7394101876675603, "grad_norm": 0.013604676350951195, "learning_rate": 8.239686701439106e-05, "loss": 0.0138, "step": 2433 }, { "epoch": 1.740125111706881, "grad_norm": 0.018475547432899475, "learning_rate": 8.237784428903181e-05, "loss": 0.0168, "step": 2434 }, { "epoch": 1.740840035746202, "grad_norm": 0.015425371006131172, "learning_rate": 8.235881348930909e-05, "loss": 0.0166, "step": 2435 }, { "epoch": 1.740840035746202, "eval_loss": 0.01065817940980196, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 2435 }, { "epoch": 1.7415549597855229, "grad_norm": 0.013073202222585678, "learning_rate": 8.233977461996878e-05, "loss": 0.0126, "step": 2436 }, { "epoch": 1.7422698838248436, "grad_norm": 0.01722676493227482, "learning_rate": 8.232072768575876e-05, "loss": 0.0195, "step": 2437 }, { "epoch": 1.7429848078641643, "grad_norm": 0.016834022477269173, "learning_rate": 8.230167269142896e-05, "loss": 0.0242, "step": 2438 }, { "epoch": 1.7436997319034853, "grad_norm": 0.01797345280647278, "learning_rate": 8.228260964173129e-05, "loss": 0.015, "step": 2439 }, { "epoch": 1.7444146559428062, "grad_norm": 0.014282151125371456, "learning_rate": 8.22635385414197e-05, "loss": 0.0178, "step": 2440 }, { "epoch": 1.7444146559428062, "eval_loss": 0.010848386213183403, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 2440 }, { "epoch": 1.745129579982127, "grad_norm": 0.013739063404500484, 
"learning_rate": 8.224445939525009e-05, "loss": 0.016, "step": 2441 }, { "epoch": 1.7458445040214476, "grad_norm": 0.016119135543704033, "learning_rate": 8.222537220798045e-05, "loss": 0.0122, "step": 2442 }, { "epoch": 1.7465594280607686, "grad_norm": 0.01874389313161373, "learning_rate": 8.22062769843707e-05, "loss": 0.0152, "step": 2443 }, { "epoch": 1.7472743521000895, "grad_norm": 0.0169060081243515, "learning_rate": 8.218717372918278e-05, "loss": 0.0169, "step": 2444 }, { "epoch": 1.7479892761394102, "grad_norm": 0.015107907354831696, "learning_rate": 8.216806244718068e-05, "loss": 0.0127, "step": 2445 }, { "epoch": 1.7479892761394102, "eval_loss": 0.010699902661144733, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2445 }, { "epoch": 1.748704200178731, "grad_norm": 0.013271714560687542, "learning_rate": 8.214894314313033e-05, "loss": 0.0159, "step": 2446 }, { "epoch": 1.749419124218052, "grad_norm": 0.012746460735797882, "learning_rate": 8.21298158217997e-05, "loss": 0.0109, "step": 2447 }, { "epoch": 1.7501340482573726, "grad_norm": 0.015393530949950218, "learning_rate": 8.211068048795876e-05, "loss": 0.0098, "step": 2448 }, { "epoch": 1.7508489722966933, "grad_norm": 0.019559690728783607, "learning_rate": 8.209153714637943e-05, "loss": 0.0185, "step": 2449 }, { "epoch": 1.7515638963360143, "grad_norm": 0.01599411480128765, "learning_rate": 8.20723858018357e-05, "loss": 0.0197, "step": 2450 }, { "epoch": 1.7515638963360143, "eval_loss": 0.010517514310777187, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 2450 }, { "epoch": 1.7522788203753352, "grad_norm": 0.012915972620248795, "learning_rate": 8.205322645910351e-05, "loss": 0.0134, "step": 2451 }, { "epoch": 1.752993744414656, "grad_norm": 0.015175486914813519, "learning_rate": 8.20340591229608e-05, "loss": 0.0185, "step": 2452 }, { "epoch": 1.7537086684539767, "grad_norm": 0.014038808643817902, "learning_rate": 8.20148837981875e-05, "loss": 0.0106, "step": 2453 }, { "epoch": 1.7544235924932976, "grad_norm": 0.01464279368519783, "learning_rate": 8.199570048956553e-05, "loss": 0.0147, "step": 2454 }, { "epoch": 1.7551385165326185, "grad_norm": 0.012911546975374222, "learning_rate": 8.197650920187883e-05, "loss": 0.0145, "step": 2455 }, { "epoch": 1.7551385165326185, "eval_loss": 0.01075015403330326, "eval_runtime": 4.5878, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 2455 }, { "epoch": 1.7558534405719393, "grad_norm": 0.013594248332083225, "learning_rate": 8.19573099399133e-05, "loss": 0.0104, "step": 2456 }, { "epoch": 1.75656836461126, "grad_norm": 0.01591225154697895, "learning_rate": 8.193810270845684e-05, "loss": 0.0203, "step": 2457 }, { "epoch": 1.757283288650581, "grad_norm": 0.018586503341794014, "learning_rate": 8.191888751229934e-05, "loss": 0.0154, "step": 2458 }, { "epoch": 1.7579982126899019, "grad_norm": 0.017513863742351532, "learning_rate": 8.189966435623266e-05, "loss": 0.0236, "step": 2459 }, { "epoch": 1.7587131367292224, "grad_norm": 0.011601434089243412, "learning_rate": 8.188043324505067e-05, "loss": 0.0135, "step": 2460 }, { "epoch": 1.7587131367292224, "eval_loss": 0.010590961202979088, "eval_runtime": 4.5936, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 2460 }, { "epoch": 1.7594280607685433, "grad_norm": 0.016333451494574547, "learning_rate": 8.186119418354921e-05, "loss": 0.0192, "step": 2461 }, { "epoch": 1.7601429848078642, "grad_norm": 
0.013688591308891773, "learning_rate": 8.184194717652609e-05, "loss": 0.0129, "step": 2462 }, { "epoch": 1.760857908847185, "grad_norm": 0.016482166945934296, "learning_rate": 8.182269222878112e-05, "loss": 0.0212, "step": 2463 }, { "epoch": 1.7615728328865057, "grad_norm": 0.016101110726594925, "learning_rate": 8.180342934511609e-05, "loss": 0.0144, "step": 2464 }, { "epoch": 1.7622877569258266, "grad_norm": 0.016446338966488838, "learning_rate": 8.178415853033476e-05, "loss": 0.0169, "step": 2465 }, { "epoch": 1.7622877569258266, "eval_loss": 0.01047771517187357, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 2465 }, { "epoch": 1.7630026809651476, "grad_norm": 0.01574697345495224, "learning_rate": 8.176487978924288e-05, "loss": 0.0199, "step": 2466 }, { "epoch": 1.7637176050044683, "grad_norm": 0.012941980734467506, "learning_rate": 8.174559312664816e-05, "loss": 0.0089, "step": 2467 }, { "epoch": 1.764432529043789, "grad_norm": 0.02710234373807907, "learning_rate": 8.172629854736029e-05, "loss": 0.0155, "step": 2468 }, { "epoch": 1.76514745308311, "grad_norm": 0.017494244500994682, "learning_rate": 8.170699605619096e-05, "loss": 0.0116, "step": 2469 }, { "epoch": 1.7658623771224309, "grad_norm": 0.015938658267259598, "learning_rate": 8.168768565795377e-05, "loss": 0.0127, "step": 2470 }, { "epoch": 1.7658623771224309, "eval_loss": 0.010376066900789738, "eval_runtime": 4.6138, "eval_samples_per_second": 10.837, "eval_steps_per_second": 2.818, "step": 2470 }, { "epoch": 1.7665773011617516, "grad_norm": 0.013980233110487461, "learning_rate": 8.166836735746439e-05, "loss": 0.0144, "step": 2471 }, { "epoch": 1.7672922252010723, "grad_norm": 0.013993777334690094, "learning_rate": 8.164904115954035e-05, "loss": 0.0165, "step": 2472 }, { "epoch": 1.7680071492403933, "grad_norm": 0.016528606414794922, "learning_rate": 8.162970706900124e-05, "loss": 0.0125, "step": 2473 }, { "epoch": 1.768722073279714, "grad_norm": 0.013956103473901749, "learning_rate": 8.161036509066855e-05, "loss": 0.0173, "step": 2474 }, { "epoch": 1.7694369973190347, "grad_norm": 0.014151910319924355, "learning_rate": 8.159101522936581e-05, "loss": 0.0176, "step": 2475 }, { "epoch": 1.7694369973190347, "eval_loss": 0.01063343696296215, "eval_runtime": 4.5973, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 2475 }, { "epoch": 1.7701519213583556, "grad_norm": 0.020095083862543106, "learning_rate": 8.157165748991844e-05, "loss": 0.0143, "step": 2476 }, { "epoch": 1.7708668453976766, "grad_norm": 0.017745910212397575, "learning_rate": 8.155229187715386e-05, "loss": 0.0156, "step": 2477 }, { "epoch": 1.7715817694369973, "grad_norm": 0.014636745676398277, "learning_rate": 8.153291839590147e-05, "loss": 0.0132, "step": 2478 }, { "epoch": 1.772296693476318, "grad_norm": 0.019046619534492493, "learning_rate": 8.15135370509926e-05, "loss": 0.0191, "step": 2479 }, { "epoch": 1.773011617515639, "grad_norm": 0.017264869064092636, "learning_rate": 8.149414784726059e-05, "loss": 0.0143, "step": 2480 }, { "epoch": 1.773011617515639, "eval_loss": 0.010826787911355495, "eval_runtime": 4.5941, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 2480 }, { "epoch": 1.77372654155496, "grad_norm": 0.01726660318672657, "learning_rate": 8.147475078954066e-05, "loss": 0.0203, "step": 2481 }, { "epoch": 1.7744414655942806, "grad_norm": 0.013128708116710186, "learning_rate": 8.145534588267005e-05, "loss": 0.019, "step": 2482 }, { "epoch": 
1.7751563896336013, "grad_norm": 0.015641624107956886, "learning_rate": 8.143593313148794e-05, "loss": 0.0106, "step": 2483 }, { "epoch": 1.7758713136729223, "grad_norm": 0.014586469158530235, "learning_rate": 8.141651254083549e-05, "loss": 0.0139, "step": 2484 }, { "epoch": 1.7765862377122432, "grad_norm": 0.013539873994886875, "learning_rate": 8.139708411555575e-05, "loss": 0.0106, "step": 2485 }, { "epoch": 1.7765862377122432, "eval_loss": 0.010690400376915932, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 2485 }, { "epoch": 1.777301161751564, "grad_norm": 0.027263320982456207, "learning_rate": 8.137764786049383e-05, "loss": 0.0189, "step": 2486 }, { "epoch": 1.7780160857908847, "grad_norm": 0.01739044114947319, "learning_rate": 8.135820378049667e-05, "loss": 0.0133, "step": 2487 }, { "epoch": 1.7787310098302056, "grad_norm": 0.014716839417815208, "learning_rate": 8.133875188041324e-05, "loss": 0.0124, "step": 2488 }, { "epoch": 1.7794459338695263, "grad_norm": 0.013030409812927246, "learning_rate": 8.131929216509444e-05, "loss": 0.015, "step": 2489 }, { "epoch": 1.780160857908847, "grad_norm": 0.014430114068090916, "learning_rate": 8.129982463939313e-05, "loss": 0.0131, "step": 2490 }, { "epoch": 1.780160857908847, "eval_loss": 0.010523319244384766, "eval_runtime": 4.5932, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 2490 }, { "epoch": 1.780875781948168, "grad_norm": 0.01522095687687397, "learning_rate": 8.128034930816411e-05, "loss": 0.0207, "step": 2491 }, { "epoch": 1.781590705987489, "grad_norm": 0.01707318052649498, "learning_rate": 8.12608661762641e-05, "loss": 0.0201, "step": 2492 }, { "epoch": 1.7823056300268096, "grad_norm": 0.010873355902731419, "learning_rate": 8.12413752485518e-05, "loss": 0.0107, "step": 2493 }, { "epoch": 1.7830205540661304, "grad_norm": 0.012380555272102356, "learning_rate": 8.122187652988786e-05, "loss": 0.0164, "step": 2494 }, { "epoch": 1.7837354781054513, "grad_norm": 0.019151290878653526, "learning_rate": 8.120237002513483e-05, "loss": 0.0226, "step": 2495 }, { "epoch": 1.7837354781054513, "eval_loss": 0.010379169136285782, "eval_runtime": 4.5804, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 2495 }, { "epoch": 1.7844504021447722, "grad_norm": 0.018040571361780167, "learning_rate": 8.118285573915727e-05, "loss": 0.016, "step": 2496 }, { "epoch": 1.785165326184093, "grad_norm": 0.016649242490530014, "learning_rate": 8.116333367682159e-05, "loss": 0.0167, "step": 2497 }, { "epoch": 1.7858802502234137, "grad_norm": 0.015964007005095482, "learning_rate": 8.11438038429962e-05, "loss": 0.0194, "step": 2498 }, { "epoch": 1.7865951742627346, "grad_norm": 0.016717785969376564, "learning_rate": 8.112426624255145e-05, "loss": 0.015, "step": 2499 }, { "epoch": 1.7873100983020556, "grad_norm": 0.020116835832595825, "learning_rate": 8.110472088035961e-05, "loss": 0.0115, "step": 2500 }, { "epoch": 1.7873100983020556, "eval_loss": 0.010534171015024185, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 2500 }, { "epoch": 1.7880250223413763, "grad_norm": 0.020341457799077034, "learning_rate": 8.108516776129489e-05, "loss": 0.0142, "step": 2501 }, { "epoch": 1.788739946380697, "grad_norm": 0.014537624083459377, "learning_rate": 8.106560689023342e-05, "loss": 0.0163, "step": 2502 }, { "epoch": 1.789454870420018, "grad_norm": 0.01443986315280199, "learning_rate": 8.104603827205329e-05, "loss": 0.0147, "step": 
2503 }, { "epoch": 1.7901697944593387, "grad_norm": 0.013224775902926922, "learning_rate": 8.10264619116345e-05, "loss": 0.0089, "step": 2504 }, { "epoch": 1.7908847184986594, "grad_norm": 0.01634635217487812, "learning_rate": 8.1006877813859e-05, "loss": 0.009, "step": 2505 }, { "epoch": 1.7908847184986594, "eval_loss": 0.010111859068274498, "eval_runtime": 4.592, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 2505 }, { "epoch": 1.7915996425379803, "grad_norm": 0.016967957839369774, "learning_rate": 8.098728598361063e-05, "loss": 0.0132, "step": 2506 }, { "epoch": 1.7923145665773013, "grad_norm": 0.01967676915228367, "learning_rate": 8.096768642577521e-05, "loss": 0.0127, "step": 2507 }, { "epoch": 1.793029490616622, "grad_norm": 0.014996406622231007, "learning_rate": 8.094807914524048e-05, "loss": 0.0116, "step": 2508 }, { "epoch": 1.7937444146559427, "grad_norm": 0.02174399420619011, "learning_rate": 8.092846414689604e-05, "loss": 0.014, "step": 2509 }, { "epoch": 1.7944593386952636, "grad_norm": 0.01495073176920414, "learning_rate": 8.090884143563351e-05, "loss": 0.018, "step": 2510 }, { "epoch": 1.7944593386952636, "eval_loss": 0.010180269367992878, "eval_runtime": 4.5961, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.828, "step": 2510 }, { "epoch": 1.7951742627345846, "grad_norm": 0.013228467665612698, "learning_rate": 8.088921101634637e-05, "loss": 0.0136, "step": 2511 }, { "epoch": 1.7958891867739053, "grad_norm": 0.013913668692111969, "learning_rate": 8.086957289393002e-05, "loss": 0.0233, "step": 2512 }, { "epoch": 1.796604110813226, "grad_norm": 0.013269862160086632, "learning_rate": 8.084992707328184e-05, "loss": 0.0104, "step": 2513 }, { "epoch": 1.797319034852547, "grad_norm": 0.014940707013010979, "learning_rate": 8.083027355930106e-05, "loss": 0.0085, "step": 2514 }, { "epoch": 1.798033958891868, "grad_norm": 0.017233528196811676, "learning_rate": 8.081061235688888e-05, "loss": 0.0198, "step": 2515 }, { "epoch": 1.798033958891868, "eval_loss": 0.010277492925524712, "eval_runtime": 4.5916, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 2515 }, { "epoch": 1.7987488829311884, "grad_norm": 0.013670474290847778, "learning_rate": 8.079094347094838e-05, "loss": 0.0162, "step": 2516 }, { "epoch": 1.7994638069705093, "grad_norm": 0.014748182147741318, "learning_rate": 8.07712669063846e-05, "loss": 0.0192, "step": 2517 }, { "epoch": 1.8001787310098303, "grad_norm": 0.015939975157380104, "learning_rate": 8.075158266810443e-05, "loss": 0.0103, "step": 2518 }, { "epoch": 1.800893655049151, "grad_norm": 0.016730956733226776, "learning_rate": 8.073189076101673e-05, "loss": 0.0207, "step": 2519 }, { "epoch": 1.8016085790884717, "grad_norm": 0.017614826560020447, "learning_rate": 8.071219119003224e-05, "loss": 0.0149, "step": 2520 }, { "epoch": 1.8016085790884717, "eval_loss": 0.010225153528153896, "eval_runtime": 4.5815, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 2520 }, { "epoch": 1.8023235031277927, "grad_norm": 0.014310012571513653, "learning_rate": 8.069248396006365e-05, "loss": 0.0114, "step": 2521 }, { "epoch": 1.8030384271671136, "grad_norm": 0.017655199393630028, "learning_rate": 8.067276907602551e-05, "loss": 0.0189, "step": 2522 }, { "epoch": 1.8037533512064343, "grad_norm": 0.017437906935811043, "learning_rate": 8.065304654283433e-05, "loss": 0.0198, "step": 2523 }, { "epoch": 1.804468275245755, "grad_norm": 0.014224045909941196, "learning_rate": 8.063331636540848e-05, "loss": 
0.0107, "step": 2524 }, { "epoch": 1.805183199285076, "grad_norm": 0.014553860761225224, "learning_rate": 8.061357854866826e-05, "loss": 0.0182, "step": 2525 }, { "epoch": 1.805183199285076, "eval_loss": 0.010420719161629677, "eval_runtime": 4.61, "eval_samples_per_second": 10.846, "eval_steps_per_second": 2.82, "step": 2525 }, { "epoch": 1.805898123324397, "grad_norm": 0.013631421141326427, "learning_rate": 8.059383309753588e-05, "loss": 0.0085, "step": 2526 }, { "epoch": 1.8066130473637176, "grad_norm": 0.013578923419117928, "learning_rate": 8.057408001693545e-05, "loss": 0.011, "step": 2527 }, { "epoch": 1.8073279714030384, "grad_norm": 0.016905806958675385, "learning_rate": 8.055431931179296e-05, "loss": 0.015, "step": 2528 }, { "epoch": 1.8080428954423593, "grad_norm": 0.015260319225490093, "learning_rate": 8.053455098703634e-05, "loss": 0.0202, "step": 2529 }, { "epoch": 1.80875781948168, "grad_norm": 0.01403724867850542, "learning_rate": 8.051477504759539e-05, "loss": 0.0118, "step": 2530 }, { "epoch": 1.80875781948168, "eval_loss": 0.010328950360417366, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 2530 }, { "epoch": 1.8094727435210007, "grad_norm": 0.014504745602607727, "learning_rate": 8.049499149840183e-05, "loss": 0.0165, "step": 2531 }, { "epoch": 1.8101876675603217, "grad_norm": 0.012644859962165356, "learning_rate": 8.047520034438925e-05, "loss": 0.0185, "step": 2532 }, { "epoch": 1.8109025915996426, "grad_norm": 0.017394201830029488, "learning_rate": 8.045540159049313e-05, "loss": 0.0122, "step": 2533 }, { "epoch": 1.8116175156389633, "grad_norm": 0.014470478519797325, "learning_rate": 8.043559524165096e-05, "loss": 0.01, "step": 2534 }, { "epoch": 1.812332439678284, "grad_norm": 0.015281233005225658, "learning_rate": 8.041578130280194e-05, "loss": 0.0161, "step": 2535 }, { "epoch": 1.812332439678284, "eval_loss": 0.010413429699838161, "eval_runtime": 4.5916, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 2535 }, { "epoch": 1.813047363717605, "grad_norm": 0.013241413049399853, "learning_rate": 8.03959597788873e-05, "loss": 0.0137, "step": 2536 }, { "epoch": 1.813762287756926, "grad_norm": 0.018712611868977547, "learning_rate": 8.037613067485011e-05, "loss": 0.0171, "step": 2537 }, { "epoch": 1.8144772117962467, "grad_norm": 0.01853441447019577, "learning_rate": 8.035629399563533e-05, "loss": 0.0163, "step": 2538 }, { "epoch": 1.8151921358355674, "grad_norm": 0.0192478708922863, "learning_rate": 8.033644974618983e-05, "loss": 0.0211, "step": 2539 }, { "epoch": 1.8159070598748883, "grad_norm": 0.015187887474894524, "learning_rate": 8.031659793146235e-05, "loss": 0.0116, "step": 2540 }, { "epoch": 1.8159070598748883, "eval_loss": 0.010459636338055134, "eval_runtime": 4.5787, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 2540 }, { "epoch": 1.8166219839142093, "grad_norm": 0.017992112785577774, "learning_rate": 8.029673855640351e-05, "loss": 0.0169, "step": 2541 }, { "epoch": 1.81733690795353, "grad_norm": 0.01569983921945095, "learning_rate": 8.027687162596585e-05, "loss": 0.0145, "step": 2542 }, { "epoch": 1.8180518319928507, "grad_norm": 0.013328216969966888, "learning_rate": 8.025699714510374e-05, "loss": 0.0079, "step": 2543 }, { "epoch": 1.8187667560321716, "grad_norm": 0.016812684014439583, "learning_rate": 8.023711511877348e-05, "loss": 0.0157, "step": 2544 }, { "epoch": 1.8194816800714924, "grad_norm": 0.01588749885559082, "learning_rate": 8.021722555193323e-05, 
"loss": 0.0126, "step": 2545 }, { "epoch": 1.8194816800714924, "eval_loss": 0.010449975728988647, "eval_runtime": 4.5792, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 2545 }, { "epoch": 1.820196604110813, "grad_norm": 0.013486203737556934, "learning_rate": 8.019732844954305e-05, "loss": 0.0112, "step": 2546 }, { "epoch": 1.820911528150134, "grad_norm": 0.016922753304243088, "learning_rate": 8.017742381656485e-05, "loss": 0.0192, "step": 2547 }, { "epoch": 1.821626452189455, "grad_norm": 0.01761535368859768, "learning_rate": 8.015751165796246e-05, "loss": 0.0119, "step": 2548 }, { "epoch": 1.8223413762287757, "grad_norm": 0.018274754285812378, "learning_rate": 8.013759197870151e-05, "loss": 0.0176, "step": 2549 }, { "epoch": 1.8230563002680964, "grad_norm": 0.014214244671165943, "learning_rate": 8.01176647837496e-05, "loss": 0.0099, "step": 2550 }, { "epoch": 1.8230563002680964, "eval_loss": 0.010631680488586426, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 2550 }, { "epoch": 1.8237712243074173, "grad_norm": 0.015837330371141434, "learning_rate": 8.009773007807615e-05, "loss": 0.0096, "step": 2551 }, { "epoch": 1.8244861483467383, "grad_norm": 0.015680627897381783, "learning_rate": 8.007778786665245e-05, "loss": 0.0091, "step": 2552 }, { "epoch": 1.825201072386059, "grad_norm": 0.014517943374812603, "learning_rate": 8.005783815445168e-05, "loss": 0.0115, "step": 2553 }, { "epoch": 1.8259159964253797, "grad_norm": 0.013173670507967472, "learning_rate": 8.003788094644888e-05, "loss": 0.0105, "step": 2554 }, { "epoch": 1.8266309204647007, "grad_norm": 0.01707465574145317, "learning_rate": 8.001791624762096e-05, "loss": 0.0158, "step": 2555 }, { "epoch": 1.8266309204647007, "eval_loss": 0.010587268508970737, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 2555 }, { "epoch": 1.8273458445040216, "grad_norm": 0.01363072544336319, "learning_rate": 7.999794406294673e-05, "loss": 0.0125, "step": 2556 }, { "epoch": 1.8280607685433423, "grad_norm": 0.024266758933663368, "learning_rate": 7.997796439740682e-05, "loss": 0.0128, "step": 2557 }, { "epoch": 1.828775692582663, "grad_norm": 0.021819261834025383, "learning_rate": 7.995797725598373e-05, "loss": 0.014, "step": 2558 }, { "epoch": 1.829490616621984, "grad_norm": 0.016059067100286484, "learning_rate": 7.993798264366188e-05, "loss": 0.0238, "step": 2559 }, { "epoch": 1.8302055406613047, "grad_norm": 0.013955766335129738, "learning_rate": 7.991798056542747e-05, "loss": 0.0137, "step": 2560 }, { "epoch": 1.8302055406613047, "eval_loss": 0.010681421495974064, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 2560 }, { "epoch": 1.8309204647006254, "grad_norm": 0.01654655858874321, "learning_rate": 7.989797102626861e-05, "loss": 0.0114, "step": 2561 }, { "epoch": 1.8316353887399464, "grad_norm": 0.01402646116912365, "learning_rate": 7.987795403117529e-05, "loss": 0.0106, "step": 2562 }, { "epoch": 1.8323503127792673, "grad_norm": 0.022096198052167892, "learning_rate": 7.985792958513931e-05, "loss": 0.0117, "step": 2563 }, { "epoch": 1.833065236818588, "grad_norm": 0.016681840643286705, "learning_rate": 7.983789769315438e-05, "loss": 0.0175, "step": 2564 }, { "epoch": 1.8337801608579087, "grad_norm": 0.02004530094563961, "learning_rate": 7.9817858360216e-05, "loss": 0.0098, "step": 2565 }, { "epoch": 1.8337801608579087, "eval_loss": 0.01068401150405407, "eval_runtime": 4.5836, 
"eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 2565 }, { "epoch": 1.8344950848972297, "grad_norm": 0.012680409476161003, "learning_rate": 7.979781159132157e-05, "loss": 0.0164, "step": 2566 }, { "epoch": 1.8352100089365506, "grad_norm": 0.017829198390245438, "learning_rate": 7.977775739147037e-05, "loss": 0.0183, "step": 2567 }, { "epoch": 1.8359249329758713, "grad_norm": 0.01432188693434, "learning_rate": 7.975769576566347e-05, "loss": 0.0194, "step": 2568 }, { "epoch": 1.836639857015192, "grad_norm": 0.016513744369149208, "learning_rate": 7.973762671890386e-05, "loss": 0.0095, "step": 2569 }, { "epoch": 1.837354781054513, "grad_norm": 0.015114862471818924, "learning_rate": 7.971755025619632e-05, "loss": 0.0137, "step": 2570 }, { "epoch": 1.837354781054513, "eval_loss": 0.010645085014402866, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 2570 }, { "epoch": 1.838069705093834, "grad_norm": 0.015833625569939613, "learning_rate": 7.96974663825475e-05, "loss": 0.0157, "step": 2571 }, { "epoch": 1.8387846291331547, "grad_norm": 0.012893032282590866, "learning_rate": 7.967737510296592e-05, "loss": 0.0101, "step": 2572 }, { "epoch": 1.8394995531724754, "grad_norm": 0.01979769393801689, "learning_rate": 7.96572764224619e-05, "loss": 0.0182, "step": 2573 }, { "epoch": 1.8402144772117963, "grad_norm": 0.013893192633986473, "learning_rate": 7.963717034604765e-05, "loss": 0.0158, "step": 2574 }, { "epoch": 1.840929401251117, "grad_norm": 0.01487845741212368, "learning_rate": 7.961705687873721e-05, "loss": 0.0126, "step": 2575 }, { "epoch": 1.840929401251117, "eval_loss": 0.01068382803350687, "eval_runtime": 4.5768, "eval_samples_per_second": 10.925, "eval_steps_per_second": 2.84, "step": 2575 }, { "epoch": 1.8416443252904378, "grad_norm": 0.016975723206996918, "learning_rate": 7.959693602554648e-05, "loss": 0.0163, "step": 2576 }, { "epoch": 1.8423592493297587, "grad_norm": 0.019617771729826927, "learning_rate": 7.957680779149315e-05, "loss": 0.0103, "step": 2577 }, { "epoch": 1.8430741733690796, "grad_norm": 0.01704299822449684, "learning_rate": 7.95566721815968e-05, "loss": 0.0198, "step": 2578 }, { "epoch": 1.8437890974084004, "grad_norm": 0.015425766818225384, "learning_rate": 7.953652920087884e-05, "loss": 0.0133, "step": 2579 }, { "epoch": 1.844504021447721, "grad_norm": 0.019972827285528183, "learning_rate": 7.95163788543625e-05, "loss": 0.0132, "step": 2580 }, { "epoch": 1.844504021447721, "eval_loss": 0.010615412145853043, "eval_runtime": 4.5813, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2580 }, { "epoch": 1.845218945487042, "grad_norm": 0.015001516789197922, "learning_rate": 7.949622114707288e-05, "loss": 0.0135, "step": 2581 }, { "epoch": 1.845933869526363, "grad_norm": 0.013785423710942268, "learning_rate": 7.947605608403688e-05, "loss": 0.0161, "step": 2582 }, { "epoch": 1.8466487935656837, "grad_norm": 0.012646684423089027, "learning_rate": 7.945588367028323e-05, "loss": 0.0168, "step": 2583 }, { "epoch": 1.8473637176050044, "grad_norm": 0.01453813910484314, "learning_rate": 7.943570391084254e-05, "loss": 0.0216, "step": 2584 }, { "epoch": 1.8480786416443253, "grad_norm": 0.01217399537563324, "learning_rate": 7.941551681074723e-05, "loss": 0.0107, "step": 2585 }, { "epoch": 1.8480786416443253, "eval_loss": 0.010631540790200233, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 2585 }, { "epoch": 1.8487935656836463, "grad_norm": 
0.01699807681143284, "learning_rate": 7.939532237503152e-05, "loss": 0.0223, "step": 2586 }, { "epoch": 1.8495084897229668, "grad_norm": 0.01962064951658249, "learning_rate": 7.937512060873148e-05, "loss": 0.0133, "step": 2587 }, { "epoch": 1.8502234137622877, "grad_norm": 0.01712501235306263, "learning_rate": 7.935491151688503e-05, "loss": 0.0133, "step": 2588 }, { "epoch": 1.8509383378016087, "grad_norm": 0.01495163794606924, "learning_rate": 7.933469510453189e-05, "loss": 0.0132, "step": 2589 }, { "epoch": 1.8516532618409294, "grad_norm": 0.013900600373744965, "learning_rate": 7.931447137671363e-05, "loss": 0.0111, "step": 2590 }, { "epoch": 1.8516532618409294, "eval_loss": 0.010847089812159538, "eval_runtime": 4.5945, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.829, "step": 2590 }, { "epoch": 1.85236818588025, "grad_norm": 0.022115735337138176, "learning_rate": 7.929424033847361e-05, "loss": 0.0111, "step": 2591 }, { "epoch": 1.853083109919571, "grad_norm": 0.014928425662219524, "learning_rate": 7.927400199485704e-05, "loss": 0.0116, "step": 2592 }, { "epoch": 1.853798033958892, "grad_norm": 0.02331397496163845, "learning_rate": 7.925375635091095e-05, "loss": 0.0221, "step": 2593 }, { "epoch": 1.8545129579982127, "grad_norm": 0.015280029736459255, "learning_rate": 7.923350341168416e-05, "loss": 0.0094, "step": 2594 }, { "epoch": 1.8552278820375334, "grad_norm": 0.013950382359325886, "learning_rate": 7.921324318222737e-05, "loss": 0.0186, "step": 2595 }, { "epoch": 1.8552278820375334, "eval_loss": 0.010872400365769863, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 2595 }, { "epoch": 1.8559428060768544, "grad_norm": 0.015553012490272522, "learning_rate": 7.919297566759303e-05, "loss": 0.0175, "step": 2596 }, { "epoch": 1.8566577301161753, "grad_norm": 0.015107691287994385, "learning_rate": 7.917270087283544e-05, "loss": 0.0149, "step": 2597 }, { "epoch": 1.857372654155496, "grad_norm": 0.013574158772826195, "learning_rate": 7.915241880301075e-05, "loss": 0.0094, "step": 2598 }, { "epoch": 1.8580875781948167, "grad_norm": 0.01402074471116066, "learning_rate": 7.913212946317683e-05, "loss": 0.0172, "step": 2599 }, { "epoch": 1.8588025022341377, "grad_norm": 0.014098713174462318, "learning_rate": 7.911183285839347e-05, "loss": 0.0143, "step": 2600 }, { "epoch": 1.8588025022341377, "eval_loss": 0.010918968357145786, "eval_runtime": 4.6102, "eval_samples_per_second": 10.846, "eval_steps_per_second": 2.82, "step": 2600 }, { "epoch": 1.8595174262734584, "grad_norm": 0.014952809549868107, "learning_rate": 7.909152899372219e-05, "loss": 0.0205, "step": 2601 }, { "epoch": 1.8602323503127791, "grad_norm": 0.0209987573325634, "learning_rate": 7.907121787422638e-05, "loss": 0.0268, "step": 2602 }, { "epoch": 1.8609472743521, "grad_norm": 0.01827150583267212, "learning_rate": 7.90508995049712e-05, "loss": 0.0145, "step": 2603 }, { "epoch": 1.861662198391421, "grad_norm": 0.013933046720921993, "learning_rate": 7.903057389102362e-05, "loss": 0.0149, "step": 2604 }, { "epoch": 1.8623771224307417, "grad_norm": 0.012946988455951214, "learning_rate": 7.901024103745245e-05, "loss": 0.0139, "step": 2605 }, { "epoch": 1.8623771224307417, "eval_loss": 0.010852898471057415, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 2605 }, { "epoch": 1.8630920464700624, "grad_norm": 0.017411667853593826, "learning_rate": 7.898990094932826e-05, "loss": 0.0093, "step": 2606 }, { "epoch": 
1.8638069705093834, "grad_norm": 0.017656084150075912, "learning_rate": 7.896955363172347e-05, "loss": 0.0125, "step": 2607 }, { "epoch": 1.8645218945487043, "grad_norm": 0.015261121094226837, "learning_rate": 7.894919908971225e-05, "loss": 0.0235, "step": 2608 }, { "epoch": 1.865236818588025, "grad_norm": 0.012452616356313229, "learning_rate": 7.892883732837062e-05, "loss": 0.017, "step": 2609 }, { "epoch": 1.8659517426273458, "grad_norm": 0.016627877950668335, "learning_rate": 7.890846835277637e-05, "loss": 0.0157, "step": 2610 }, { "epoch": 1.8659517426273458, "eval_loss": 0.010613624006509781, "eval_runtime": 4.5999, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 2610 }, { "epoch": 1.8666666666666667, "grad_norm": 0.015527507290244102, "learning_rate": 7.888809216800913e-05, "loss": 0.0153, "step": 2611 }, { "epoch": 1.8673815907059876, "grad_norm": 0.013565071858465672, "learning_rate": 7.886770877915027e-05, "loss": 0.0133, "step": 2612 }, { "epoch": 1.8680965147453084, "grad_norm": 0.015818271785974503, "learning_rate": 7.884731819128297e-05, "loss": 0.0145, "step": 2613 }, { "epoch": 1.868811438784629, "grad_norm": 0.015560990199446678, "learning_rate": 7.882692040949227e-05, "loss": 0.0184, "step": 2614 }, { "epoch": 1.86952636282395, "grad_norm": 0.018991338089108467, "learning_rate": 7.88065154388649e-05, "loss": 0.0187, "step": 2615 }, { "epoch": 1.86952636282395, "eval_loss": 0.010517898947000504, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 2615 }, { "epoch": 1.8702412868632707, "grad_norm": 0.016369735822081566, "learning_rate": 7.878610328448948e-05, "loss": 0.0199, "step": 2616 }, { "epoch": 1.8709562109025915, "grad_norm": 0.014514442533254623, "learning_rate": 7.876568395145637e-05, "loss": 0.0155, "step": 2617 }, { "epoch": 1.8716711349419124, "grad_norm": 0.018664076924324036, "learning_rate": 7.87452574448577e-05, "loss": 0.0135, "step": 2618 }, { "epoch": 1.8723860589812333, "grad_norm": 0.018005309626460075, "learning_rate": 7.872482376978745e-05, "loss": 0.0213, "step": 2619 }, { "epoch": 1.873100983020554, "grad_norm": 0.015741853043437004, "learning_rate": 7.870438293134132e-05, "loss": 0.0127, "step": 2620 }, { "epoch": 1.873100983020554, "eval_loss": 0.010577642358839512, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 2620 }, { "epoch": 1.8738159070598748, "grad_norm": 0.012904297560453415, "learning_rate": 7.868393493461687e-05, "loss": 0.0133, "step": 2621 }, { "epoch": 1.8745308310991957, "grad_norm": 0.011809982359409332, "learning_rate": 7.86634797847134e-05, "loss": 0.0148, "step": 2622 }, { "epoch": 1.8752457551385167, "grad_norm": 0.01555860135704279, "learning_rate": 7.864301748673198e-05, "loss": 0.0164, "step": 2623 }, { "epoch": 1.8759606791778374, "grad_norm": 0.015979325398802757, "learning_rate": 7.862254804577549e-05, "loss": 0.0125, "step": 2624 }, { "epoch": 1.876675603217158, "grad_norm": 0.01298864558339119, "learning_rate": 7.86020714669486e-05, "loss": 0.0158, "step": 2625 }, { "epoch": 1.876675603217158, "eval_loss": 0.01052735187113285, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 2625 }, { "epoch": 1.877390527256479, "grad_norm": 0.01762407273054123, "learning_rate": 7.858158775535772e-05, "loss": 0.0131, "step": 2626 }, { "epoch": 1.8781054512958, "grad_norm": 0.016235439106822014, "learning_rate": 7.856109691611109e-05, "loss": 0.0169, "step": 2627 }, 
{ "epoch": 1.8788203753351207, "grad_norm": 0.014477214775979519, "learning_rate": 7.854059895431868e-05, "loss": 0.0177, "step": 2628 }, { "epoch": 1.8795352993744414, "grad_norm": 0.01299628708511591, "learning_rate": 7.852009387509227e-05, "loss": 0.0127, "step": 2629 }, { "epoch": 1.8802502234137624, "grad_norm": 0.013337585143744946, "learning_rate": 7.849958168354539e-05, "loss": 0.0203, "step": 2630 }, { "epoch": 1.8802502234137624, "eval_loss": 0.010524880141019821, "eval_runtime": 4.5765, "eval_samples_per_second": 10.925, "eval_steps_per_second": 2.841, "step": 2630 }, { "epoch": 1.880965147453083, "grad_norm": 0.02093205787241459, "learning_rate": 7.847906238479337e-05, "loss": 0.0266, "step": 2631 }, { "epoch": 1.8816800714924038, "grad_norm": 0.012887874618172646, "learning_rate": 7.845853598395327e-05, "loss": 0.0092, "step": 2632 }, { "epoch": 1.8823949955317247, "grad_norm": 0.018728451803326607, "learning_rate": 7.843800248614396e-05, "loss": 0.0147, "step": 2633 }, { "epoch": 1.8831099195710457, "grad_norm": 0.01674928143620491, "learning_rate": 7.841746189648611e-05, "loss": 0.0189, "step": 2634 }, { "epoch": 1.8838248436103664, "grad_norm": 0.018738271668553352, "learning_rate": 7.839691422010207e-05, "loss": 0.0167, "step": 2635 }, { "epoch": 1.8838248436103664, "eval_loss": 0.01049236860126257, "eval_runtime": 4.5964, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 2635 }, { "epoch": 1.8845397676496871, "grad_norm": 0.014893651939928532, "learning_rate": 7.837635946211603e-05, "loss": 0.0242, "step": 2636 }, { "epoch": 1.885254691689008, "grad_norm": 0.01712147891521454, "learning_rate": 7.83557976276539e-05, "loss": 0.0114, "step": 2637 }, { "epoch": 1.885969615728329, "grad_norm": 0.02068031020462513, "learning_rate": 7.833522872184337e-05, "loss": 0.0196, "step": 2638 }, { "epoch": 1.8866845397676497, "grad_norm": 0.0228471290320158, "learning_rate": 7.831465274981394e-05, "loss": 0.0201, "step": 2639 }, { "epoch": 1.8873994638069704, "grad_norm": 0.013511629775166512, "learning_rate": 7.829406971669679e-05, "loss": 0.0098, "step": 2640 }, { "epoch": 1.8873994638069704, "eval_loss": 0.010511635802686214, "eval_runtime": 4.6009, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 2640 }, { "epoch": 1.8881143878462914, "grad_norm": 0.012105714529752731, "learning_rate": 7.827347962762494e-05, "loss": 0.01, "step": 2641 }, { "epoch": 1.8888293118856123, "grad_norm": 0.013533581048250198, "learning_rate": 7.82528824877331e-05, "loss": 0.0115, "step": 2642 }, { "epoch": 1.8895442359249328, "grad_norm": 0.01754198595881462, "learning_rate": 7.823227830215776e-05, "loss": 0.0193, "step": 2643 }, { "epoch": 1.8902591599642538, "grad_norm": 0.01623760163784027, "learning_rate": 7.82116670760372e-05, "loss": 0.013, "step": 2644 }, { "epoch": 1.8909740840035747, "grad_norm": 0.01785467192530632, "learning_rate": 7.819104881451144e-05, "loss": 0.0118, "step": 2645 }, { "epoch": 1.8909740840035747, "eval_loss": 0.01039215549826622, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2645 }, { "epoch": 1.8916890080428954, "grad_norm": 0.01631089299917221, "learning_rate": 7.817042352272224e-05, "loss": 0.0197, "step": 2646 }, { "epoch": 1.8924039320822161, "grad_norm": 0.015538552775979042, "learning_rate": 7.814979120581312e-05, "loss": 0.0151, "step": 2647 }, { "epoch": 1.893118856121537, "grad_norm": 0.012875297106802464, "learning_rate": 7.812915186892934e-05, "loss": 0.0129, 
"step": 2648 }, { "epoch": 1.893833780160858, "grad_norm": 0.01638495735824108, "learning_rate": 7.810850551721792e-05, "loss": 0.0114, "step": 2649 }, { "epoch": 1.8945487042001787, "grad_norm": 0.01611967757344246, "learning_rate": 7.808785215582766e-05, "loss": 0.0133, "step": 2650 }, { "epoch": 1.8945487042001787, "eval_loss": 0.010419259779155254, "eval_runtime": 4.5781, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 2650 }, { "epoch": 1.8952636282394995, "grad_norm": 0.014646412804722786, "learning_rate": 7.806719178990905e-05, "loss": 0.012, "step": 2651 }, { "epoch": 1.8959785522788204, "grad_norm": 0.018089069053530693, "learning_rate": 7.804652442461439e-05, "loss": 0.0132, "step": 2652 }, { "epoch": 1.8966934763181413, "grad_norm": 0.01634528487920761, "learning_rate": 7.802585006509766e-05, "loss": 0.0149, "step": 2653 }, { "epoch": 1.897408400357462, "grad_norm": 0.01247189100831747, "learning_rate": 7.800516871651465e-05, "loss": 0.0108, "step": 2654 }, { "epoch": 1.8981233243967828, "grad_norm": 0.016367577016353607, "learning_rate": 7.798448038402282e-05, "loss": 0.019, "step": 2655 }, { "epoch": 1.8981233243967828, "eval_loss": 0.010352224111557007, "eval_runtime": 4.6267, "eval_samples_per_second": 10.807, "eval_steps_per_second": 2.81, "step": 2655 }, { "epoch": 1.8988382484361037, "grad_norm": 0.02160031348466873, "learning_rate": 7.796378507278144e-05, "loss": 0.0225, "step": 2656 }, { "epoch": 1.8995531724754244, "grad_norm": 0.018012192100286484, "learning_rate": 7.794308278795148e-05, "loss": 0.0126, "step": 2657 }, { "epoch": 1.9002680965147452, "grad_norm": 0.014489172957837582, "learning_rate": 7.792237353469566e-05, "loss": 0.0164, "step": 2658 }, { "epoch": 1.900983020554066, "grad_norm": 0.01855389028787613, "learning_rate": 7.790165731817846e-05, "loss": 0.0136, "step": 2659 }, { "epoch": 1.901697944593387, "grad_norm": 0.01722020097076893, "learning_rate": 7.788093414356606e-05, "loss": 0.0202, "step": 2660 }, { "epoch": 1.901697944593387, "eval_loss": 0.010437984019517899, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 2660 }, { "epoch": 1.9024128686327078, "grad_norm": 0.015707053244113922, "learning_rate": 7.786020401602637e-05, "loss": 0.0117, "step": 2661 }, { "epoch": 1.9031277926720285, "grad_norm": 0.015279739163815975, "learning_rate": 7.783946694072908e-05, "loss": 0.0155, "step": 2662 }, { "epoch": 1.9038427167113494, "grad_norm": 0.012460188008844852, "learning_rate": 7.78187229228456e-05, "loss": 0.0089, "step": 2663 }, { "epoch": 1.9045576407506704, "grad_norm": 0.015333103947341442, "learning_rate": 7.779797196754901e-05, "loss": 0.0179, "step": 2664 }, { "epoch": 1.905272564789991, "grad_norm": 0.012057370506227016, "learning_rate": 7.77772140800142e-05, "loss": 0.0143, "step": 2665 }, { "epoch": 1.905272564789991, "eval_loss": 0.010486087761819363, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 2665 }, { "epoch": 1.9059874888293118, "grad_norm": 0.014083595015108585, "learning_rate": 7.775644926541777e-05, "loss": 0.0137, "step": 2666 }, { "epoch": 1.9067024128686327, "grad_norm": 0.01688194088637829, "learning_rate": 7.773567752893802e-05, "loss": 0.0163, "step": 2667 }, { "epoch": 1.9074173369079537, "grad_norm": 0.013888372108340263, "learning_rate": 7.771489887575498e-05, "loss": 0.0187, "step": 2668 }, { "epoch": 1.9081322609472744, "grad_norm": 0.01474728249013424, "learning_rate": 7.769411331105043e-05, 
"loss": 0.0106, "step": 2669 }, { "epoch": 1.9088471849865951, "grad_norm": 0.012974468991160393, "learning_rate": 7.767332084000784e-05, "loss": 0.0138, "step": 2670 }, { "epoch": 1.9088471849865951, "eval_loss": 0.010579580441117287, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 2670 }, { "epoch": 1.909562109025916, "grad_norm": 0.01448733638972044, "learning_rate": 7.765252146781246e-05, "loss": 0.0089, "step": 2671 }, { "epoch": 1.9102770330652368, "grad_norm": 0.01736474595963955, "learning_rate": 7.763171519965117e-05, "loss": 0.0112, "step": 2672 }, { "epoch": 1.9109919571045575, "grad_norm": 0.018042748793959618, "learning_rate": 7.761090204071266e-05, "loss": 0.0173, "step": 2673 }, { "epoch": 1.9117068811438784, "grad_norm": 0.018508359789848328, "learning_rate": 7.75900819961873e-05, "loss": 0.0156, "step": 2674 }, { "epoch": 1.9124218051831994, "grad_norm": 0.017403388395905495, "learning_rate": 7.756925507126717e-05, "loss": 0.0184, "step": 2675 }, { "epoch": 1.9124218051831994, "eval_loss": 0.010713834315538406, "eval_runtime": 4.6456, "eval_samples_per_second": 10.763, "eval_steps_per_second": 2.798, "step": 2675 }, { "epoch": 1.91313672922252, "grad_norm": 0.01534940768033266, "learning_rate": 7.754842127114606e-05, "loss": 0.0167, "step": 2676 }, { "epoch": 1.9138516532618408, "grad_norm": 0.016181450337171555, "learning_rate": 7.75275806010195e-05, "loss": 0.0134, "step": 2677 }, { "epoch": 1.9145665773011618, "grad_norm": 0.018006587401032448, "learning_rate": 7.750673306608472e-05, "loss": 0.0159, "step": 2678 }, { "epoch": 1.9152815013404827, "grad_norm": 0.013993702828884125, "learning_rate": 7.748587867154067e-05, "loss": 0.009, "step": 2679 }, { "epoch": 1.9159964253798034, "grad_norm": 0.013075858354568481, "learning_rate": 7.746501742258801e-05, "loss": 0.0134, "step": 2680 }, { "epoch": 1.9159964253798034, "eval_loss": 0.010532299056649208, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 2680 }, { "epoch": 1.9167113494191241, "grad_norm": 0.015181598253548145, "learning_rate": 7.744414932442907e-05, "loss": 0.0217, "step": 2681 }, { "epoch": 1.917426273458445, "grad_norm": 0.014340697787702084, "learning_rate": 7.742327438226796e-05, "loss": 0.0175, "step": 2682 }, { "epoch": 1.918141197497766, "grad_norm": 0.012970870360732079, "learning_rate": 7.740239260131043e-05, "loss": 0.0161, "step": 2683 }, { "epoch": 1.9188561215370867, "grad_norm": 0.016966605558991432, "learning_rate": 7.738150398676397e-05, "loss": 0.0115, "step": 2684 }, { "epoch": 1.9195710455764075, "grad_norm": 0.015321689657866955, "learning_rate": 7.736060854383778e-05, "loss": 0.0123, "step": 2685 }, { "epoch": 1.9195710455764075, "eval_loss": 0.0104335006326437, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 2685 }, { "epoch": 1.9202859696157284, "grad_norm": 0.01697467267513275, "learning_rate": 7.733970627774274e-05, "loss": 0.0136, "step": 2686 }, { "epoch": 1.9210008936550491, "grad_norm": 0.02051052823662758, "learning_rate": 7.731879719369144e-05, "loss": 0.0275, "step": 2687 }, { "epoch": 1.9217158176943698, "grad_norm": 0.015293430536985397, "learning_rate": 7.729788129689819e-05, "loss": 0.0122, "step": 2688 }, { "epoch": 1.9224307417336908, "grad_norm": 0.015704011544585228, "learning_rate": 7.727695859257895e-05, "loss": 0.0085, "step": 2689 }, { "epoch": 1.9231456657730117, "grad_norm": 0.016621513292193413, "learning_rate": 
7.725602908595144e-05, "loss": 0.0106, "step": 2690 }, { "epoch": 1.9231456657730117, "eval_loss": 0.010534669272601604, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 2690 }, { "epoch": 1.9238605898123324, "grad_norm": 0.02275245264172554, "learning_rate": 7.723509278223504e-05, "loss": 0.0151, "step": 2691 }, { "epoch": 1.9245755138516532, "grad_norm": 0.014133294112980366, "learning_rate": 7.72141496866508e-05, "loss": 0.0104, "step": 2692 }, { "epoch": 1.925290437890974, "grad_norm": 0.016548845916986465, "learning_rate": 7.719319980442153e-05, "loss": 0.0089, "step": 2693 }, { "epoch": 1.926005361930295, "grad_norm": 0.015185254625976086, "learning_rate": 7.717224314077169e-05, "loss": 0.0149, "step": 2694 }, { "epoch": 1.9267202859696158, "grad_norm": 0.01571507565677166, "learning_rate": 7.715127970092741e-05, "loss": 0.018, "step": 2695 }, { "epoch": 1.9267202859696158, "eval_loss": 0.010494131594896317, "eval_runtime": 4.6077, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 2695 }, { "epoch": 1.9274352100089365, "grad_norm": 0.011960411444306374, "learning_rate": 7.713030949011659e-05, "loss": 0.0125, "step": 2696 }, { "epoch": 1.9281501340482574, "grad_norm": 0.0138634592294693, "learning_rate": 7.71093325135687e-05, "loss": 0.0171, "step": 2697 }, { "epoch": 1.9288650580875784, "grad_norm": 0.016600294038653374, "learning_rate": 7.708834877651502e-05, "loss": 0.0174, "step": 2698 }, { "epoch": 1.9295799821268989, "grad_norm": 0.014880563132464886, "learning_rate": 7.706735828418842e-05, "loss": 0.0102, "step": 2699 }, { "epoch": 1.9302949061662198, "grad_norm": 0.015171181410551071, "learning_rate": 7.704636104182353e-05, "loss": 0.01, "step": 2700 }, { "epoch": 1.9302949061662198, "eval_loss": 0.010531996376812458, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 2700 }, { "epoch": 1.9310098302055407, "grad_norm": 0.014753127470612526, "learning_rate": 7.70253570546566e-05, "loss": 0.0146, "step": 2701 }, { "epoch": 1.9317247542448615, "grad_norm": 0.0153324194252491, "learning_rate": 7.700434632792559e-05, "loss": 0.0167, "step": 2702 }, { "epoch": 1.9324396782841822, "grad_norm": 0.014685209840536118, "learning_rate": 7.698332886687017e-05, "loss": 0.0143, "step": 2703 }, { "epoch": 1.9331546023235031, "grad_norm": 0.0159513670951128, "learning_rate": 7.696230467673162e-05, "loss": 0.014, "step": 2704 }, { "epoch": 1.933869526362824, "grad_norm": 0.01246909610927105, "learning_rate": 7.694127376275295e-05, "loss": 0.0154, "step": 2705 }, { "epoch": 1.933869526362824, "eval_loss": 0.010459614917635918, "eval_runtime": 4.5785, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 2705 }, { "epoch": 1.9345844504021448, "grad_norm": 0.014152633026242256, "learning_rate": 7.692023613017885e-05, "loss": 0.0204, "step": 2706 }, { "epoch": 1.9352993744414655, "grad_norm": 0.02050825208425522, "learning_rate": 7.689919178425564e-05, "loss": 0.0147, "step": 2707 }, { "epoch": 1.9360142984807864, "grad_norm": 0.014695177786052227, "learning_rate": 7.687814073023139e-05, "loss": 0.0142, "step": 2708 }, { "epoch": 1.9367292225201074, "grad_norm": 0.016353286802768707, "learning_rate": 7.685708297335574e-05, "loss": 0.02, "step": 2709 }, { "epoch": 1.937444146559428, "grad_norm": 0.01966244727373123, "learning_rate": 7.68360185188801e-05, "loss": 0.0157, "step": 2710 }, { "epoch": 1.937444146559428, "eval_loss": 0.01035169418901205, 
"eval_runtime": 4.6246, "eval_samples_per_second": 10.812, "eval_steps_per_second": 2.811, "step": 2710 }, { "epoch": 1.9381590705987488, "grad_norm": 0.013985752128064632, "learning_rate": 7.681494737205747e-05, "loss": 0.0112, "step": 2711 }, { "epoch": 1.9388739946380698, "grad_norm": 0.017079150304198265, "learning_rate": 7.679386953814263e-05, "loss": 0.0193, "step": 2712 }, { "epoch": 1.9395889186773905, "grad_norm": 0.014027983881533146, "learning_rate": 7.677278502239188e-05, "loss": 0.0147, "step": 2713 }, { "epoch": 1.9403038427167112, "grad_norm": 0.02100919559597969, "learning_rate": 7.675169383006329e-05, "loss": 0.0103, "step": 2714 }, { "epoch": 1.9410187667560321, "grad_norm": 0.016083192080259323, "learning_rate": 7.673059596641656e-05, "loss": 0.0156, "step": 2715 }, { "epoch": 1.9410187667560321, "eval_loss": 0.010120912455022335, "eval_runtime": 4.5886, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 2715 }, { "epoch": 1.941733690795353, "grad_norm": 0.013744882307946682, "learning_rate": 7.670949143671306e-05, "loss": 0.0153, "step": 2716 }, { "epoch": 1.9424486148346738, "grad_norm": 0.015043300576508045, "learning_rate": 7.668838024621586e-05, "loss": 0.0113, "step": 2717 }, { "epoch": 1.9431635388739945, "grad_norm": 0.016266686841845512, "learning_rate": 7.666726240018959e-05, "loss": 0.0201, "step": 2718 }, { "epoch": 1.9438784629133155, "grad_norm": 0.011061236262321472, "learning_rate": 7.664613790390064e-05, "loss": 0.0087, "step": 2719 }, { "epoch": 1.9445933869526364, "grad_norm": 0.012733415700495243, "learning_rate": 7.662500676261703e-05, "loss": 0.0102, "step": 2720 }, { "epoch": 1.9445933869526364, "eval_loss": 0.010032758116722107, "eval_runtime": 4.5861, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 2720 }, { "epoch": 1.9453083109919571, "grad_norm": 0.01761515438556671, "learning_rate": 7.660386898160839e-05, "loss": 0.0154, "step": 2721 }, { "epoch": 1.9460232350312778, "grad_norm": 0.014327536337077618, "learning_rate": 7.658272456614607e-05, "loss": 0.0163, "step": 2722 }, { "epoch": 1.9467381590705988, "grad_norm": 0.012856276705861092, "learning_rate": 7.656157352150308e-05, "loss": 0.0113, "step": 2723 }, { "epoch": 1.9474530831099197, "grad_norm": 0.013539072126150131, "learning_rate": 7.6540415852954e-05, "loss": 0.013, "step": 2724 }, { "epoch": 1.9481680071492404, "grad_norm": 0.015856340527534485, "learning_rate": 7.651925156577515e-05, "loss": 0.0127, "step": 2725 }, { "epoch": 1.9481680071492404, "eval_loss": 0.010095743462443352, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 2725 }, { "epoch": 1.9488829311885612, "grad_norm": 0.014959301799535751, "learning_rate": 7.649808066524442e-05, "loss": 0.0111, "step": 2726 }, { "epoch": 1.949597855227882, "grad_norm": 0.012842537835240364, "learning_rate": 7.647690315664144e-05, "loss": 0.0115, "step": 2727 }, { "epoch": 1.9503127792672028, "grad_norm": 0.011557304300367832, "learning_rate": 7.645571904524744e-05, "loss": 0.0127, "step": 2728 }, { "epoch": 1.9510277033065235, "grad_norm": 0.017397956922650337, "learning_rate": 7.643452833634526e-05, "loss": 0.0122, "step": 2729 }, { "epoch": 1.9517426273458445, "grad_norm": 0.015173534862697124, "learning_rate": 7.641333103521946e-05, "loss": 0.0132, "step": 2730 }, { "epoch": 1.9517426273458445, "eval_loss": 0.010042700916528702, "eval_runtime": 4.6, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 2730 }, { 
"epoch": 1.9524575513851654, "grad_norm": 0.014629957266151905, "learning_rate": 7.63921271471562e-05, "loss": 0.0152, "step": 2731 }, { "epoch": 1.9531724754244861, "grad_norm": 0.01557369064539671, "learning_rate": 7.637091667744326e-05, "loss": 0.0149, "step": 2732 }, { "epoch": 1.9538873994638069, "grad_norm": 0.018474621698260307, "learning_rate": 7.634969963137014e-05, "loss": 0.0192, "step": 2733 }, { "epoch": 1.9546023235031278, "grad_norm": 0.014963499270379543, "learning_rate": 7.63284760142279e-05, "loss": 0.0145, "step": 2734 }, { "epoch": 1.9553172475424487, "grad_norm": 0.01874992437660694, "learning_rate": 7.630724583130929e-05, "loss": 0.0208, "step": 2735 }, { "epoch": 1.9553172475424487, "eval_loss": 0.010232435539364815, "eval_runtime": 4.6056, "eval_samples_per_second": 10.856, "eval_steps_per_second": 2.823, "step": 2735 }, { "epoch": 1.9560321715817695, "grad_norm": 0.0168466754257679, "learning_rate": 7.628600908790866e-05, "loss": 0.0148, "step": 2736 }, { "epoch": 1.9567470956210902, "grad_norm": 0.013006910681724548, "learning_rate": 7.626476578932202e-05, "loss": 0.009, "step": 2737 }, { "epoch": 1.9574620196604111, "grad_norm": 0.015537447296082973, "learning_rate": 7.624351594084699e-05, "loss": 0.0243, "step": 2738 }, { "epoch": 1.958176943699732, "grad_norm": 0.01200074516236782, "learning_rate": 7.622225954778289e-05, "loss": 0.0083, "step": 2739 }, { "epoch": 1.9588918677390528, "grad_norm": 0.015852252021431923, "learning_rate": 7.620099661543059e-05, "loss": 0.0168, "step": 2740 }, { "epoch": 1.9588918677390528, "eval_loss": 0.010387305170297623, "eval_runtime": 4.6049, "eval_samples_per_second": 10.858, "eval_steps_per_second": 2.823, "step": 2740 }, { "epoch": 1.9596067917783735, "grad_norm": 0.015163201838731766, "learning_rate": 7.617972714909262e-05, "loss": 0.0124, "step": 2741 }, { "epoch": 1.9603217158176944, "grad_norm": 0.016517173498868942, "learning_rate": 7.615845115407316e-05, "loss": 0.0117, "step": 2742 }, { "epoch": 1.9610366398570152, "grad_norm": 0.014173021540045738, "learning_rate": 7.613716863567799e-05, "loss": 0.0183, "step": 2743 }, { "epoch": 1.9617515638963359, "grad_norm": 0.012549023143947124, "learning_rate": 7.611587959921454e-05, "loss": 0.0091, "step": 2744 }, { "epoch": 1.9624664879356568, "grad_norm": 0.014350423589348793, "learning_rate": 7.609458404999186e-05, "loss": 0.0164, "step": 2745 }, { "epoch": 1.9624664879356568, "eval_loss": 0.010242425836622715, "eval_runtime": 4.5913, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 2745 }, { "epoch": 1.9631814119749778, "grad_norm": 0.011453299783170223, "learning_rate": 7.607328199332058e-05, "loss": 0.0136, "step": 2746 }, { "epoch": 1.9638963360142985, "grad_norm": 0.01539953425526619, "learning_rate": 7.605197343451304e-05, "loss": 0.0181, "step": 2747 }, { "epoch": 1.9646112600536192, "grad_norm": 0.021237298846244812, "learning_rate": 7.603065837888315e-05, "loss": 0.0226, "step": 2748 }, { "epoch": 1.9653261840929401, "grad_norm": 0.01648072712123394, "learning_rate": 7.60093368317464e-05, "loss": 0.0163, "step": 2749 }, { "epoch": 1.966041108132261, "grad_norm": 0.014754453673958778, "learning_rate": 7.598800879841998e-05, "loss": 0.0148, "step": 2750 }, { "epoch": 1.966041108132261, "eval_loss": 0.010265312157571316, "eval_runtime": 4.5918, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 2750 }, { "epoch": 1.9667560321715818, "grad_norm": 0.015377020463347435, "learning_rate": 7.596667428422264e-05, "loss": 0.01, 
"step": 2751 }, { "epoch": 1.9674709562109025, "grad_norm": 0.019535059109330177, "learning_rate": 7.594533329447479e-05, "loss": 0.0239, "step": 2752 }, { "epoch": 1.9681858802502235, "grad_norm": 0.018854480236768723, "learning_rate": 7.59239858344984e-05, "loss": 0.0214, "step": 2753 }, { "epoch": 1.9689008042895444, "grad_norm": 0.015362120233476162, "learning_rate": 7.59026319096171e-05, "loss": 0.0187, "step": 2754 }, { "epoch": 1.9696157283288651, "grad_norm": 0.018100235611200333, "learning_rate": 7.58812715251561e-05, "loss": 0.0141, "step": 2755 }, { "epoch": 1.9696157283288651, "eval_loss": 0.01022448018193245, "eval_runtime": 4.6038, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.824, "step": 2755 }, { "epoch": 1.9703306523681858, "grad_norm": 0.01649276353418827, "learning_rate": 7.585990468644229e-05, "loss": 0.0168, "step": 2756 }, { "epoch": 1.9710455764075068, "grad_norm": 0.018278000876307487, "learning_rate": 7.583853139880406e-05, "loss": 0.0172, "step": 2757 }, { "epoch": 1.9717605004468275, "grad_norm": 0.014170791022479534, "learning_rate": 7.581715166757146e-05, "loss": 0.0147, "step": 2758 }, { "epoch": 1.9724754244861482, "grad_norm": 0.013064163736999035, "learning_rate": 7.579576549807621e-05, "loss": 0.0099, "step": 2759 }, { "epoch": 1.9731903485254692, "grad_norm": 0.015106619335711002, "learning_rate": 7.577437289565154e-05, "loss": 0.02, "step": 2760 }, { "epoch": 1.9731903485254692, "eval_loss": 0.010383247397840023, "eval_runtime": 4.5974, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 2760 }, { "epoch": 1.97390527256479, "grad_norm": 0.013147850520908833, "learning_rate": 7.575297386563232e-05, "loss": 0.0154, "step": 2761 }, { "epoch": 1.9746201966041108, "grad_norm": 0.026584889739751816, "learning_rate": 7.573156841335502e-05, "loss": 0.0197, "step": 2762 }, { "epoch": 1.9753351206434315, "grad_norm": 0.014140826649963856, "learning_rate": 7.571015654415774e-05, "loss": 0.0166, "step": 2763 }, { "epoch": 1.9760500446827525, "grad_norm": 0.013153063133358955, "learning_rate": 7.568873826338014e-05, "loss": 0.0091, "step": 2764 }, { "epoch": 1.9767649687220734, "grad_norm": 0.016707122325897217, "learning_rate": 7.56673135763635e-05, "loss": 0.013, "step": 2765 }, { "epoch": 1.9767649687220734, "eval_loss": 0.010451235808432102, "eval_runtime": 4.5784, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 2765 }, { "epoch": 1.9774798927613941, "grad_norm": 0.017530694603919983, "learning_rate": 7.56458824884507e-05, "loss": 0.0123, "step": 2766 }, { "epoch": 1.9781948168007149, "grad_norm": 0.017658604308962822, "learning_rate": 7.562444500498618e-05, "loss": 0.0207, "step": 2767 }, { "epoch": 1.9789097408400358, "grad_norm": 0.018995752558112144, "learning_rate": 7.560300113131603e-05, "loss": 0.0231, "step": 2768 }, { "epoch": 1.9796246648793567, "grad_norm": 0.012150323018431664, "learning_rate": 7.55815508727879e-05, "loss": 0.0093, "step": 2769 }, { "epoch": 1.9803395889186772, "grad_norm": 0.019069243222475052, "learning_rate": 7.556009423475105e-05, "loss": 0.015, "step": 2770 }, { "epoch": 1.9803395889186772, "eval_loss": 0.010391468182206154, "eval_runtime": 4.5906, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 2770 }, { "epoch": 1.9810545129579982, "grad_norm": 0.01792006567120552, "learning_rate": 7.553863122255633e-05, "loss": 0.0136, "step": 2771 }, { "epoch": 1.9817694369973191, "grad_norm": 0.01616881974041462, "learning_rate": 
7.551716184155614e-05, "loss": 0.0112, "step": 2772 }, { "epoch": 1.9824843610366398, "grad_norm": 0.01632789894938469, "learning_rate": 7.549568609710451e-05, "loss": 0.0179, "step": 2773 }, { "epoch": 1.9831992850759605, "grad_norm": 0.012605365365743637, "learning_rate": 7.547420399455705e-05, "loss": 0.0125, "step": 2774 }, { "epoch": 1.9839142091152815, "grad_norm": 0.014612305909395218, "learning_rate": 7.545271553927095e-05, "loss": 0.0089, "step": 2775 }, { "epoch": 1.9839142091152815, "eval_loss": 0.010303495451807976, "eval_runtime": 4.5904, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 2775 }, { "epoch": 1.9846291331546024, "grad_norm": 0.017018044367432594, "learning_rate": 7.543122073660498e-05, "loss": 0.0117, "step": 2776 }, { "epoch": 1.9853440571939232, "grad_norm": 0.015771152451634407, "learning_rate": 7.540971959191951e-05, "loss": 0.0117, "step": 2777 }, { "epoch": 1.9860589812332439, "grad_norm": 0.015398617833852768, "learning_rate": 7.538821211057648e-05, "loss": 0.0112, "step": 2778 }, { "epoch": 1.9867739052725648, "grad_norm": 0.01748901791870594, "learning_rate": 7.536669829793939e-05, "loss": 0.0206, "step": 2779 }, { "epoch": 1.9874888293118858, "grad_norm": 0.016671577468514442, "learning_rate": 7.534517815937337e-05, "loss": 0.0118, "step": 2780 }, { "epoch": 1.9874888293118858, "eval_loss": 0.010154355317354202, "eval_runtime": 4.5908, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 2780 }, { "epoch": 1.9882037533512065, "grad_norm": 0.016292843967676163, "learning_rate": 7.532365170024506e-05, "loss": 0.0132, "step": 2781 }, { "epoch": 1.9889186773905272, "grad_norm": 0.0191281009465456, "learning_rate": 7.530211892592274e-05, "loss": 0.0229, "step": 2782 }, { "epoch": 1.9896336014298481, "grad_norm": 0.01330595463514328, "learning_rate": 7.528057984177624e-05, "loss": 0.0182, "step": 2783 }, { "epoch": 1.9903485254691688, "grad_norm": 0.01643500104546547, "learning_rate": 7.525903445317694e-05, "loss": 0.0174, "step": 2784 }, { "epoch": 1.9910634495084896, "grad_norm": 0.010827918536961079, "learning_rate": 7.523748276549784e-05, "loss": 0.0083, "step": 2785 }, { "epoch": 1.9910634495084896, "eval_loss": 0.010107134468853474, "eval_runtime": 4.6175, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.815, "step": 2785 }, { "epoch": 1.9917783735478105, "grad_norm": 0.014987809583544731, "learning_rate": 7.521592478411345e-05, "loss": 0.0215, "step": 2786 }, { "epoch": 1.9924932975871315, "grad_norm": 0.01301697175949812, "learning_rate": 7.519436051439991e-05, "loss": 0.0099, "step": 2787 }, { "epoch": 1.9932082216264522, "grad_norm": 0.015688005834817886, "learning_rate": 7.517278996173489e-05, "loss": 0.0251, "step": 2788 }, { "epoch": 1.993923145665773, "grad_norm": 0.015660086646676064, "learning_rate": 7.515121313149767e-05, "loss": 0.0127, "step": 2789 }, { "epoch": 1.9946380697050938, "grad_norm": 0.013669020496308804, "learning_rate": 7.512963002906901e-05, "loss": 0.0117, "step": 2790 }, { "epoch": 1.9946380697050938, "eval_loss": 0.010159878060221672, "eval_runtime": 4.6129, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.818, "step": 2790 }, { "epoch": 1.9953529937444148, "grad_norm": 0.013929364271461964, "learning_rate": 7.510804065983132e-05, "loss": 0.0099, "step": 2791 }, { "epoch": 1.9960679177837355, "grad_norm": 0.015856990590691566, "learning_rate": 7.508644502916857e-05, "loss": 0.0127, "step": 2792 }, { "epoch": 1.9967828418230562, "grad_norm": 
0.01479620672762394, "learning_rate": 7.50648431424662e-05, "loss": 0.0149, "step": 2793 }, { "epoch": 1.9974977658623772, "grad_norm": 0.01875654235482216, "learning_rate": 7.504323500511131e-05, "loss": 0.0163, "step": 2794 }, { "epoch": 1.998212689901698, "grad_norm": 0.014324220828711987, "learning_rate": 7.502162062249251e-05, "loss": 0.0106, "step": 2795 }, { "epoch": 1.998212689901698, "eval_loss": 0.010389679111540318, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2795 }, { "epoch": 1.9989276139410188, "grad_norm": 0.01136006973683834, "learning_rate": 7.500000000000001e-05, "loss": 0.0083, "step": 2796 }, { "epoch": 1.9996425379803395, "grad_norm": 0.025740597397089005, "learning_rate": 7.497837314302551e-05, "loss": 0.0202, "step": 2797 }, { "epoch": 2.0003574620196605, "grad_norm": 0.022506462410092354, "learning_rate": 7.495674005696229e-05, "loss": 0.0182, "step": 2798 }, { "epoch": 2.0010723860589814, "grad_norm": 0.01593755930662155, "learning_rate": 7.493510074720523e-05, "loss": 0.0166, "step": 2799 }, { "epoch": 2.001787310098302, "grad_norm": 0.016513466835021973, "learning_rate": 7.491345521915071e-05, "loss": 0.0121, "step": 2800 }, { "epoch": 2.001787310098302, "eval_loss": 0.010284343734383583, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 2800 }, { "epoch": 2.002502234137623, "grad_norm": 0.01725495420396328, "learning_rate": 7.489180347819668e-05, "loss": 0.0204, "step": 2801 }, { "epoch": 2.003217158176944, "grad_norm": 0.01594533957540989, "learning_rate": 7.487014552974263e-05, "loss": 0.0113, "step": 2802 }, { "epoch": 2.0039320822162647, "grad_norm": 0.01524555403739214, "learning_rate": 7.484848137918959e-05, "loss": 0.0184, "step": 2803 }, { "epoch": 2.0046470062555852, "grad_norm": 0.018527541309595108, "learning_rate": 7.482681103194018e-05, "loss": 0.0115, "step": 2804 }, { "epoch": 2.005361930294906, "grad_norm": 0.013060818426311016, "learning_rate": 7.480513449339851e-05, "loss": 0.0119, "step": 2805 }, { "epoch": 2.005361930294906, "eval_loss": 0.010592029429972172, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 2805 }, { "epoch": 2.006076854334227, "grad_norm": 0.01772230863571167, "learning_rate": 7.478345176897027e-05, "loss": 0.0178, "step": 2806 }, { "epoch": 2.0067917783735476, "grad_norm": 0.018453652039170265, "learning_rate": 7.476176286406269e-05, "loss": 0.0164, "step": 2807 }, { "epoch": 2.0075067024128685, "grad_norm": 0.01647922955453396, "learning_rate": 7.474006778408453e-05, "loss": 0.0199, "step": 2808 }, { "epoch": 2.0082216264521895, "grad_norm": 0.01848197542130947, "learning_rate": 7.471836653444608e-05, "loss": 0.0171, "step": 2809 }, { "epoch": 2.0089365504915104, "grad_norm": 0.016430813819169998, "learning_rate": 7.469665912055919e-05, "loss": 0.0153, "step": 2810 }, { "epoch": 2.0089365504915104, "eval_loss": 0.010632091201841831, "eval_runtime": 4.6164, "eval_samples_per_second": 10.831, "eval_steps_per_second": 2.816, "step": 2810 }, { "epoch": 2.009651474530831, "grad_norm": 0.013028985820710659, "learning_rate": 7.467494554783724e-05, "loss": 0.0097, "step": 2811 }, { "epoch": 2.010366398570152, "grad_norm": 0.02219887636601925, "learning_rate": 7.465322582169516e-05, "loss": 0.0177, "step": 2812 }, { "epoch": 2.011081322609473, "grad_norm": 0.013063288293778896, "learning_rate": 7.463149994754938e-05, "loss": 0.0124, "step": 2813 }, { "epoch": 2.0117962466487938, 
"grad_norm": 0.014423822052776814, "learning_rate": 7.460976793081788e-05, "loss": 0.0177, "step": 2814 }, { "epoch": 2.0125111706881142, "grad_norm": 0.013914966024458408, "learning_rate": 7.458802977692017e-05, "loss": 0.0076, "step": 2815 }, { "epoch": 2.0125111706881142, "eval_loss": 0.010471931658685207, "eval_runtime": 4.6086, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 2815 }, { "epoch": 2.013226094727435, "grad_norm": 0.014096337370574474, "learning_rate": 7.456628549127734e-05, "loss": 0.0138, "step": 2816 }, { "epoch": 2.013941018766756, "grad_norm": 0.020940765738487244, "learning_rate": 7.454453507931192e-05, "loss": 0.0135, "step": 2817 }, { "epoch": 2.014655942806077, "grad_norm": 0.017914384603500366, "learning_rate": 7.452277854644802e-05, "loss": 0.0195, "step": 2818 }, { "epoch": 2.0153708668453976, "grad_norm": 0.01536078192293644, "learning_rate": 7.450101589811127e-05, "loss": 0.0111, "step": 2819 }, { "epoch": 2.0160857908847185, "grad_norm": 0.01389515120536089, "learning_rate": 7.447924713972882e-05, "loss": 0.0144, "step": 2820 }, { "epoch": 2.0160857908847185, "eval_loss": 0.01031541358679533, "eval_runtime": 4.6138, "eval_samples_per_second": 10.837, "eval_steps_per_second": 2.818, "step": 2820 }, { "epoch": 2.0168007149240395, "grad_norm": 0.022122904658317566, "learning_rate": 7.445747227672937e-05, "loss": 0.0176, "step": 2821 }, { "epoch": 2.01751563896336, "grad_norm": 0.017830805853009224, "learning_rate": 7.44356913145431e-05, "loss": 0.0089, "step": 2822 }, { "epoch": 2.018230563002681, "grad_norm": 0.014645189046859741, "learning_rate": 7.441390425860172e-05, "loss": 0.0096, "step": 2823 }, { "epoch": 2.018945487042002, "grad_norm": 0.017935441806912422, "learning_rate": 7.43921111143385e-05, "loss": 0.0142, "step": 2824 }, { "epoch": 2.0196604110813228, "grad_norm": 0.01506642997264862, "learning_rate": 7.437031188718818e-05, "loss": 0.0122, "step": 2825 }, { "epoch": 2.0196604110813228, "eval_loss": 0.0103773083537817, "eval_runtime": 4.5984, "eval_samples_per_second": 10.873, "eval_steps_per_second": 2.827, "step": 2825 }, { "epoch": 2.0203753351206433, "grad_norm": 0.016988664865493774, "learning_rate": 7.434850658258704e-05, "loss": 0.0128, "step": 2826 }, { "epoch": 2.021090259159964, "grad_norm": 0.0123400604352355, "learning_rate": 7.432669520597286e-05, "loss": 0.0073, "step": 2827 }, { "epoch": 2.021805183199285, "grad_norm": 0.01466449350118637, "learning_rate": 7.430487776278497e-05, "loss": 0.0133, "step": 2828 }, { "epoch": 2.022520107238606, "grad_norm": 0.015636177733540535, "learning_rate": 7.428305425846416e-05, "loss": 0.0158, "step": 2829 }, { "epoch": 2.0232350312779266, "grad_norm": 0.01146701443940401, "learning_rate": 7.426122469845277e-05, "loss": 0.0105, "step": 2830 }, { "epoch": 2.0232350312779266, "eval_loss": 0.010141163133084774, "eval_runtime": 4.6143, "eval_samples_per_second": 10.836, "eval_steps_per_second": 2.817, "step": 2830 }, { "epoch": 2.0239499553172475, "grad_norm": 0.0173387099057436, "learning_rate": 7.423938908819465e-05, "loss": 0.0133, "step": 2831 }, { "epoch": 2.0246648793565685, "grad_norm": 0.01859341375529766, "learning_rate": 7.421754743313514e-05, "loss": 0.01, "step": 2832 }, { "epoch": 2.025379803395889, "grad_norm": 0.011983807198703289, "learning_rate": 7.41956997387211e-05, "loss": 0.0114, "step": 2833 }, { "epoch": 2.02609472743521, "grad_norm": 0.01589386910200119, "learning_rate": 7.417384601040089e-05, "loss": 0.0137, "step": 2834 }, { "epoch": 
2.026809651474531, "grad_norm": 0.018019139766693115, "learning_rate": 7.415198625362437e-05, "loss": 0.0131, "step": 2835 }, { "epoch": 2.026809651474531, "eval_loss": 0.010157739743590355, "eval_runtime": 4.6058, "eval_samples_per_second": 10.856, "eval_steps_per_second": 2.823, "step": 2835 }, { "epoch": 2.027524575513852, "grad_norm": 0.01607029139995575, "learning_rate": 7.413012047384292e-05, "loss": 0.0153, "step": 2836 }, { "epoch": 2.0282394995531723, "grad_norm": 0.012113233096897602, "learning_rate": 7.410824867650943e-05, "loss": 0.0064, "step": 2837 }, { "epoch": 2.0289544235924932, "grad_norm": 0.01893399842083454, "learning_rate": 7.408637086707824e-05, "loss": 0.0102, "step": 2838 }, { "epoch": 2.029669347631814, "grad_norm": 0.012716555036604404, "learning_rate": 7.406448705100521e-05, "loss": 0.008, "step": 2839 }, { "epoch": 2.030384271671135, "grad_norm": 0.024347158148884773, "learning_rate": 7.404259723374777e-05, "loss": 0.0145, "step": 2840 }, { "epoch": 2.030384271671135, "eval_loss": 0.01022709347307682, "eval_runtime": 4.6411, "eval_samples_per_second": 10.773, "eval_steps_per_second": 2.801, "step": 2840 }, { "epoch": 2.0310991957104556, "grad_norm": 0.02015512064099312, "learning_rate": 7.402070142076475e-05, "loss": 0.014, "step": 2841 }, { "epoch": 2.0318141197497765, "grad_norm": 0.013607632368803024, "learning_rate": 7.39987996175165e-05, "loss": 0.01, "step": 2842 }, { "epoch": 2.0325290437890975, "grad_norm": 0.03193064406514168, "learning_rate": 7.397689182946489e-05, "loss": 0.0234, "step": 2843 }, { "epoch": 2.0332439678284184, "grad_norm": 0.012918420135974884, "learning_rate": 7.395497806207327e-05, "loss": 0.0075, "step": 2844 }, { "epoch": 2.033958891867739, "grad_norm": 0.02341781184077263, "learning_rate": 7.393305832080649e-05, "loss": 0.0136, "step": 2845 }, { "epoch": 2.033958891867739, "eval_loss": 0.010325373150408268, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 2845 }, { "epoch": 2.03467381590706, "grad_norm": 0.019817622378468513, "learning_rate": 7.391113261113086e-05, "loss": 0.0112, "step": 2846 }, { "epoch": 2.035388739946381, "grad_norm": 0.019822105765342712, "learning_rate": 7.388920093851421e-05, "loss": 0.0125, "step": 2847 }, { "epoch": 2.0361036639857013, "grad_norm": 0.019257042557001114, "learning_rate": 7.386726330842583e-05, "loss": 0.0221, "step": 2848 }, { "epoch": 2.0368185880250222, "grad_norm": 0.016694245859980583, "learning_rate": 7.384531972633653e-05, "loss": 0.0145, "step": 2849 }, { "epoch": 2.037533512064343, "grad_norm": 0.017493506893515587, "learning_rate": 7.382337019771859e-05, "loss": 0.01, "step": 2850 }, { "epoch": 2.037533512064343, "eval_loss": 0.010384286753833294, "eval_runtime": 4.5743, "eval_samples_per_second": 10.931, "eval_steps_per_second": 2.842, "step": 2850 }, { "epoch": 2.038248436103664, "grad_norm": 0.016741806641221046, "learning_rate": 7.380141472804572e-05, "loss": 0.0125, "step": 2851 }, { "epoch": 2.0389633601429846, "grad_norm": 0.021301642060279846, "learning_rate": 7.377945332279322e-05, "loss": 0.0155, "step": 2852 }, { "epoch": 2.0396782841823056, "grad_norm": 0.013177583925426006, "learning_rate": 7.375748598743777e-05, "loss": 0.011, "step": 2853 }, { "epoch": 2.0403932082216265, "grad_norm": 0.025097178295254707, "learning_rate": 7.373551272745756e-05, "loss": 0.0175, "step": 2854 }, { "epoch": 2.0411081322609474, "grad_norm": 0.016154879704117775, "learning_rate": 7.371353354833231e-05, "loss": 0.0083, "step": 2855 }, { 
"epoch": 2.0411081322609474, "eval_loss": 0.010400224477052689, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 2855 }, { "epoch": 2.041823056300268, "grad_norm": 0.022230541333556175, "learning_rate": 7.369154845554314e-05, "loss": 0.0159, "step": 2856 }, { "epoch": 2.042537980339589, "grad_norm": 0.019851122051477432, "learning_rate": 7.366955745457269e-05, "loss": 0.0171, "step": 2857 }, { "epoch": 2.04325290437891, "grad_norm": 0.018974103033542633, "learning_rate": 7.364756055090506e-05, "loss": 0.0189, "step": 2858 }, { "epoch": 2.0439678284182308, "grad_norm": 0.04628434032201767, "learning_rate": 7.36255577500258e-05, "loss": 0.0122, "step": 2859 }, { "epoch": 2.0446827524575513, "grad_norm": 0.011842029169201851, "learning_rate": 7.360354905742197e-05, "loss": 0.0112, "step": 2860 }, { "epoch": 2.0446827524575513, "eval_loss": 0.010545932687819004, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 2860 }, { "epoch": 2.045397676496872, "grad_norm": 0.018658846616744995, "learning_rate": 7.358153447858209e-05, "loss": 0.0134, "step": 2861 }, { "epoch": 2.046112600536193, "grad_norm": 0.019544992595911026, "learning_rate": 7.355951401899613e-05, "loss": 0.0097, "step": 2862 }, { "epoch": 2.0468275245755136, "grad_norm": 0.01928049512207508, "learning_rate": 7.353748768415553e-05, "loss": 0.0153, "step": 2863 }, { "epoch": 2.0475424486148346, "grad_norm": 0.01929263211786747, "learning_rate": 7.35154554795532e-05, "loss": 0.017, "step": 2864 }, { "epoch": 2.0482573726541555, "grad_norm": 0.017008796334266663, "learning_rate": 7.349341741068354e-05, "loss": 0.0146, "step": 2865 }, { "epoch": 2.0482573726541555, "eval_loss": 0.01063214149326086, "eval_runtime": 4.595, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 2865 }, { "epoch": 2.0489722966934765, "grad_norm": 0.014163972809910774, "learning_rate": 7.347137348304237e-05, "loss": 0.0126, "step": 2866 }, { "epoch": 2.049687220732797, "grad_norm": 0.021038558334112167, "learning_rate": 7.344932370212699e-05, "loss": 0.0229, "step": 2867 }, { "epoch": 2.050402144772118, "grad_norm": 0.016537856310606003, "learning_rate": 7.342726807343616e-05, "loss": 0.0159, "step": 2868 }, { "epoch": 2.051117068811439, "grad_norm": 0.018305225297808647, "learning_rate": 7.340520660247008e-05, "loss": 0.0144, "step": 2869 }, { "epoch": 2.05183199285076, "grad_norm": 0.018462780863046646, "learning_rate": 7.338313929473045e-05, "loss": 0.0165, "step": 2870 }, { "epoch": 2.05183199285076, "eval_loss": 0.010821978561580181, "eval_runtime": 4.606, "eval_samples_per_second": 10.855, "eval_steps_per_second": 2.822, "step": 2870 }, { "epoch": 2.0525469168900803, "grad_norm": 0.04650963097810745, "learning_rate": 7.336106615572038e-05, "loss": 0.0141, "step": 2871 }, { "epoch": 2.0532618409294012, "grad_norm": 0.023093990981578827, "learning_rate": 7.333898719094448e-05, "loss": 0.0166, "step": 2872 }, { "epoch": 2.053976764968722, "grad_norm": 0.017307931557297707, "learning_rate": 7.331690240590875e-05, "loss": 0.0085, "step": 2873 }, { "epoch": 2.054691689008043, "grad_norm": 0.016092980280518532, "learning_rate": 7.32948118061207e-05, "loss": 0.0088, "step": 2874 }, { "epoch": 2.0554066130473636, "grad_norm": 0.01823550835251808, "learning_rate": 7.327271539708926e-05, "loss": 0.0175, "step": 2875 }, { "epoch": 2.0554066130473636, "eval_loss": 0.010896469466388226, "eval_runtime": 4.5974, "eval_samples_per_second": 10.876, 
"eval_steps_per_second": 2.828, "step": 2875 }, { "epoch": 2.0561215370866845, "grad_norm": 0.020226923748850822, "learning_rate": 7.325061318432483e-05, "loss": 0.018, "step": 2876 }, { "epoch": 2.0568364611260055, "grad_norm": 0.016039259731769562, "learning_rate": 7.322850517333923e-05, "loss": 0.0148, "step": 2877 }, { "epoch": 2.057551385165326, "grad_norm": 0.018236003816127777, "learning_rate": 7.320639136964576e-05, "loss": 0.0136, "step": 2878 }, { "epoch": 2.058266309204647, "grad_norm": 0.017416536808013916, "learning_rate": 7.31842717787591e-05, "loss": 0.0127, "step": 2879 }, { "epoch": 2.058981233243968, "grad_norm": 0.018147725611925125, "learning_rate": 7.316214640619546e-05, "loss": 0.0198, "step": 2880 }, { "epoch": 2.058981233243968, "eval_loss": 0.010810820385813713, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 2880 }, { "epoch": 2.059696157283289, "grad_norm": 0.021781329065561295, "learning_rate": 7.314001525747244e-05, "loss": 0.0131, "step": 2881 }, { "epoch": 2.0604110813226093, "grad_norm": 0.01431222353130579, "learning_rate": 7.311787833810908e-05, "loss": 0.0074, "step": 2882 }, { "epoch": 2.0611260053619302, "grad_norm": 0.018027139827609062, "learning_rate": 7.309573565362587e-05, "loss": 0.0126, "step": 2883 }, { "epoch": 2.061840929401251, "grad_norm": 0.01954074390232563, "learning_rate": 7.307358720954475e-05, "loss": 0.0142, "step": 2884 }, { "epoch": 2.062555853440572, "grad_norm": 0.015377465635538101, "learning_rate": 7.305143301138908e-05, "loss": 0.0126, "step": 2885 }, { "epoch": 2.062555853440572, "eval_loss": 0.010778198018670082, "eval_runtime": 4.6342, "eval_samples_per_second": 10.789, "eval_steps_per_second": 2.805, "step": 2885 }, { "epoch": 2.0632707774798926, "grad_norm": 0.016130464151501656, "learning_rate": 7.302927306468364e-05, "loss": 0.013, "step": 2886 }, { "epoch": 2.0639857015192136, "grad_norm": 0.021815277636051178, "learning_rate": 7.300710737495466e-05, "loss": 0.0132, "step": 2887 }, { "epoch": 2.0647006255585345, "grad_norm": 0.020189693197607994, "learning_rate": 7.298493594772985e-05, "loss": 0.0099, "step": 2888 }, { "epoch": 2.065415549597855, "grad_norm": 0.019275303930044174, "learning_rate": 7.296275878853825e-05, "loss": 0.009, "step": 2889 }, { "epoch": 2.066130473637176, "grad_norm": 0.01782870478928089, "learning_rate": 7.294057590291042e-05, "loss": 0.0133, "step": 2890 }, { "epoch": 2.066130473637176, "eval_loss": 0.01034427247941494, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 2890 }, { "epoch": 2.066845397676497, "grad_norm": 0.012394948862493038, "learning_rate": 7.291838729637829e-05, "loss": 0.0107, "step": 2891 }, { "epoch": 2.067560321715818, "grad_norm": 0.019523827359080315, "learning_rate": 7.289619297447525e-05, "loss": 0.0213, "step": 2892 }, { "epoch": 2.0682752457551383, "grad_norm": 0.015755435451865196, "learning_rate": 7.287399294273611e-05, "loss": 0.0183, "step": 2893 }, { "epoch": 2.0689901697944593, "grad_norm": 0.01585135981440544, "learning_rate": 7.285178720669707e-05, "loss": 0.0079, "step": 2894 }, { "epoch": 2.06970509383378, "grad_norm": 0.016108229756355286, "learning_rate": 7.282957577189581e-05, "loss": 0.0128, "step": 2895 }, { "epoch": 2.06970509383378, "eval_loss": 0.010344953276216984, "eval_runtime": 4.5933, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 2895 }, { "epoch": 2.070420017873101, "grad_norm": 0.009950278326869011, "learning_rate": 
7.280735864387138e-05, "loss": 0.0102, "step": 2896 }, { "epoch": 2.0711349419124216, "grad_norm": 0.01730411686003208, "learning_rate": 7.27851358281643e-05, "loss": 0.0105, "step": 2897 }, { "epoch": 2.0718498659517426, "grad_norm": 0.013456201180815697, "learning_rate": 7.276290733031645e-05, "loss": 0.0119, "step": 2898 }, { "epoch": 2.0725647899910635, "grad_norm": 0.019675694406032562, "learning_rate": 7.274067315587116e-05, "loss": 0.022, "step": 2899 }, { "epoch": 2.0732797140303845, "grad_norm": 0.015960905700922012, "learning_rate": 7.27184333103732e-05, "loss": 0.008, "step": 2900 }, { "epoch": 2.0732797140303845, "eval_loss": 0.01042988896369934, "eval_runtime": 4.5791, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 2900 }, { "epoch": 2.073994638069705, "grad_norm": 0.019244616851210594, "learning_rate": 7.26961877993687e-05, "loss": 0.0131, "step": 2901 }, { "epoch": 2.074709562109026, "grad_norm": 0.020733274519443512, "learning_rate": 7.267393662840525e-05, "loss": 0.013, "step": 2902 }, { "epoch": 2.075424486148347, "grad_norm": 0.019659366458654404, "learning_rate": 7.265167980303178e-05, "loss": 0.0167, "step": 2903 }, { "epoch": 2.0761394101876673, "grad_norm": 0.019449196755886078, "learning_rate": 7.262941732879876e-05, "loss": 0.0128, "step": 2904 }, { "epoch": 2.0768543342269883, "grad_norm": 0.014336769469082355, "learning_rate": 7.260714921125794e-05, "loss": 0.0106, "step": 2905 }, { "epoch": 2.0768543342269883, "eval_loss": 0.01027598325163126, "eval_runtime": 4.5785, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 2905 }, { "epoch": 2.0775692582663092, "grad_norm": 0.012641250155866146, "learning_rate": 7.258487545596253e-05, "loss": 0.0109, "step": 2906 }, { "epoch": 2.07828418230563, "grad_norm": 0.024489626288414, "learning_rate": 7.256259606846714e-05, "loss": 0.0206, "step": 2907 }, { "epoch": 2.0789991063449507, "grad_norm": 0.018261360004544258, "learning_rate": 7.254031105432781e-05, "loss": 0.0091, "step": 2908 }, { "epoch": 2.0797140303842716, "grad_norm": 0.01443734485656023, "learning_rate": 7.251802041910194e-05, "loss": 0.0115, "step": 2909 }, { "epoch": 2.0804289544235925, "grad_norm": 0.020346196368336678, "learning_rate": 7.249572416834838e-05, "loss": 0.0141, "step": 2910 }, { "epoch": 2.0804289544235925, "eval_loss": 0.01040224265307188, "eval_runtime": 4.5825, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 2910 }, { "epoch": 2.0811438784629135, "grad_norm": 0.018647432327270508, "learning_rate": 7.24734223076273e-05, "loss": 0.0182, "step": 2911 }, { "epoch": 2.081858802502234, "grad_norm": 0.01830330304801464, "learning_rate": 7.245111484250038e-05, "loss": 0.0122, "step": 2912 }, { "epoch": 2.082573726541555, "grad_norm": 0.014368443749845028, "learning_rate": 7.242880177853062e-05, "loss": 0.0157, "step": 2913 }, { "epoch": 2.083288650580876, "grad_norm": 0.016726955771446228, "learning_rate": 7.240648312128242e-05, "loss": 0.0152, "step": 2914 }, { "epoch": 2.084003574620197, "grad_norm": 0.017280394211411476, "learning_rate": 7.23841588763216e-05, "loss": 0.0115, "step": 2915 }, { "epoch": 2.084003574620197, "eval_loss": 0.010449877008795738, "eval_runtime": 4.5794, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 2915 }, { "epoch": 2.0847184986595173, "grad_norm": 0.016593361273407936, "learning_rate": 7.236182904921535e-05, "loss": 0.0139, "step": 2916 }, { "epoch": 2.0854334226988382, "grad_norm": 0.014399289153516293, 
"learning_rate": 7.233949364553232e-05, "loss": 0.0107, "step": 2917 }, { "epoch": 2.086148346738159, "grad_norm": 0.025192998349666595, "learning_rate": 7.231715267084243e-05, "loss": 0.0099, "step": 2918 }, { "epoch": 2.0868632707774797, "grad_norm": 0.014963400550186634, "learning_rate": 7.229480613071708e-05, "loss": 0.0104, "step": 2919 }, { "epoch": 2.0875781948168006, "grad_norm": 0.020565906539559364, "learning_rate": 7.227245403072904e-05, "loss": 0.0198, "step": 2920 }, { "epoch": 2.0875781948168006, "eval_loss": 0.010415429249405861, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 2920 }, { "epoch": 2.0882931188561216, "grad_norm": 0.026527101173996925, "learning_rate": 7.225009637645248e-05, "loss": 0.0156, "step": 2921 }, { "epoch": 2.0890080428954425, "grad_norm": 0.019188061356544495, "learning_rate": 7.222773317346291e-05, "loss": 0.0195, "step": 2922 }, { "epoch": 2.089722966934763, "grad_norm": 0.01723632775247097, "learning_rate": 7.220536442733724e-05, "loss": 0.0106, "step": 2923 }, { "epoch": 2.090437890974084, "grad_norm": 0.018046867102384567, "learning_rate": 7.218299014365379e-05, "loss": 0.0151, "step": 2924 }, { "epoch": 2.091152815013405, "grad_norm": 0.021144768223166466, "learning_rate": 7.216061032799225e-05, "loss": 0.015, "step": 2925 }, { "epoch": 2.091152815013405, "eval_loss": 0.010642929933965206, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 2925 }, { "epoch": 2.091867739052726, "grad_norm": 0.020019041374325752, "learning_rate": 7.213822498593367e-05, "loss": 0.0147, "step": 2926 }, { "epoch": 2.0925826630920463, "grad_norm": 0.018655024468898773, "learning_rate": 7.211583412306048e-05, "loss": 0.01, "step": 2927 }, { "epoch": 2.0932975871313673, "grad_norm": 0.019514458253979683, "learning_rate": 7.209343774495651e-05, "loss": 0.0127, "step": 2928 }, { "epoch": 2.094012511170688, "grad_norm": 0.0189739428460598, "learning_rate": 7.207103585720697e-05, "loss": 0.0129, "step": 2929 }, { "epoch": 2.094727435210009, "grad_norm": 0.024551985785365105, "learning_rate": 7.20486284653984e-05, "loss": 0.0134, "step": 2930 }, { "epoch": 2.094727435210009, "eval_loss": 0.010589702986180782, "eval_runtime": 4.5822, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 2930 }, { "epoch": 2.0954423592493296, "grad_norm": 0.021512934938073158, "learning_rate": 7.202621557511874e-05, "loss": 0.0198, "step": 2931 }, { "epoch": 2.0961572832886506, "grad_norm": 0.014178425073623657, "learning_rate": 7.20037971919573e-05, "loss": 0.0117, "step": 2932 }, { "epoch": 2.0968722073279715, "grad_norm": 0.01696963608264923, "learning_rate": 7.198137332150479e-05, "loss": 0.0108, "step": 2933 }, { "epoch": 2.097587131367292, "grad_norm": 0.017923973500728607, "learning_rate": 7.195894396935324e-05, "loss": 0.0126, "step": 2934 }, { "epoch": 2.098302055406613, "grad_norm": 0.01846279576420784, "learning_rate": 7.193650914109603e-05, "loss": 0.012, "step": 2935 }, { "epoch": 2.098302055406613, "eval_loss": 0.01036645844578743, "eval_runtime": 4.5945, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.829, "step": 2935 }, { "epoch": 2.099016979445934, "grad_norm": 0.018349185585975647, "learning_rate": 7.1914068842328e-05, "loss": 0.0113, "step": 2936 }, { "epoch": 2.099731903485255, "grad_norm": 0.017919247969985008, "learning_rate": 7.189162307864526e-05, "loss": 0.0098, "step": 2937 }, { "epoch": 2.1004468275245753, "grad_norm": 
0.01783485896885395, "learning_rate": 7.186917185564533e-05, "loss": 0.0149, "step": 2938 }, { "epoch": 2.1011617515638963, "grad_norm": 0.013748669996857643, "learning_rate": 7.184671517892707e-05, "loss": 0.0072, "step": 2939 }, { "epoch": 2.1018766756032172, "grad_norm": 0.017035244032740593, "learning_rate": 7.182425305409073e-05, "loss": 0.0171, "step": 2940 }, { "epoch": 2.1018766756032172, "eval_loss": 0.010259153321385384, "eval_runtime": 4.6188, "eval_samples_per_second": 10.825, "eval_steps_per_second": 2.815, "step": 2940 }, { "epoch": 2.102591599642538, "grad_norm": 0.012560413219034672, "learning_rate": 7.180178548673788e-05, "loss": 0.0079, "step": 2941 }, { "epoch": 2.1033065236818587, "grad_norm": 0.01891282945871353, "learning_rate": 7.177931248247148e-05, "loss": 0.0132, "step": 2942 }, { "epoch": 2.1040214477211796, "grad_norm": 0.017384078353643417, "learning_rate": 7.175683404689582e-05, "loss": 0.013, "step": 2943 }, { "epoch": 2.1047363717605005, "grad_norm": 0.016814185306429863, "learning_rate": 7.173435018561654e-05, "loss": 0.0165, "step": 2944 }, { "epoch": 2.1054512957998215, "grad_norm": 0.01899871602654457, "learning_rate": 7.17118609042407e-05, "loss": 0.0128, "step": 2945 }, { "epoch": 2.1054512957998215, "eval_loss": 0.010301359929144382, "eval_runtime": 4.5958, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 2945 }, { "epoch": 2.106166219839142, "grad_norm": 0.016130300238728523, "learning_rate": 7.168936620837661e-05, "loss": 0.0095, "step": 2946 }, { "epoch": 2.106881143878463, "grad_norm": 0.011628339067101479, "learning_rate": 7.166686610363399e-05, "loss": 0.0069, "step": 2947 }, { "epoch": 2.107596067917784, "grad_norm": 0.013788614422082901, "learning_rate": 7.164436059562392e-05, "loss": 0.0148, "step": 2948 }, { "epoch": 2.1083109919571044, "grad_norm": 0.013793330639600754, "learning_rate": 7.162184968995883e-05, "loss": 0.0118, "step": 2949 }, { "epoch": 2.1090259159964253, "grad_norm": 0.013731270097196102, "learning_rate": 7.15993333922524e-05, "loss": 0.0136, "step": 2950 }, { "epoch": 2.1090259159964253, "eval_loss": 0.010355561971664429, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 2950 }, { "epoch": 2.1097408400357462, "grad_norm": 0.016757002100348473, "learning_rate": 7.157681170811978e-05, "loss": 0.0129, "step": 2951 }, { "epoch": 2.110455764075067, "grad_norm": 0.018631404265761375, "learning_rate": 7.155428464317741e-05, "loss": 0.0137, "step": 2952 }, { "epoch": 2.1111706881143877, "grad_norm": 0.015536020509898663, "learning_rate": 7.153175220304305e-05, "loss": 0.0126, "step": 2953 }, { "epoch": 2.1118856121537086, "grad_norm": 0.016663407906889915, "learning_rate": 7.150921439333584e-05, "loss": 0.0155, "step": 2954 }, { "epoch": 2.1126005361930296, "grad_norm": 0.019668668508529663, "learning_rate": 7.148667121967625e-05, "loss": 0.0127, "step": 2955 }, { "epoch": 2.1126005361930296, "eval_loss": 0.010153091512620449, "eval_runtime": 4.5798, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.839, "step": 2955 }, { "epoch": 2.1133154602323505, "grad_norm": 0.016908900812268257, "learning_rate": 7.146412268768605e-05, "loss": 0.0104, "step": 2956 }, { "epoch": 2.114030384271671, "grad_norm": 0.01923348568379879, "learning_rate": 7.144156880298843e-05, "loss": 0.018, "step": 2957 }, { "epoch": 2.114745308310992, "grad_norm": 0.01695333421230316, "learning_rate": 7.14190095712078e-05, "loss": 0.0089, "step": 2958 }, { "epoch": 
2.115460232350313, "grad_norm": 0.02135460078716278, "learning_rate": 7.139644499797e-05, "loss": 0.0163, "step": 2959 }, { "epoch": 2.116175156389634, "grad_norm": 0.019607288762927055, "learning_rate": 7.137387508890218e-05, "loss": 0.0158, "step": 2960 }, { "epoch": 2.116175156389634, "eval_loss": 0.009928950108587742, "eval_runtime": 4.6322, "eval_samples_per_second": 10.794, "eval_steps_per_second": 2.806, "step": 2960 }, { "epoch": 2.1168900804289543, "grad_norm": 0.018155699595808983, "learning_rate": 7.135129984963277e-05, "loss": 0.0122, "step": 2961 }, { "epoch": 2.1176050044682753, "grad_norm": 0.016863422468304634, "learning_rate": 7.132871928579159e-05, "loss": 0.0133, "step": 2962 }, { "epoch": 2.118319928507596, "grad_norm": 0.019775396212935448, "learning_rate": 7.130613340300976e-05, "loss": 0.0161, "step": 2963 }, { "epoch": 2.1190348525469167, "grad_norm": 0.01743755303323269, "learning_rate": 7.128354220691973e-05, "loss": 0.0119, "step": 2964 }, { "epoch": 2.1197497765862376, "grad_norm": 0.017945243045687675, "learning_rate": 7.126094570315526e-05, "loss": 0.0147, "step": 2965 }, { "epoch": 2.1197497765862376, "eval_loss": 0.009932153858244419, "eval_runtime": 4.5874, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 2965 }, { "epoch": 2.1204647006255586, "grad_norm": 0.018510805442929268, "learning_rate": 7.12383438973515e-05, "loss": 0.0135, "step": 2966 }, { "epoch": 2.1211796246648795, "grad_norm": 0.015448021702468395, "learning_rate": 7.121573679514484e-05, "loss": 0.0121, "step": 2967 }, { "epoch": 2.1218945487042, "grad_norm": 0.014798887073993683, "learning_rate": 7.119312440217299e-05, "loss": 0.0152, "step": 2968 }, { "epoch": 2.122609472743521, "grad_norm": 0.01194338034838438, "learning_rate": 7.117050672407507e-05, "loss": 0.0097, "step": 2969 }, { "epoch": 2.123324396782842, "grad_norm": 0.020307593047618866, "learning_rate": 7.114788376649143e-05, "loss": 0.0227, "step": 2970 }, { "epoch": 2.123324396782842, "eval_loss": 0.010046811774373055, "eval_runtime": 4.6145, "eval_samples_per_second": 10.835, "eval_steps_per_second": 2.817, "step": 2970 }, { "epoch": 2.124039320822163, "grad_norm": 0.019273854792118073, "learning_rate": 7.112525553506377e-05, "loss": 0.0185, "step": 2971 }, { "epoch": 2.1247542448614833, "grad_norm": 0.015546942129731178, "learning_rate": 7.11026220354351e-05, "loss": 0.0136, "step": 2972 }, { "epoch": 2.1254691689008043, "grad_norm": 0.016437415033578873, "learning_rate": 7.107998327324976e-05, "loss": 0.0139, "step": 2973 }, { "epoch": 2.1261840929401252, "grad_norm": 0.01803497038781643, "learning_rate": 7.105733925415337e-05, "loss": 0.0123, "step": 2974 }, { "epoch": 2.126899016979446, "grad_norm": 0.013385222293436527, "learning_rate": 7.103468998379288e-05, "loss": 0.0105, "step": 2975 }, { "epoch": 2.126899016979446, "eval_loss": 0.009993726387619972, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 2975 }, { "epoch": 2.1276139410187667, "grad_norm": 0.012239388190209866, "learning_rate": 7.101203546781655e-05, "loss": 0.0147, "step": 2976 }, { "epoch": 2.1283288650580876, "grad_norm": 0.01804666593670845, "learning_rate": 7.098937571187396e-05, "loss": 0.0162, "step": 2977 }, { "epoch": 2.1290437890974085, "grad_norm": 0.017578937113285065, "learning_rate": 7.0966710721616e-05, "loss": 0.0242, "step": 2978 }, { "epoch": 2.129758713136729, "grad_norm": 0.016204817220568657, "learning_rate": 7.09440405026948e-05, "loss": 0.0106, "step": 2979 }, { 
"epoch": 2.13047363717605, "grad_norm": 0.012433252297341824, "learning_rate": 7.092136506076387e-05, "loss": 0.0112, "step": 2980 }, { "epoch": 2.13047363717605, "eval_loss": 0.010094443336129189, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 2980 }, { "epoch": 2.131188561215371, "grad_norm": 0.020138714462518692, "learning_rate": 7.089868440147799e-05, "loss": 0.0115, "step": 2981 }, { "epoch": 2.131903485254692, "grad_norm": 0.017541907727718353, "learning_rate": 7.087599853049328e-05, "loss": 0.0125, "step": 2982 }, { "epoch": 2.1326184092940124, "grad_norm": 0.014821851626038551, "learning_rate": 7.085330745346706e-05, "loss": 0.0101, "step": 2983 }, { "epoch": 2.1333333333333333, "grad_norm": 0.016062945127487183, "learning_rate": 7.083061117605806e-05, "loss": 0.014, "step": 2984 }, { "epoch": 2.1340482573726542, "grad_norm": 0.016823386773467064, "learning_rate": 7.080790970392626e-05, "loss": 0.0107, "step": 2985 }, { "epoch": 2.1340482573726542, "eval_loss": 0.009982681833207607, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 2985 }, { "epoch": 2.1347631814119747, "grad_norm": 0.013316268101334572, "learning_rate": 7.078520304273293e-05, "loss": 0.0086, "step": 2986 }, { "epoch": 2.1354781054512957, "grad_norm": 0.019062906503677368, "learning_rate": 7.076249119814062e-05, "loss": 0.0121, "step": 2987 }, { "epoch": 2.1361930294906166, "grad_norm": 0.023034851998090744, "learning_rate": 7.07397741758132e-05, "loss": 0.0146, "step": 2988 }, { "epoch": 2.1369079535299376, "grad_norm": 0.022965317592024803, "learning_rate": 7.071705198141588e-05, "loss": 0.0085, "step": 2989 }, { "epoch": 2.137622877569258, "grad_norm": 0.012807912193238735, "learning_rate": 7.069432462061505e-05, "loss": 0.0105, "step": 2990 }, { "epoch": 2.137622877569258, "eval_loss": 0.009898060001432896, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 2990 }, { "epoch": 2.138337801608579, "grad_norm": 0.016879089176654816, "learning_rate": 7.067159209907845e-05, "loss": 0.0155, "step": 2991 }, { "epoch": 2.1390527256479, "grad_norm": 0.01707887463271618, "learning_rate": 7.064885442247509e-05, "loss": 0.0147, "step": 2992 }, { "epoch": 2.139767649687221, "grad_norm": 0.018186049535870552, "learning_rate": 7.062611159647532e-05, "loss": 0.0094, "step": 2993 }, { "epoch": 2.1404825737265414, "grad_norm": 0.0186820887029171, "learning_rate": 7.060336362675069e-05, "loss": 0.0114, "step": 2994 }, { "epoch": 2.1411974977658623, "grad_norm": 0.01625782623887062, "learning_rate": 7.058061051897407e-05, "loss": 0.0129, "step": 2995 }, { "epoch": 2.1411974977658623, "eval_loss": 0.00973506085574627, "eval_runtime": 4.5939, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 2995 }, { "epoch": 2.1419124218051833, "grad_norm": 0.0155295729637146, "learning_rate": 7.055785227881963e-05, "loss": 0.0107, "step": 2996 }, { "epoch": 2.142627345844504, "grad_norm": 0.017406875267624855, "learning_rate": 7.05350889119628e-05, "loss": 0.0121, "step": 2997 }, { "epoch": 2.1433422698838247, "grad_norm": 0.01538984663784504, "learning_rate": 7.051232042408029e-05, "loss": 0.0076, "step": 2998 }, { "epoch": 2.1440571939231456, "grad_norm": 0.01655731350183487, "learning_rate": 7.048954682085008e-05, "loss": 0.0136, "step": 2999 }, { "epoch": 2.1447721179624666, "grad_norm": 0.015635129064321518, "learning_rate": 7.046676810795146e-05, "loss": 0.0098, "step": 
3000 }, { "epoch": 2.1447721179624666, "eval_loss": 0.009829813614487648, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 3000 }, { "epoch": 2.145487042001787, "grad_norm": 0.01677563041448593, "learning_rate": 7.044398429106494e-05, "loss": 0.0106, "step": 3001 }, { "epoch": 2.146201966041108, "grad_norm": 0.0155470697209239, "learning_rate": 7.042119537587236e-05, "loss": 0.0095, "step": 3002 }, { "epoch": 2.146916890080429, "grad_norm": 0.022752033546566963, "learning_rate": 7.039840136805679e-05, "loss": 0.0115, "step": 3003 }, { "epoch": 2.14763181411975, "grad_norm": 0.01773679628968239, "learning_rate": 7.037560227330258e-05, "loss": 0.017, "step": 3004 }, { "epoch": 2.1483467381590704, "grad_norm": 0.01975167728960514, "learning_rate": 7.035279809729534e-05, "loss": 0.0161, "step": 3005 }, { "epoch": 2.1483467381590704, "eval_loss": 0.010025224648416042, "eval_runtime": 4.5952, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 3005 }, { "epoch": 2.1490616621983913, "grad_norm": 0.019655002281069756, "learning_rate": 7.032998884572199e-05, "loss": 0.0183, "step": 3006 }, { "epoch": 2.1497765862377123, "grad_norm": 0.01561768539249897, "learning_rate": 7.030717452427066e-05, "loss": 0.0089, "step": 3007 }, { "epoch": 2.1504915102770332, "grad_norm": 0.019992509856820107, "learning_rate": 7.028435513863078e-05, "loss": 0.0138, "step": 3008 }, { "epoch": 2.1512064343163537, "grad_norm": 0.013749406673014164, "learning_rate": 7.026153069449301e-05, "loss": 0.0102, "step": 3009 }, { "epoch": 2.1519213583556747, "grad_norm": 0.01459167804569006, "learning_rate": 7.023870119754934e-05, "loss": 0.0129, "step": 3010 }, { "epoch": 2.1519213583556747, "eval_loss": 0.00998094491660595, "eval_runtime": 4.5954, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 3010 }, { "epoch": 2.1526362823949956, "grad_norm": 0.016756294295191765, "learning_rate": 7.021586665349294e-05, "loss": 0.0116, "step": 3011 }, { "epoch": 2.1533512064343165, "grad_norm": 0.02224813401699066, "learning_rate": 7.019302706801826e-05, "loss": 0.0112, "step": 3012 }, { "epoch": 2.154066130473637, "grad_norm": 0.017907684668898582, "learning_rate": 7.017018244682105e-05, "loss": 0.0114, "step": 3013 }, { "epoch": 2.154781054512958, "grad_norm": 0.01605028286576271, "learning_rate": 7.014733279559828e-05, "loss": 0.017, "step": 3014 }, { "epoch": 2.155495978552279, "grad_norm": 0.014892744831740856, "learning_rate": 7.012447812004816e-05, "loss": 0.0131, "step": 3015 }, { "epoch": 2.155495978552279, "eval_loss": 0.010226205922663212, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 3015 }, { "epoch": 2.1562109025915994, "grad_norm": 0.01661045104265213, "learning_rate": 7.010161842587019e-05, "loss": 0.0112, "step": 3016 }, { "epoch": 2.1569258266309204, "grad_norm": 0.01753130368888378, "learning_rate": 7.00787537187651e-05, "loss": 0.0177, "step": 3017 }, { "epoch": 2.1576407506702413, "grad_norm": 0.017087625339627266, "learning_rate": 7.005588400443487e-05, "loss": 0.012, "step": 3018 }, { "epoch": 2.1583556747095622, "grad_norm": 0.019175851717591286, "learning_rate": 7.003300928858273e-05, "loss": 0.0131, "step": 3019 }, { "epoch": 2.1590705987488827, "grad_norm": 0.02113555371761322, "learning_rate": 7.001012957691317e-05, "loss": 0.0212, "step": 3020 }, { "epoch": 2.1590705987488827, "eval_loss": 0.010114088654518127, "eval_runtime": 4.5956, "eval_samples_per_second": 
10.88, "eval_steps_per_second": 2.829, "step": 3020 }, { "epoch": 2.1597855227882037, "grad_norm": 0.016520004719495773, "learning_rate": 6.998724487513191e-05, "loss": 0.0123, "step": 3021 }, { "epoch": 2.1605004468275246, "grad_norm": 0.020978718996047974, "learning_rate": 6.996435518894593e-05, "loss": 0.0148, "step": 3022 }, { "epoch": 2.1612153708668456, "grad_norm": 0.01586601883172989, "learning_rate": 6.994146052406342e-05, "loss": 0.0077, "step": 3023 }, { "epoch": 2.161930294906166, "grad_norm": 0.019245650619268417, "learning_rate": 6.991856088619386e-05, "loss": 0.0208, "step": 3024 }, { "epoch": 2.162645218945487, "grad_norm": 0.015129555016756058, "learning_rate": 6.989565628104795e-05, "loss": 0.0076, "step": 3025 }, { "epoch": 2.162645218945487, "eval_loss": 0.010254974476993084, "eval_runtime": 4.5878, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 3025 }, { "epoch": 2.163360142984808, "grad_norm": 0.014291026629507542, "learning_rate": 6.98727467143376e-05, "loss": 0.0122, "step": 3026 }, { "epoch": 2.164075067024129, "grad_norm": 0.01796852797269821, "learning_rate": 6.984983219177598e-05, "loss": 0.0135, "step": 3027 }, { "epoch": 2.1647899910634494, "grad_norm": 0.016815420240163803, "learning_rate": 6.982691271907751e-05, "loss": 0.0093, "step": 3028 }, { "epoch": 2.1655049151027703, "grad_norm": 0.015511254779994488, "learning_rate": 6.980398830195785e-05, "loss": 0.0132, "step": 3029 }, { "epoch": 2.1662198391420913, "grad_norm": 0.012681188993155956, "learning_rate": 6.978105894613385e-05, "loss": 0.0079, "step": 3030 }, { "epoch": 2.1662198391420913, "eval_loss": 0.010066603310406208, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3030 }, { "epoch": 2.1669347631814118, "grad_norm": 0.020665330812335014, "learning_rate": 6.97581246573236e-05, "loss": 0.0143, "step": 3031 }, { "epoch": 2.1676496872207327, "grad_norm": 0.019907204434275627, "learning_rate": 6.973518544124646e-05, "loss": 0.0123, "step": 3032 }, { "epoch": 2.1683646112600536, "grad_norm": 0.020120183005928993, "learning_rate": 6.971224130362301e-05, "loss": 0.0183, "step": 3033 }, { "epoch": 2.1690795352993746, "grad_norm": 0.01941312663257122, "learning_rate": 6.9689292250175e-05, "loss": 0.0154, "step": 3034 }, { "epoch": 2.169794459338695, "grad_norm": 0.017957376316189766, "learning_rate": 6.96663382866255e-05, "loss": 0.015, "step": 3035 }, { "epoch": 2.169794459338695, "eval_loss": 0.010083586908876896, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3035 }, { "epoch": 2.170509383378016, "grad_norm": 0.013880457729101181, "learning_rate": 6.964337941869871e-05, "loss": 0.0114, "step": 3036 }, { "epoch": 2.171224307417337, "grad_norm": 0.016864771023392677, "learning_rate": 6.962041565212012e-05, "loss": 0.008, "step": 3037 }, { "epoch": 2.171939231456658, "grad_norm": 0.023968873545527458, "learning_rate": 6.959744699261641e-05, "loss": 0.0123, "step": 3038 }, { "epoch": 2.1726541554959784, "grad_norm": 0.02215608023107052, "learning_rate": 6.957447344591548e-05, "loss": 0.0146, "step": 3039 }, { "epoch": 2.1733690795352993, "grad_norm": 0.02090241201221943, "learning_rate": 6.955149501774646e-05, "loss": 0.0154, "step": 3040 }, { "epoch": 2.1733690795352993, "eval_loss": 0.010462143458425999, "eval_runtime": 4.5862, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 3040 }, { "epoch": 2.1740840035746203, "grad_norm": 0.018165411427617073, 
"learning_rate": 6.952851171383972e-05, "loss": 0.0088, "step": 3041 }, { "epoch": 2.1747989276139412, "grad_norm": 0.016519321128726006, "learning_rate": 6.950552353992678e-05, "loss": 0.0082, "step": 3042 }, { "epoch": 2.1755138516532617, "grad_norm": 0.015902971848845482, "learning_rate": 6.948253050174043e-05, "loss": 0.0107, "step": 3043 }, { "epoch": 2.1762287756925827, "grad_norm": 0.016599606722593307, "learning_rate": 6.945953260501466e-05, "loss": 0.0134, "step": 3044 }, { "epoch": 2.1769436997319036, "grad_norm": 0.01708199642598629, "learning_rate": 6.943652985548467e-05, "loss": 0.0113, "step": 3045 }, { "epoch": 2.1769436997319036, "eval_loss": 0.010325698181986809, "eval_runtime": 4.61, "eval_samples_per_second": 10.846, "eval_steps_per_second": 2.82, "step": 3045 }, { "epoch": 2.177658623771224, "grad_norm": 0.016933774575591087, "learning_rate": 6.941352225888687e-05, "loss": 0.0127, "step": 3046 }, { "epoch": 2.178373547810545, "grad_norm": 0.020485728979110718, "learning_rate": 6.939050982095888e-05, "loss": 0.0154, "step": 3047 }, { "epoch": 2.179088471849866, "grad_norm": 0.01692415028810501, "learning_rate": 6.936749254743951e-05, "loss": 0.0119, "step": 3048 }, { "epoch": 2.179803395889187, "grad_norm": 0.014648730866611004, "learning_rate": 6.934447044406881e-05, "loss": 0.0078, "step": 3049 }, { "epoch": 2.1805183199285074, "grad_norm": 0.018518343567848206, "learning_rate": 6.932144351658801e-05, "loss": 0.0151, "step": 3050 }, { "epoch": 2.1805183199285074, "eval_loss": 0.010192682035267353, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 3050 }, { "epoch": 2.1812332439678284, "grad_norm": 0.012108501978218555, "learning_rate": 6.929841177073953e-05, "loss": 0.0072, "step": 3051 }, { "epoch": 2.1819481680071493, "grad_norm": 0.025235211476683617, "learning_rate": 6.927537521226704e-05, "loss": 0.0133, "step": 3052 }, { "epoch": 2.1826630920464702, "grad_norm": 0.014778101816773415, "learning_rate": 6.925233384691534e-05, "loss": 0.0077, "step": 3053 }, { "epoch": 2.1833780160857907, "grad_norm": 0.020369229838252068, "learning_rate": 6.922928768043053e-05, "loss": 0.0166, "step": 3054 }, { "epoch": 2.1840929401251117, "grad_norm": 0.018023647367954254, "learning_rate": 6.92062367185598e-05, "loss": 0.0129, "step": 3055 }, { "epoch": 2.1840929401251117, "eval_loss": 0.009942298755049706, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 3055 }, { "epoch": 2.1848078641644326, "grad_norm": 0.01570039428770542, "learning_rate": 6.918318096705157e-05, "loss": 0.016, "step": 3056 }, { "epoch": 2.1855227882037536, "grad_norm": 0.014152316376566887, "learning_rate": 6.916012043165552e-05, "loss": 0.0099, "step": 3057 }, { "epoch": 2.186237712243074, "grad_norm": 0.016467120498418808, "learning_rate": 6.913705511812243e-05, "loss": 0.01, "step": 3058 }, { "epoch": 2.186952636282395, "grad_norm": 0.0182468481361866, "learning_rate": 6.911398503220432e-05, "loss": 0.0192, "step": 3059 }, { "epoch": 2.187667560321716, "grad_norm": 0.016344645991921425, "learning_rate": 6.909091017965439e-05, "loss": 0.0097, "step": 3060 }, { "epoch": 2.187667560321716, "eval_loss": 0.009900795295834541, "eval_runtime": 4.6437, "eval_samples_per_second": 10.767, "eval_steps_per_second": 2.799, "step": 3060 }, { "epoch": 2.1883824843610364, "grad_norm": 0.017482930794358253, "learning_rate": 6.906783056622704e-05, "loss": 0.0129, "step": 3061 }, { "epoch": 2.1890974084003574, "grad_norm": 
0.016825906932353973, "learning_rate": 6.904474619767784e-05, "loss": 0.0156, "step": 3062 }, { "epoch": 2.1898123324396783, "grad_norm": 0.015985067933797836, "learning_rate": 6.902165707976354e-05, "loss": 0.0122, "step": 3063 }, { "epoch": 2.1905272564789993, "grad_norm": 0.015850050374865532, "learning_rate": 6.899856321824212e-05, "loss": 0.0119, "step": 3064 }, { "epoch": 2.1912421805183198, "grad_norm": 0.018299292773008347, "learning_rate": 6.897546461887268e-05, "loss": 0.0084, "step": 3065 }, { "epoch": 2.1912421805183198, "eval_loss": 0.010028209537267685, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 3065 }, { "epoch": 2.1919571045576407, "grad_norm": 0.01883627101778984, "learning_rate": 6.895236128741554e-05, "loss": 0.0118, "step": 3066 }, { "epoch": 2.1926720285969616, "grad_norm": 0.01760055497288704, "learning_rate": 6.892925322963221e-05, "loss": 0.012, "step": 3067 }, { "epoch": 2.1933869526362826, "grad_norm": 0.017374027520418167, "learning_rate": 6.890614045128533e-05, "loss": 0.015, "step": 3068 }, { "epoch": 2.194101876675603, "grad_norm": 0.014431554824113846, "learning_rate": 6.888302295813878e-05, "loss": 0.0121, "step": 3069 }, { "epoch": 2.194816800714924, "grad_norm": 0.01727464236319065, "learning_rate": 6.885990075595756e-05, "loss": 0.0179, "step": 3070 }, { "epoch": 2.194816800714924, "eval_loss": 0.010155171155929565, "eval_runtime": 4.6007, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 3070 }, { "epoch": 2.195531724754245, "grad_norm": 0.012992617674171925, "learning_rate": 6.883677385050789e-05, "loss": 0.0116, "step": 3071 }, { "epoch": 2.196246648793566, "grad_norm": 0.017002295702695847, "learning_rate": 6.881364224755709e-05, "loss": 0.0108, "step": 3072 }, { "epoch": 2.1969615728328864, "grad_norm": 0.015580698847770691, "learning_rate": 6.879050595287378e-05, "loss": 0.0095, "step": 3073 }, { "epoch": 2.1976764968722073, "grad_norm": 0.018373090773820877, "learning_rate": 6.876736497222763e-05, "loss": 0.01, "step": 3074 }, { "epoch": 2.1983914209115283, "grad_norm": 0.019792910665273666, "learning_rate": 6.874421931138949e-05, "loss": 0.0107, "step": 3075 }, { "epoch": 2.1983914209115283, "eval_loss": 0.01001086737960577, "eval_runtime": 4.6306, "eval_samples_per_second": 10.798, "eval_steps_per_second": 2.807, "step": 3075 }, { "epoch": 2.199106344950849, "grad_norm": 0.015774035826325417, "learning_rate": 6.872106897613147e-05, "loss": 0.0093, "step": 3076 }, { "epoch": 2.1998212689901697, "grad_norm": 0.02174397185444832, "learning_rate": 6.869791397222674e-05, "loss": 0.0172, "step": 3077 }, { "epoch": 2.2005361930294907, "grad_norm": 0.016149967908859253, "learning_rate": 6.867475430544971e-05, "loss": 0.0144, "step": 3078 }, { "epoch": 2.2012511170688116, "grad_norm": 0.015423720702528954, "learning_rate": 6.86515899815759e-05, "loss": 0.0077, "step": 3079 }, { "epoch": 2.201966041108132, "grad_norm": 0.01851598359644413, "learning_rate": 6.8628421006382e-05, "loss": 0.0238, "step": 3080 }, { "epoch": 2.201966041108132, "eval_loss": 0.010015136562287807, "eval_runtime": 4.656, "eval_samples_per_second": 10.739, "eval_steps_per_second": 2.792, "step": 3080 }, { "epoch": 2.202680965147453, "grad_norm": 0.023705413565039635, "learning_rate": 6.860524738564592e-05, "loss": 0.0152, "step": 3081 }, { "epoch": 2.203395889186774, "grad_norm": 0.018077727407217026, "learning_rate": 6.858206912514665e-05, "loss": 0.0127, "step": 3082 }, { "epoch": 2.204110813226095, 
"grad_norm": 0.01674989052116871, "learning_rate": 6.855888623066434e-05, "loss": 0.0167, "step": 3083 }, { "epoch": 2.2048257372654154, "grad_norm": 0.018090294674038887, "learning_rate": 6.853569870798036e-05, "loss": 0.0114, "step": 3084 }, { "epoch": 2.2055406613047364, "grad_norm": 0.018888697028160095, "learning_rate": 6.85125065628772e-05, "loss": 0.0125, "step": 3085 }, { "epoch": 2.2055406613047364, "eval_loss": 0.010086395777761936, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3085 }, { "epoch": 2.2062555853440573, "grad_norm": 0.022591065615415573, "learning_rate": 6.848930980113848e-05, "loss": 0.0192, "step": 3086 }, { "epoch": 2.2069705093833782, "grad_norm": 0.016423305496573448, "learning_rate": 6.846610842854901e-05, "loss": 0.0125, "step": 3087 }, { "epoch": 2.2076854334226987, "grad_norm": 0.018129687756299973, "learning_rate": 6.844290245089473e-05, "loss": 0.0119, "step": 3088 }, { "epoch": 2.2084003574620197, "grad_norm": 0.0190680343657732, "learning_rate": 6.841969187396271e-05, "loss": 0.012, "step": 3089 }, { "epoch": 2.2091152815013406, "grad_norm": 0.020276609808206558, "learning_rate": 6.83964767035412e-05, "loss": 0.0094, "step": 3090 }, { "epoch": 2.2091152815013406, "eval_loss": 0.01005486585199833, "eval_runtime": 4.6019, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 3090 }, { "epoch": 2.209830205540661, "grad_norm": 0.01541316881775856, "learning_rate": 6.837325694541959e-05, "loss": 0.0118, "step": 3091 }, { "epoch": 2.210545129579982, "grad_norm": 0.019309567287564278, "learning_rate": 6.83500326053884e-05, "loss": 0.0144, "step": 3092 }, { "epoch": 2.211260053619303, "grad_norm": 0.019109319895505905, "learning_rate": 6.832680368923929e-05, "loss": 0.0107, "step": 3093 }, { "epoch": 2.211974977658624, "grad_norm": 0.018425550311803818, "learning_rate": 6.830357020276509e-05, "loss": 0.0197, "step": 3094 }, { "epoch": 2.2126899016979444, "grad_norm": 0.011846463195979595, "learning_rate": 6.828033215175974e-05, "loss": 0.0073, "step": 3095 }, { "epoch": 2.2126899016979444, "eval_loss": 0.010147897526621819, "eval_runtime": 4.5915, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 3095 }, { "epoch": 2.2134048257372654, "grad_norm": 0.01785621978342533, "learning_rate": 6.825708954201831e-05, "loss": 0.0083, "step": 3096 }, { "epoch": 2.2141197497765863, "grad_norm": 0.01443384774029255, "learning_rate": 6.823384237933706e-05, "loss": 0.0093, "step": 3097 }, { "epoch": 2.2148346738159073, "grad_norm": 0.015467622317373753, "learning_rate": 6.821059066951331e-05, "loss": 0.0136, "step": 3098 }, { "epoch": 2.2155495978552278, "grad_norm": 0.013993385247886181, "learning_rate": 6.818733441834561e-05, "loss": 0.0143, "step": 3099 }, { "epoch": 2.2162645218945487, "grad_norm": 0.013736213557422161, "learning_rate": 6.816407363163354e-05, "loss": 0.0064, "step": 3100 }, { "epoch": 2.2162645218945487, "eval_loss": 0.009984134696424007, "eval_runtime": 4.5949, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 3100 }, { "epoch": 2.2169794459338696, "grad_norm": 0.017630426213145256, "learning_rate": 6.814080831517787e-05, "loss": 0.0153, "step": 3101 }, { "epoch": 2.2176943699731906, "grad_norm": 0.013127855025231838, "learning_rate": 6.811753847478051e-05, "loss": 0.0106, "step": 3102 }, { "epoch": 2.218409294012511, "grad_norm": 0.017087023705244064, "learning_rate": 6.809426411624443e-05, "loss": 0.0164, "step": 3103 }, { "epoch": 
2.219124218051832, "grad_norm": 0.0164384413510561, "learning_rate": 6.807098524537381e-05, "loss": 0.0126, "step": 3104 }, { "epoch": 2.219839142091153, "grad_norm": 0.02815346233546734, "learning_rate": 6.804770186797392e-05, "loss": 0.0158, "step": 3105 }, { "epoch": 2.219839142091153, "eval_loss": 0.009989176876842976, "eval_runtime": 4.6098, "eval_samples_per_second": 10.847, "eval_steps_per_second": 2.82, "step": 3105 }, { "epoch": 2.2205540661304735, "grad_norm": 0.013883410952985287, "learning_rate": 6.802441398985113e-05, "loss": 0.0086, "step": 3106 }, { "epoch": 2.2212689901697944, "grad_norm": 0.014530137181282043, "learning_rate": 6.800112161681296e-05, "loss": 0.0086, "step": 3107 }, { "epoch": 2.2219839142091153, "grad_norm": 0.01732686534523964, "learning_rate": 6.797782475466806e-05, "loss": 0.008, "step": 3108 }, { "epoch": 2.2226988382484363, "grad_norm": 0.01644459180533886, "learning_rate": 6.795452340922616e-05, "loss": 0.0153, "step": 3109 }, { "epoch": 2.223413762287757, "grad_norm": 0.014836164191365242, "learning_rate": 6.793121758629817e-05, "loss": 0.0105, "step": 3110 }, { "epoch": 2.223413762287757, "eval_loss": 0.009990953840315342, "eval_runtime": 4.6111, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 3110 }, { "epoch": 2.2241286863270777, "grad_norm": 0.015164658427238464, "learning_rate": 6.790790729169604e-05, "loss": 0.0118, "step": 3111 }, { "epoch": 2.2248436103663987, "grad_norm": 0.01842506043612957, "learning_rate": 6.788459253123289e-05, "loss": 0.012, "step": 3112 }, { "epoch": 2.225558534405719, "grad_norm": 0.016241341829299927, "learning_rate": 6.786127331072294e-05, "loss": 0.0113, "step": 3113 }, { "epoch": 2.22627345844504, "grad_norm": 0.021502694115042686, "learning_rate": 6.783794963598153e-05, "loss": 0.0091, "step": 3114 }, { "epoch": 2.226988382484361, "grad_norm": 0.01938605308532715, "learning_rate": 6.781462151282508e-05, "loss": 0.014, "step": 3115 }, { "epoch": 2.226988382484361, "eval_loss": 0.010039269924163818, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 3115 }, { "epoch": 2.227703306523682, "grad_norm": 0.0193506870418787, "learning_rate": 6.779128894707115e-05, "loss": 0.0145, "step": 3116 }, { "epoch": 2.2284182305630025, "grad_norm": 0.01932734251022339, "learning_rate": 6.77679519445384e-05, "loss": 0.0148, "step": 3117 }, { "epoch": 2.2291331546023234, "grad_norm": 0.014081226661801338, "learning_rate": 6.774461051104662e-05, "loss": 0.0122, "step": 3118 }, { "epoch": 2.2298480786416444, "grad_norm": 0.016871746629476547, "learning_rate": 6.772126465241662e-05, "loss": 0.012, "step": 3119 }, { "epoch": 2.2305630026809653, "grad_norm": 0.02446085959672928, "learning_rate": 6.769791437447041e-05, "loss": 0.0136, "step": 3120 }, { "epoch": 2.2305630026809653, "eval_loss": 0.01007230207324028, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3120 }, { "epoch": 2.231277926720286, "grad_norm": 0.013774069957435131, "learning_rate": 6.767455968303107e-05, "loss": 0.0096, "step": 3121 }, { "epoch": 2.2319928507596067, "grad_norm": 0.018608558923006058, "learning_rate": 6.765120058392278e-05, "loss": 0.0229, "step": 3122 }, { "epoch": 2.2327077747989277, "grad_norm": 0.017187783494591713, "learning_rate": 6.76278370829708e-05, "loss": 0.0185, "step": 3123 }, { "epoch": 2.2334226988382486, "grad_norm": 0.012246660888195038, "learning_rate": 6.76044691860015e-05, "loss": 0.0097, "step": 3124 }, { 
"epoch": 2.234137622877569, "grad_norm": 0.018777592107653618, "learning_rate": 6.758109689884236e-05, "loss": 0.0115, "step": 3125 }, { "epoch": 2.234137622877569, "eval_loss": 0.010109632275998592, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 3125 }, { "epoch": 2.23485254691689, "grad_norm": 0.017879802733659744, "learning_rate": 6.755772022732195e-05, "loss": 0.0111, "step": 3126 }, { "epoch": 2.235567470956211, "grad_norm": 0.017339229583740234, "learning_rate": 6.753433917726991e-05, "loss": 0.011, "step": 3127 }, { "epoch": 2.2362823949955315, "grad_norm": 0.0199753288179636, "learning_rate": 6.7510953754517e-05, "loss": 0.0192, "step": 3128 }, { "epoch": 2.2369973190348524, "grad_norm": 0.018676964566111565, "learning_rate": 6.748756396489506e-05, "loss": 0.0097, "step": 3129 }, { "epoch": 2.2377122430741734, "grad_norm": 0.01942336931824684, "learning_rate": 6.746416981423702e-05, "loss": 0.0138, "step": 3130 }, { "epoch": 2.2377122430741734, "eval_loss": 0.010111427865922451, "eval_runtime": 4.5981, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 3130 }, { "epoch": 2.2384271671134943, "grad_norm": 0.01814490184187889, "learning_rate": 6.744077130837687e-05, "loss": 0.0085, "step": 3131 }, { "epoch": 2.239142091152815, "grad_norm": 0.02433619275689125, "learning_rate": 6.741736845314977e-05, "loss": 0.015, "step": 3132 }, { "epoch": 2.2398570151921358, "grad_norm": 0.020878614857792854, "learning_rate": 6.739396125439184e-05, "loss": 0.0134, "step": 3133 }, { "epoch": 2.2405719392314567, "grad_norm": 0.018903259187936783, "learning_rate": 6.73705497179404e-05, "loss": 0.0163, "step": 3134 }, { "epoch": 2.2412868632707776, "grad_norm": 0.016467466950416565, "learning_rate": 6.734713384963379e-05, "loss": 0.0179, "step": 3135 }, { "epoch": 2.2412868632707776, "eval_loss": 0.009839136153459549, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3135 }, { "epoch": 2.242001787310098, "grad_norm": 0.015827175229787827, "learning_rate": 6.732371365531143e-05, "loss": 0.0086, "step": 3136 }, { "epoch": 2.242716711349419, "grad_norm": 0.014420528896152973, "learning_rate": 6.730028914081384e-05, "loss": 0.0099, "step": 3137 }, { "epoch": 2.24343163538874, "grad_norm": 0.01984776183962822, "learning_rate": 6.727686031198262e-05, "loss": 0.013, "step": 3138 }, { "epoch": 2.244146559428061, "grad_norm": 0.01857919618487358, "learning_rate": 6.72534271746604e-05, "loss": 0.0111, "step": 3139 }, { "epoch": 2.2448614834673815, "grad_norm": 0.016841834411025047, "learning_rate": 6.722998973469096e-05, "loss": 0.0143, "step": 3140 }, { "epoch": 2.2448614834673815, "eval_loss": 0.009801780804991722, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 3140 }, { "epoch": 2.2455764075067024, "grad_norm": 0.014884110540151596, "learning_rate": 6.720654799791908e-05, "loss": 0.0077, "step": 3141 }, { "epoch": 2.2462913315460233, "grad_norm": 0.017427101731300354, "learning_rate": 6.718310197019066e-05, "loss": 0.0148, "step": 3142 }, { "epoch": 2.247006255585344, "grad_norm": 0.020451853051781654, "learning_rate": 6.715965165735266e-05, "loss": 0.0127, "step": 3143 }, { "epoch": 2.2477211796246648, "grad_norm": 0.022063354030251503, "learning_rate": 6.713619706525307e-05, "loss": 0.0198, "step": 3144 }, { "epoch": 2.2484361036639857, "grad_norm": 0.02093210071325302, "learning_rate": 6.7112738199741e-05, "loss": 0.0138, "step": 3145 
}, { "epoch": 2.2484361036639857, "eval_loss": 0.009911084547638893, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 3145 }, { "epoch": 2.2491510277033067, "grad_norm": 0.01695036143064499, "learning_rate": 6.708927506666664e-05, "loss": 0.0103, "step": 3146 }, { "epoch": 2.249865951742627, "grad_norm": 0.01817784272134304, "learning_rate": 6.706580767188115e-05, "loss": 0.0152, "step": 3147 }, { "epoch": 2.250580875781948, "grad_norm": 0.019394846633076668, "learning_rate": 6.704233602123685e-05, "loss": 0.0116, "step": 3148 }, { "epoch": 2.251295799821269, "grad_norm": 0.015593942254781723, "learning_rate": 6.701886012058706e-05, "loss": 0.0135, "step": 3149 }, { "epoch": 2.25201072386059, "grad_norm": 0.019808832556009293, "learning_rate": 6.69953799757862e-05, "loss": 0.0197, "step": 3150 }, { "epoch": 2.25201072386059, "eval_loss": 0.009672330692410469, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 3150 }, { "epoch": 2.2527256478999105, "grad_norm": 0.02198665402829647, "learning_rate": 6.697189559268972e-05, "loss": 0.0086, "step": 3151 }, { "epoch": 2.2534405719392314, "grad_norm": 0.018736548721790314, "learning_rate": 6.694840697715415e-05, "loss": 0.0133, "step": 3152 }, { "epoch": 2.2541554959785524, "grad_norm": 0.023304982110857964, "learning_rate": 6.692491413503705e-05, "loss": 0.0157, "step": 3153 }, { "epoch": 2.2548704200178733, "grad_norm": 0.015394008718430996, "learning_rate": 6.690141707219705e-05, "loss": 0.0125, "step": 3154 }, { "epoch": 2.255585344057194, "grad_norm": 0.014510693028569221, "learning_rate": 6.687791579449384e-05, "loss": 0.0109, "step": 3155 }, { "epoch": 2.255585344057194, "eval_loss": 0.009618471376597881, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3155 }, { "epoch": 2.2563002680965147, "grad_norm": 0.017935093492269516, "learning_rate": 6.685441030778817e-05, "loss": 0.0152, "step": 3156 }, { "epoch": 2.2570151921358357, "grad_norm": 0.012564858421683311, "learning_rate": 6.683090061794178e-05, "loss": 0.0127, "step": 3157 }, { "epoch": 2.257730116175156, "grad_norm": 0.015331783331930637, "learning_rate": 6.680738673081752e-05, "loss": 0.0267, "step": 3158 }, { "epoch": 2.258445040214477, "grad_norm": 0.01894300989806652, "learning_rate": 6.678386865227928e-05, "loss": 0.0103, "step": 3159 }, { "epoch": 2.259159964253798, "grad_norm": 0.018628042191267014, "learning_rate": 6.676034638819197e-05, "loss": 0.0115, "step": 3160 }, { "epoch": 2.259159964253798, "eval_loss": 0.009664652869105339, "eval_runtime": 4.5955, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 3160 }, { "epoch": 2.259874888293119, "grad_norm": 0.019381757825613022, "learning_rate": 6.673681994442153e-05, "loss": 0.0161, "step": 3161 }, { "epoch": 2.2605898123324395, "grad_norm": 0.018019860610365868, "learning_rate": 6.671328932683499e-05, "loss": 0.0119, "step": 3162 }, { "epoch": 2.2613047363717604, "grad_norm": 0.013938314281404018, "learning_rate": 6.668975454130042e-05, "loss": 0.0115, "step": 3163 }, { "epoch": 2.2620196604110814, "grad_norm": 0.01897287555038929, "learning_rate": 6.666621559368687e-05, "loss": 0.0175, "step": 3164 }, { "epoch": 2.2627345844504023, "grad_norm": 0.02305835857987404, "learning_rate": 6.664267248986447e-05, "loss": 0.0133, "step": 3165 }, { "epoch": 2.2627345844504023, "eval_loss": 0.009602353908121586, "eval_runtime": 4.5974, "eval_samples_per_second": 
10.876, "eval_steps_per_second": 2.828, "step": 3165 }, { "epoch": 2.263449508489723, "grad_norm": 0.020935140550136566, "learning_rate": 6.661912523570441e-05, "loss": 0.0144, "step": 3166 }, { "epoch": 2.2641644325290438, "grad_norm": 0.013003851287066936, "learning_rate": 6.659557383707887e-05, "loss": 0.0073, "step": 3167 }, { "epoch": 2.2648793565683647, "grad_norm": 0.015069188550114632, "learning_rate": 6.65720182998611e-05, "loss": 0.0077, "step": 3168 }, { "epoch": 2.2655942806076856, "grad_norm": 0.019950054585933685, "learning_rate": 6.654845862992531e-05, "loss": 0.0124, "step": 3169 }, { "epoch": 2.266309204647006, "grad_norm": 0.016726158559322357, "learning_rate": 6.652489483314686e-05, "loss": 0.0099, "step": 3170 }, { "epoch": 2.266309204647006, "eval_loss": 0.009500124491751194, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 3170 }, { "epoch": 2.267024128686327, "grad_norm": 0.01836283691227436, "learning_rate": 6.650132691540204e-05, "loss": 0.0118, "step": 3171 }, { "epoch": 2.267739052725648, "grad_norm": 0.014882808551192284, "learning_rate": 6.647775488256818e-05, "loss": 0.0103, "step": 3172 }, { "epoch": 2.2684539767649685, "grad_norm": 0.019425077363848686, "learning_rate": 6.645417874052368e-05, "loss": 0.0153, "step": 3173 }, { "epoch": 2.2691689008042895, "grad_norm": 0.014919253066182137, "learning_rate": 6.643059849514794e-05, "loss": 0.013, "step": 3174 }, { "epoch": 2.2698838248436104, "grad_norm": 0.014405988156795502, "learning_rate": 6.64070141523214e-05, "loss": 0.0098, "step": 3175 }, { "epoch": 2.2698838248436104, "eval_loss": 0.009611258283257484, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 3175 }, { "epoch": 2.2705987488829313, "grad_norm": 0.014156864024698734, "learning_rate": 6.638342571792549e-05, "loss": 0.0077, "step": 3176 }, { "epoch": 2.271313672922252, "grad_norm": 0.018733222037553787, "learning_rate": 6.635983319784265e-05, "loss": 0.0111, "step": 3177 }, { "epoch": 2.2720285969615728, "grad_norm": 0.017615923658013344, "learning_rate": 6.633623659795642e-05, "loss": 0.0108, "step": 3178 }, { "epoch": 2.2727435210008937, "grad_norm": 0.01639215461909771, "learning_rate": 6.631263592415127e-05, "loss": 0.0127, "step": 3179 }, { "epoch": 2.2734584450402147, "grad_norm": 0.02167407236993313, "learning_rate": 6.628903118231273e-05, "loss": 0.0164, "step": 3180 }, { "epoch": 2.2734584450402147, "eval_loss": 0.009652877226471901, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 3180 }, { "epoch": 2.274173369079535, "grad_norm": 0.020877394825220108, "learning_rate": 6.626542237832732e-05, "loss": 0.0126, "step": 3181 }, { "epoch": 2.274888293118856, "grad_norm": 0.018046122044324875, "learning_rate": 6.62418095180826e-05, "loss": 0.0108, "step": 3182 }, { "epoch": 2.275603217158177, "grad_norm": 0.013074154034256935, "learning_rate": 6.621819260746712e-05, "loss": 0.0069, "step": 3183 }, { "epoch": 2.276318141197498, "grad_norm": 0.01898675225675106, "learning_rate": 6.619457165237046e-05, "loss": 0.0092, "step": 3184 }, { "epoch": 2.2770330652368185, "grad_norm": 0.02012728340923786, "learning_rate": 6.617094665868318e-05, "loss": 0.0101, "step": 3185 }, { "epoch": 2.2770330652368185, "eval_loss": 0.009880658239126205, "eval_runtime": 4.6192, "eval_samples_per_second": 10.824, "eval_steps_per_second": 2.814, "step": 3185 }, { "epoch": 2.2777479892761394, "grad_norm": 0.017324084416031837, 
"learning_rate": 6.614731763229685e-05, "loss": 0.0133, "step": 3186 }, { "epoch": 2.2784629133154604, "grad_norm": 0.020245078951120377, "learning_rate": 6.612368457910411e-05, "loss": 0.0176, "step": 3187 }, { "epoch": 2.279177837354781, "grad_norm": 0.014084434136748314, "learning_rate": 6.610004750499853e-05, "loss": 0.0085, "step": 3188 }, { "epoch": 2.279892761394102, "grad_norm": 0.014876192435622215, "learning_rate": 6.607640641587468e-05, "loss": 0.008, "step": 3189 }, { "epoch": 2.2806076854334227, "grad_norm": 0.019076062366366386, "learning_rate": 6.60527613176282e-05, "loss": 0.0211, "step": 3190 }, { "epoch": 2.2806076854334227, "eval_loss": 0.009866029024124146, "eval_runtime": 4.5787, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 3190 }, { "epoch": 2.2813226094727437, "grad_norm": 0.015055665746331215, "learning_rate": 6.602911221615567e-05, "loss": 0.013, "step": 3191 }, { "epoch": 2.282037533512064, "grad_norm": 0.025397367775440216, "learning_rate": 6.600545911735468e-05, "loss": 0.016, "step": 3192 }, { "epoch": 2.282752457551385, "grad_norm": 0.01860181987285614, "learning_rate": 6.59818020271238e-05, "loss": 0.0134, "step": 3193 }, { "epoch": 2.283467381590706, "grad_norm": 0.01817397028207779, "learning_rate": 6.595814095136266e-05, "loss": 0.0093, "step": 3194 }, { "epoch": 2.284182305630027, "grad_norm": 0.01892389915883541, "learning_rate": 6.593447589597184e-05, "loss": 0.0081, "step": 3195 }, { "epoch": 2.284182305630027, "eval_loss": 0.009824429638683796, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 3195 }, { "epoch": 2.2848972296693475, "grad_norm": 0.013383728452026844, "learning_rate": 6.59108068668529e-05, "loss": 0.0073, "step": 3196 }, { "epoch": 2.2856121537086684, "grad_norm": 0.02202701009809971, "learning_rate": 6.588713386990836e-05, "loss": 0.0155, "step": 3197 }, { "epoch": 2.2863270777479894, "grad_norm": 0.018851755186915398, "learning_rate": 6.586345691104187e-05, "loss": 0.0138, "step": 3198 }, { "epoch": 2.2870420017873103, "grad_norm": 0.020024498924613, "learning_rate": 6.583977599615792e-05, "loss": 0.0128, "step": 3199 }, { "epoch": 2.287756925826631, "grad_norm": 0.01245646458119154, "learning_rate": 6.581609113116203e-05, "loss": 0.0073, "step": 3200 }, { "epoch": 2.287756925826631, "eval_loss": 0.009934240952134132, "eval_runtime": 4.604, "eval_samples_per_second": 10.86, "eval_steps_per_second": 2.824, "step": 3200 }, { "epoch": 2.2884718498659518, "grad_norm": 0.018188104033470154, "learning_rate": 6.579240232196072e-05, "loss": 0.013, "step": 3201 }, { "epoch": 2.2891867739052727, "grad_norm": 0.02027604728937149, "learning_rate": 6.576870957446151e-05, "loss": 0.017, "step": 3202 }, { "epoch": 2.289901697944593, "grad_norm": 0.01758650504052639, "learning_rate": 6.574501289457286e-05, "loss": 0.012, "step": 3203 }, { "epoch": 2.290616621983914, "grad_norm": 0.015166034922003746, "learning_rate": 6.572131228820425e-05, "loss": 0.0107, "step": 3204 }, { "epoch": 2.291331546023235, "grad_norm": 0.013355134055018425, "learning_rate": 6.56976077612661e-05, "loss": 0.0071, "step": 3205 }, { "epoch": 2.291331546023235, "eval_loss": 0.01011651661247015, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 3205 }, { "epoch": 2.292046470062556, "grad_norm": 0.017531683668494225, "learning_rate": 6.567389931966983e-05, "loss": 0.0135, "step": 3206 }, { "epoch": 2.2927613941018765, "grad_norm": 0.015915097668766975, 
"learning_rate": 6.565018696932785e-05, "loss": 0.0109, "step": 3207 }, { "epoch": 2.2934763181411975, "grad_norm": 0.016427991911768913, "learning_rate": 6.562647071615351e-05, "loss": 0.0092, "step": 3208 }, { "epoch": 2.2941912421805184, "grad_norm": 0.017033372074365616, "learning_rate": 6.560275056606115e-05, "loss": 0.0096, "step": 3209 }, { "epoch": 2.294906166219839, "grad_norm": 0.0177847221493721, "learning_rate": 6.55790265249661e-05, "loss": 0.0147, "step": 3210 }, { "epoch": 2.294906166219839, "eval_loss": 0.010159610770642757, "eval_runtime": 4.5915, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 3210 }, { "epoch": 2.29562109025916, "grad_norm": 0.01806946098804474, "learning_rate": 6.555529859878466e-05, "loss": 0.0095, "step": 3211 }, { "epoch": 2.2963360142984808, "grad_norm": 0.021242450922727585, "learning_rate": 6.553156679343404e-05, "loss": 0.0123, "step": 3212 }, { "epoch": 2.2970509383378017, "grad_norm": 0.015453580766916275, "learning_rate": 6.550783111483248e-05, "loss": 0.011, "step": 3213 }, { "epoch": 2.2977658623771227, "grad_norm": 0.02047438733279705, "learning_rate": 6.548409156889919e-05, "loss": 0.0142, "step": 3214 }, { "epoch": 2.298480786416443, "grad_norm": 0.016424845904111862, "learning_rate": 6.546034816155429e-05, "loss": 0.0185, "step": 3215 }, { "epoch": 2.298480786416443, "eval_loss": 0.009988977573812008, "eval_runtime": 4.5965, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 3215 }, { "epoch": 2.299195710455764, "grad_norm": 0.017491476610302925, "learning_rate": 6.54366008987189e-05, "loss": 0.0126, "step": 3216 }, { "epoch": 2.299910634495085, "grad_norm": 0.015367579646408558, "learning_rate": 6.54128497863151e-05, "loss": 0.0163, "step": 3217 }, { "epoch": 2.3006255585344055, "grad_norm": 0.018877778202295303, "learning_rate": 6.538909483026593e-05, "loss": 0.0258, "step": 3218 }, { "epoch": 2.3013404825737265, "grad_norm": 0.018683848902583122, "learning_rate": 6.536533603649536e-05, "loss": 0.0153, "step": 3219 }, { "epoch": 2.3020554066130474, "grad_norm": 0.018975157290697098, "learning_rate": 6.534157341092837e-05, "loss": 0.0106, "step": 3220 }, { "epoch": 2.3020554066130474, "eval_loss": 0.009782003238797188, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3220 }, { "epoch": 2.3027703306523684, "grad_norm": 0.01677733100950718, "learning_rate": 6.531780695949086e-05, "loss": 0.0125, "step": 3221 }, { "epoch": 2.303485254691689, "grad_norm": 0.01547824777662754, "learning_rate": 6.529403668810968e-05, "loss": 0.0123, "step": 3222 }, { "epoch": 2.30420017873101, "grad_norm": 0.016377724707126617, "learning_rate": 6.527026260271265e-05, "loss": 0.0147, "step": 3223 }, { "epoch": 2.3049151027703307, "grad_norm": 0.016896866261959076, "learning_rate": 6.524648470922854e-05, "loss": 0.0127, "step": 3224 }, { "epoch": 2.3056300268096512, "grad_norm": 0.015490647405385971, "learning_rate": 6.522270301358703e-05, "loss": 0.0081, "step": 3225 }, { "epoch": 2.3056300268096512, "eval_loss": 0.00986756756901741, "eval_runtime": 4.5932, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 3225 }, { "epoch": 2.306344950848972, "grad_norm": 0.015489483252167702, "learning_rate": 6.519891752171883e-05, "loss": 0.0166, "step": 3226 }, { "epoch": 2.307059874888293, "grad_norm": 0.01885935477912426, "learning_rate": 6.517512823955554e-05, "loss": 0.0132, "step": 3227 }, { "epoch": 2.307774798927614, "grad_norm": 
0.015161351300776005, "learning_rate": 6.51513351730297e-05, "loss": 0.0143, "step": 3228 }, { "epoch": 2.308489722966935, "grad_norm": 0.018472766503691673, "learning_rate": 6.512753832807479e-05, "loss": 0.0161, "step": 3229 }, { "epoch": 2.3092046470062555, "grad_norm": 0.020608697086572647, "learning_rate": 6.510373771062527e-05, "loss": 0.0139, "step": 3230 }, { "epoch": 2.3092046470062555, "eval_loss": 0.009803999215364456, "eval_runtime": 4.5886, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3230 }, { "epoch": 2.3099195710455764, "grad_norm": 0.0176352858543396, "learning_rate": 6.507993332661653e-05, "loss": 0.0151, "step": 3231 }, { "epoch": 2.3106344950848974, "grad_norm": 0.015002024360001087, "learning_rate": 6.505612518198489e-05, "loss": 0.0123, "step": 3232 }, { "epoch": 2.311349419124218, "grad_norm": 0.020060280337929726, "learning_rate": 6.503231328266757e-05, "loss": 0.0087, "step": 3233 }, { "epoch": 2.312064343163539, "grad_norm": 0.021372254937887192, "learning_rate": 6.500849763460282e-05, "loss": 0.0189, "step": 3234 }, { "epoch": 2.3127792672028598, "grad_norm": 0.017569255083799362, "learning_rate": 6.498467824372973e-05, "loss": 0.0143, "step": 3235 }, { "epoch": 2.3127792672028598, "eval_loss": 0.0099440673366189, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 3235 }, { "epoch": 2.3134941912421807, "grad_norm": 0.017939802259206772, "learning_rate": 6.49608551159884e-05, "loss": 0.0138, "step": 3236 }, { "epoch": 2.314209115281501, "grad_norm": 0.018032539635896683, "learning_rate": 6.493702825731976e-05, "loss": 0.0091, "step": 3237 }, { "epoch": 2.314924039320822, "grad_norm": 0.014781145378947258, "learning_rate": 6.49131976736658e-05, "loss": 0.0128, "step": 3238 }, { "epoch": 2.315638963360143, "grad_norm": 0.020789168775081635, "learning_rate": 6.488936337096935e-05, "loss": 0.0122, "step": 3239 }, { "epoch": 2.3163538873994636, "grad_norm": 0.014443067833781242, "learning_rate": 6.486552535517418e-05, "loss": 0.0096, "step": 3240 }, { "epoch": 2.3163538873994636, "eval_loss": 0.009844990447163582, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 3240 }, { "epoch": 2.3170688114387845, "grad_norm": 0.017310429364442825, "learning_rate": 6.4841683632225e-05, "loss": 0.01, "step": 3241 }, { "epoch": 2.3177837354781055, "grad_norm": 0.019465569406747818, "learning_rate": 6.481783820806745e-05, "loss": 0.015, "step": 3242 }, { "epoch": 2.3184986595174264, "grad_norm": 0.016461048275232315, "learning_rate": 6.479398908864809e-05, "loss": 0.0124, "step": 3243 }, { "epoch": 2.3192135835567473, "grad_norm": 0.018093977123498917, "learning_rate": 6.47701362799144e-05, "loss": 0.0158, "step": 3244 }, { "epoch": 2.319928507596068, "grad_norm": 0.019395513460040092, "learning_rate": 6.474627978781474e-05, "loss": 0.0134, "step": 3245 }, { "epoch": 2.319928507596068, "eval_loss": 0.009721918031573296, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 3245 }, { "epoch": 2.3206434316353888, "grad_norm": 0.016584139317274094, "learning_rate": 6.472241961829846e-05, "loss": 0.0163, "step": 3246 }, { "epoch": 2.3213583556747097, "grad_norm": 0.01956738717854023, "learning_rate": 6.469855577731578e-05, "loss": 0.0133, "step": 3247 }, { "epoch": 2.32207327971403, "grad_norm": 0.013377035036683083, "learning_rate": 6.467468827081786e-05, "loss": 0.0106, "step": 3248 }, { "epoch": 2.322788203753351, 
"grad_norm": 0.013291360810399055, "learning_rate": 6.465081710475674e-05, "loss": 0.0122, "step": 3249 }, { "epoch": 2.323503127792672, "grad_norm": 0.021195678040385246, "learning_rate": 6.462694228508541e-05, "loss": 0.0101, "step": 3250 }, { "epoch": 2.323503127792672, "eval_loss": 0.009581039659678936, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3250 }, { "epoch": 2.324218051831993, "grad_norm": 0.01574682630598545, "learning_rate": 6.460306381775775e-05, "loss": 0.0081, "step": 3251 }, { "epoch": 2.3249329758713135, "grad_norm": 0.018020719289779663, "learning_rate": 6.457918170872854e-05, "loss": 0.0175, "step": 3252 }, { "epoch": 2.3256478999106345, "grad_norm": 0.014965585432946682, "learning_rate": 6.455529596395353e-05, "loss": 0.0115, "step": 3253 }, { "epoch": 2.3263628239499554, "grad_norm": 0.020055389031767845, "learning_rate": 6.453140658938928e-05, "loss": 0.0123, "step": 3254 }, { "epoch": 2.327077747989276, "grad_norm": 0.01799721270799637, "learning_rate": 6.450751359099331e-05, "loss": 0.0118, "step": 3255 }, { "epoch": 2.327077747989276, "eval_loss": 0.009475861676037312, "eval_runtime": 4.5795, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 3255 }, { "epoch": 2.327792672028597, "grad_norm": 0.022897280752658844, "learning_rate": 6.448361697472408e-05, "loss": 0.0201, "step": 3256 }, { "epoch": 2.328507596067918, "grad_norm": 0.015257220715284348, "learning_rate": 6.445971674654086e-05, "loss": 0.0105, "step": 3257 }, { "epoch": 2.3292225201072387, "grad_norm": 0.021170759573578835, "learning_rate": 6.443581291240391e-05, "loss": 0.0194, "step": 3258 }, { "epoch": 2.3299374441465597, "grad_norm": 0.017286285758018494, "learning_rate": 6.441190547827434e-05, "loss": 0.0133, "step": 3259 }, { "epoch": 2.33065236818588, "grad_norm": 0.02160918340086937, "learning_rate": 6.438799445011415e-05, "loss": 0.0151, "step": 3260 }, { "epoch": 2.33065236818588, "eval_loss": 0.009372801519930363, "eval_runtime": 4.5914, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 3260 }, { "epoch": 2.331367292225201, "grad_norm": 0.018713094294071198, "learning_rate": 6.436407983388627e-05, "loss": 0.0093, "step": 3261 }, { "epoch": 2.332082216264522, "grad_norm": 0.015013180673122406, "learning_rate": 6.434016163555452e-05, "loss": 0.0134, "step": 3262 }, { "epoch": 2.3327971403038426, "grad_norm": 0.016699612140655518, "learning_rate": 6.431623986108359e-05, "loss": 0.0137, "step": 3263 }, { "epoch": 2.3335120643431635, "grad_norm": 0.017591489478945732, "learning_rate": 6.429231451643907e-05, "loss": 0.0124, "step": 3264 }, { "epoch": 2.3342269883824844, "grad_norm": 0.015094764530658722, "learning_rate": 6.426838560758746e-05, "loss": 0.0079, "step": 3265 }, { "epoch": 2.3342269883824844, "eval_loss": 0.009425508789718151, "eval_runtime": 4.6167, "eval_samples_per_second": 10.83, "eval_steps_per_second": 2.816, "step": 3265 }, { "epoch": 2.3349419124218054, "grad_norm": 0.019675321877002716, "learning_rate": 6.42444531404961e-05, "loss": 0.0214, "step": 3266 }, { "epoch": 2.335656836461126, "grad_norm": 0.012207364663481712, "learning_rate": 6.422051712113331e-05, "loss": 0.0076, "step": 3267 }, { "epoch": 2.336371760500447, "grad_norm": 0.01658012717962265, "learning_rate": 6.419657755546819e-05, "loss": 0.0144, "step": 3268 }, { "epoch": 2.3370866845397678, "grad_norm": 0.017320554703474045, "learning_rate": 6.417263444947079e-05, "loss": 0.0109, "step": 3269 }, { "epoch": 
2.3378016085790883, "grad_norm": 0.026425890624523163, "learning_rate": 6.414868780911203e-05, "loss": 0.0161, "step": 3270 }, { "epoch": 2.3378016085790883, "eval_loss": 0.009574533440172672, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3270 }, { "epoch": 2.338516532618409, "grad_norm": 0.019065361469984055, "learning_rate": 6.412473764036369e-05, "loss": 0.0089, "step": 3271 }, { "epoch": 2.33923145665773, "grad_norm": 0.01806054823100567, "learning_rate": 6.410078394919846e-05, "loss": 0.0089, "step": 3272 }, { "epoch": 2.339946380697051, "grad_norm": 0.014141797088086605, "learning_rate": 6.407682674158988e-05, "loss": 0.0089, "step": 3273 }, { "epoch": 2.3406613047363716, "grad_norm": 0.020034853368997574, "learning_rate": 6.405286602351239e-05, "loss": 0.0118, "step": 3274 }, { "epoch": 2.3413762287756925, "grad_norm": 0.019923310726881027, "learning_rate": 6.402890180094129e-05, "loss": 0.0102, "step": 3275 }, { "epoch": 2.3413762287756925, "eval_loss": 0.009545685723423958, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3275 }, { "epoch": 2.3420911528150135, "grad_norm": 0.018465327098965645, "learning_rate": 6.400493407985278e-05, "loss": 0.0131, "step": 3276 }, { "epoch": 2.3428060768543344, "grad_norm": 0.023568429052829742, "learning_rate": 6.398096286622388e-05, "loss": 0.0162, "step": 3277 }, { "epoch": 2.343521000893655, "grad_norm": 0.016402000561356544, "learning_rate": 6.395698816603253e-05, "loss": 0.0076, "step": 3278 }, { "epoch": 2.344235924932976, "grad_norm": 0.022733358666300774, "learning_rate": 6.393300998525755e-05, "loss": 0.0128, "step": 3279 }, { "epoch": 2.3449508489722968, "grad_norm": 0.0172229316085577, "learning_rate": 6.390902832987856e-05, "loss": 0.0157, "step": 3280 }, { "epoch": 2.3449508489722968, "eval_loss": 0.009529939852654934, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3280 }, { "epoch": 2.3456657730116177, "grad_norm": 0.02039439044892788, "learning_rate": 6.388504320587611e-05, "loss": 0.0181, "step": 3281 }, { "epoch": 2.346380697050938, "grad_norm": 0.018611880019307137, "learning_rate": 6.386105461923159e-05, "loss": 0.0111, "step": 3282 }, { "epoch": 2.347095621090259, "grad_norm": 0.016700999811291695, "learning_rate": 6.383706257592725e-05, "loss": 0.0096, "step": 3283 }, { "epoch": 2.34781054512958, "grad_norm": 0.02597668208181858, "learning_rate": 6.381306708194622e-05, "loss": 0.0118, "step": 3284 }, { "epoch": 2.3485254691689006, "grad_norm": 0.013592258095741272, "learning_rate": 6.378906814327246e-05, "loss": 0.0077, "step": 3285 }, { "epoch": 2.3485254691689006, "eval_loss": 0.009458527900278568, "eval_runtime": 4.5997, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 3285 }, { "epoch": 2.3492403932082215, "grad_norm": 0.013293185271322727, "learning_rate": 6.376506576589082e-05, "loss": 0.0141, "step": 3286 }, { "epoch": 2.3499553172475425, "grad_norm": 0.014458824880421162, "learning_rate": 6.374105995578701e-05, "loss": 0.0103, "step": 3287 }, { "epoch": 2.3506702412868634, "grad_norm": 0.012725003995001316, "learning_rate": 6.371705071894756e-05, "loss": 0.01, "step": 3288 }, { "epoch": 2.351385165326184, "grad_norm": 0.016511287540197372, "learning_rate": 6.369303806135989e-05, "loss": 0.0151, "step": 3289 }, { "epoch": 2.352100089365505, "grad_norm": 0.019150128588080406, "learning_rate": 6.366902198901224e-05, "loss": 0.0214, "step": 
3290 }, { "epoch": 2.352100089365505, "eval_loss": 0.009460482746362686, "eval_runtime": 4.6023, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 3290 }, { "epoch": 2.352815013404826, "grad_norm": 0.020471425727009773, "learning_rate": 6.364500250789375e-05, "loss": 0.0147, "step": 3291 }, { "epoch": 2.3535299374441467, "grad_norm": 0.015080884099006653, "learning_rate": 6.362097962399435e-05, "loss": 0.013, "step": 3292 }, { "epoch": 2.3542448614834672, "grad_norm": 0.019148577004671097, "learning_rate": 6.359695334330487e-05, "loss": 0.0115, "step": 3293 }, { "epoch": 2.354959785522788, "grad_norm": 0.022610969841480255, "learning_rate": 6.357292367181698e-05, "loss": 0.0161, "step": 3294 }, { "epoch": 2.355674709562109, "grad_norm": 0.019149817526340485, "learning_rate": 6.354889061552314e-05, "loss": 0.0152, "step": 3295 }, { "epoch": 2.355674709562109, "eval_loss": 0.00944237969815731, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 3295 }, { "epoch": 2.35638963360143, "grad_norm": 0.01764768362045288, "learning_rate": 6.352485418041673e-05, "loss": 0.0162, "step": 3296 }, { "epoch": 2.3571045576407506, "grad_norm": 0.01688946597278118, "learning_rate": 6.350081437249191e-05, "loss": 0.0114, "step": 3297 }, { "epoch": 2.3578194816800715, "grad_norm": 0.02650202438235283, "learning_rate": 6.347677119774374e-05, "loss": 0.0217, "step": 3298 }, { "epoch": 2.3585344057193924, "grad_norm": 0.014013836160302162, "learning_rate": 6.345272466216807e-05, "loss": 0.008, "step": 3299 }, { "epoch": 2.359249329758713, "grad_norm": 0.016815293580293655, "learning_rate": 6.342867477176164e-05, "loss": 0.0093, "step": 3300 }, { "epoch": 2.359249329758713, "eval_loss": 0.009330006316304207, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 3300 }, { "epoch": 2.359964253798034, "grad_norm": 0.018917132169008255, "learning_rate": 6.340462153252195e-05, "loss": 0.0144, "step": 3301 }, { "epoch": 2.360679177837355, "grad_norm": 0.016243966296315193, "learning_rate": 6.338056495044739e-05, "loss": 0.0115, "step": 3302 }, { "epoch": 2.3613941018766758, "grad_norm": 0.019128812476992607, "learning_rate": 6.33565050315372e-05, "loss": 0.0109, "step": 3303 }, { "epoch": 2.3621090259159963, "grad_norm": 0.02014065906405449, "learning_rate": 6.33324417817914e-05, "loss": 0.014, "step": 3304 }, { "epoch": 2.362823949955317, "grad_norm": 0.018699241802096367, "learning_rate": 6.330837520721088e-05, "loss": 0.0149, "step": 3305 }, { "epoch": 2.362823949955317, "eval_loss": 0.009231657721102238, "eval_runtime": 4.6026, "eval_samples_per_second": 10.863, "eval_steps_per_second": 2.824, "step": 3305 }, { "epoch": 2.363538873994638, "grad_norm": 0.020442673936486244, "learning_rate": 6.328430531379733e-05, "loss": 0.0156, "step": 3306 }, { "epoch": 2.364253798033959, "grad_norm": 0.01674860157072544, "learning_rate": 6.32602321075533e-05, "loss": 0.0146, "step": 3307 }, { "epoch": 2.3649687220732796, "grad_norm": 0.016650347039103508, "learning_rate": 6.323615559448213e-05, "loss": 0.0109, "step": 3308 }, { "epoch": 2.3656836461126005, "grad_norm": 0.01630430854856968, "learning_rate": 6.321207578058803e-05, "loss": 0.0093, "step": 3309 }, { "epoch": 2.3663985701519215, "grad_norm": 0.021382467821240425, "learning_rate": 6.318799267187596e-05, "loss": 0.0139, "step": 3310 }, { "epoch": 2.3663985701519215, "eval_loss": 0.009415339678525925, "eval_runtime": 4.5858, "eval_samples_per_second": 
10.903, "eval_steps_per_second": 2.835, "step": 3310 }, { "epoch": 2.3671134941912424, "grad_norm": 0.01827232912182808, "learning_rate": 6.31639062743518e-05, "loss": 0.0135, "step": 3311 }, { "epoch": 2.367828418230563, "grad_norm": 0.01573651283979416, "learning_rate": 6.313981659402219e-05, "loss": 0.0129, "step": 3312 }, { "epoch": 2.368543342269884, "grad_norm": 0.01800157129764557, "learning_rate": 6.311572363689457e-05, "loss": 0.0092, "step": 3313 }, { "epoch": 2.3692582663092048, "grad_norm": 0.01851542294025421, "learning_rate": 6.309162740897721e-05, "loss": 0.0116, "step": 3314 }, { "epoch": 2.3699731903485253, "grad_norm": 0.01767602376639843, "learning_rate": 6.306752791627928e-05, "loss": 0.0121, "step": 3315 }, { "epoch": 2.3699731903485253, "eval_loss": 0.009445647709071636, "eval_runtime": 4.6154, "eval_samples_per_second": 10.833, "eval_steps_per_second": 2.817, "step": 3315 }, { "epoch": 2.370688114387846, "grad_norm": 0.016512447968125343, "learning_rate": 6.304342516481064e-05, "loss": 0.0125, "step": 3316 }, { "epoch": 2.371403038427167, "grad_norm": 0.019033141434192657, "learning_rate": 6.301931916058201e-05, "loss": 0.0203, "step": 3317 }, { "epoch": 2.372117962466488, "grad_norm": 0.015537947416305542, "learning_rate": 6.299520990960497e-05, "loss": 0.0119, "step": 3318 }, { "epoch": 2.3728328865058086, "grad_norm": 0.015785856172442436, "learning_rate": 6.297109741789183e-05, "loss": 0.0098, "step": 3319 }, { "epoch": 2.3735478105451295, "grad_norm": 0.016148267313838005, "learning_rate": 6.294698169145578e-05, "loss": 0.0159, "step": 3320 }, { "epoch": 2.3735478105451295, "eval_loss": 0.009365484118461609, "eval_runtime": 4.603, "eval_samples_per_second": 10.863, "eval_steps_per_second": 2.824, "step": 3320 }, { "epoch": 2.3742627345844505, "grad_norm": 0.019807908684015274, "learning_rate": 6.292286273631074e-05, "loss": 0.0118, "step": 3321 }, { "epoch": 2.3749776586237714, "grad_norm": 0.024418503046035767, "learning_rate": 6.289874055847151e-05, "loss": 0.0127, "step": 3322 }, { "epoch": 2.375692582663092, "grad_norm": 0.014673243276774883, "learning_rate": 6.287461516395365e-05, "loss": 0.0128, "step": 3323 }, { "epoch": 2.376407506702413, "grad_norm": 0.018226513639092445, "learning_rate": 6.285048655877355e-05, "loss": 0.0121, "step": 3324 }, { "epoch": 2.377122430741734, "grad_norm": 0.017108533531427383, "learning_rate": 6.282635474894835e-05, "loss": 0.0148, "step": 3325 }, { "epoch": 2.377122430741734, "eval_loss": 0.009401784278452396, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 3325 }, { "epoch": 2.3778373547810547, "grad_norm": 0.013429906219244003, "learning_rate": 6.280221974049606e-05, "loss": 0.0076, "step": 3326 }, { "epoch": 2.3785522788203752, "grad_norm": 0.017960578203201294, "learning_rate": 6.277808153943543e-05, "loss": 0.0148, "step": 3327 }, { "epoch": 2.379267202859696, "grad_norm": 0.015902835875749588, "learning_rate": 6.275394015178603e-05, "loss": 0.011, "step": 3328 }, { "epoch": 2.379982126899017, "grad_norm": 0.016354067251086235, "learning_rate": 6.272979558356821e-05, "loss": 0.009, "step": 3329 }, { "epoch": 2.3806970509383376, "grad_norm": 0.015547605231404305, "learning_rate": 6.270564784080314e-05, "loss": 0.0074, "step": 3330 }, { "epoch": 2.3806970509383376, "eval_loss": 0.009378605522215366, "eval_runtime": 4.5916, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 3330 }, { "epoch": 2.3814119749776586, "grad_norm": 0.015657292678952217, 
"learning_rate": 6.268149692951278e-05, "loss": 0.0135, "step": 3331 }, { "epoch": 2.3821268990169795, "grad_norm": 0.015607520937919617, "learning_rate": 6.265734285571984e-05, "loss": 0.0091, "step": 3332 }, { "epoch": 2.3828418230563004, "grad_norm": 0.02130364067852497, "learning_rate": 6.263318562544787e-05, "loss": 0.013, "step": 3333 }, { "epoch": 2.383556747095621, "grad_norm": 0.01870208978652954, "learning_rate": 6.260902524472116e-05, "loss": 0.0074, "step": 3334 }, { "epoch": 2.384271671134942, "grad_norm": 0.015118790790438652, "learning_rate": 6.258486171956483e-05, "loss": 0.0099, "step": 3335 }, { "epoch": 2.384271671134942, "eval_loss": 0.009334078058600426, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 3335 }, { "epoch": 2.384986595174263, "grad_norm": 0.013515829108655453, "learning_rate": 6.256069505600473e-05, "loss": 0.0103, "step": 3336 }, { "epoch": 2.3857015192135833, "grad_norm": 0.017385706305503845, "learning_rate": 6.253652526006756e-05, "loss": 0.0166, "step": 3337 }, { "epoch": 2.3864164432529043, "grad_norm": 0.019069842994213104, "learning_rate": 6.251235233778075e-05, "loss": 0.0139, "step": 3338 }, { "epoch": 2.387131367292225, "grad_norm": 0.0182189978659153, "learning_rate": 6.248817629517253e-05, "loss": 0.0096, "step": 3339 }, { "epoch": 2.387846291331546, "grad_norm": 0.01977207511663437, "learning_rate": 6.24639971382719e-05, "loss": 0.0167, "step": 3340 }, { "epoch": 2.387846291331546, "eval_loss": 0.009425446391105652, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 3340 }, { "epoch": 2.388561215370867, "grad_norm": 0.019351039081811905, "learning_rate": 6.243981487310864e-05, "loss": 0.0146, "step": 3341 }, { "epoch": 2.3892761394101876, "grad_norm": 0.022850804030895233, "learning_rate": 6.241562950571331e-05, "loss": 0.0097, "step": 3342 }, { "epoch": 2.3899910634495085, "grad_norm": 0.016643507406115532, "learning_rate": 6.239144104211724e-05, "loss": 0.0146, "step": 3343 }, { "epoch": 2.3907059874888295, "grad_norm": 0.015775946900248528, "learning_rate": 6.236724948835252e-05, "loss": 0.0093, "step": 3344 }, { "epoch": 2.39142091152815, "grad_norm": 0.01731893979012966, "learning_rate": 6.234305485045206e-05, "loss": 0.0148, "step": 3345 }, { "epoch": 2.39142091152815, "eval_loss": 0.00944032147526741, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3345 }, { "epoch": 2.392135835567471, "grad_norm": 0.016395872458815575, "learning_rate": 6.231885713444943e-05, "loss": 0.0131, "step": 3346 }, { "epoch": 2.392850759606792, "grad_norm": 0.017628459259867668, "learning_rate": 6.229465634637913e-05, "loss": 0.0115, "step": 3347 }, { "epoch": 2.3935656836461128, "grad_norm": 0.01553442794829607, "learning_rate": 6.227045249227627e-05, "loss": 0.0146, "step": 3348 }, { "epoch": 2.3942806076854333, "grad_norm": 0.017038587480783463, "learning_rate": 6.22462455781768e-05, "loss": 0.0164, "step": 3349 }, { "epoch": 2.394995531724754, "grad_norm": 0.015518592670559883, "learning_rate": 6.222203561011742e-05, "loss": 0.0113, "step": 3350 }, { "epoch": 2.394995531724754, "eval_loss": 0.009708782657980919, "eval_runtime": 4.5792, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 3350 }, { "epoch": 2.395710455764075, "grad_norm": 0.015092727728188038, "learning_rate": 6.219782259413562e-05, "loss": 0.0097, "step": 3351 }, { "epoch": 2.3964253798033956, "grad_norm": 
0.02212216891348362, "learning_rate": 6.21736065362696e-05, "loss": 0.0097, "step": 3352 }, { "epoch": 2.3971403038427166, "grad_norm": 0.01420438103377819, "learning_rate": 6.214938744255837e-05, "loss": 0.0081, "step": 3353 }, { "epoch": 2.3978552278820375, "grad_norm": 0.017264986410737038, "learning_rate": 6.212516531904164e-05, "loss": 0.0081, "step": 3354 }, { "epoch": 2.3985701519213585, "grad_norm": 0.027851592749357224, "learning_rate": 6.210094017175991e-05, "loss": 0.014, "step": 3355 }, { "epoch": 2.3985701519213585, "eval_loss": 0.009890664368867874, "eval_runtime": 4.6385, "eval_samples_per_second": 10.779, "eval_steps_per_second": 2.803, "step": 3355 }, { "epoch": 2.3992850759606794, "grad_norm": 0.017885785549879074, "learning_rate": 6.207671200675445e-05, "loss": 0.0126, "step": 3356 }, { "epoch": 2.4, "grad_norm": 0.01871647872030735, "learning_rate": 6.205248083006724e-05, "loss": 0.0128, "step": 3357 }, { "epoch": 2.400714924039321, "grad_norm": 0.014735453762114048, "learning_rate": 6.202824664774106e-05, "loss": 0.0104, "step": 3358 }, { "epoch": 2.401429848078642, "grad_norm": 0.011554025113582611, "learning_rate": 6.200400946581939e-05, "loss": 0.0067, "step": 3359 }, { "epoch": 2.4021447721179623, "grad_norm": 0.01840071938931942, "learning_rate": 6.197976929034648e-05, "loss": 0.0142, "step": 3360 }, { "epoch": 2.4021447721179623, "eval_loss": 0.009929602034389973, "eval_runtime": 4.5909, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 3360 }, { "epoch": 2.4028596961572832, "grad_norm": 0.016608715057373047, "learning_rate": 6.195552612736735e-05, "loss": 0.0118, "step": 3361 }, { "epoch": 2.403574620196604, "grad_norm": 0.015038000419735909, "learning_rate": 6.19312799829277e-05, "loss": 0.0133, "step": 3362 }, { "epoch": 2.404289544235925, "grad_norm": 0.015409574843943119, "learning_rate": 6.190703086307406e-05, "loss": 0.0132, "step": 3363 }, { "epoch": 2.4050044682752456, "grad_norm": 0.014659140259027481, "learning_rate": 6.188277877385365e-05, "loss": 0.0171, "step": 3364 }, { "epoch": 2.4057193923145666, "grad_norm": 0.01504739560186863, "learning_rate": 6.185852372131442e-05, "loss": 0.0123, "step": 3365 }, { "epoch": 2.4057193923145666, "eval_loss": 0.010063400492072105, "eval_runtime": 4.5986, "eval_samples_per_second": 10.873, "eval_steps_per_second": 2.827, "step": 3365 }, { "epoch": 2.4064343163538875, "grad_norm": 0.021255960687994957, "learning_rate": 6.183426571150508e-05, "loss": 0.0225, "step": 3366 }, { "epoch": 2.407149240393208, "grad_norm": 0.017804032191634178, "learning_rate": 6.18100047504751e-05, "loss": 0.0101, "step": 3367 }, { "epoch": 2.407864164432529, "grad_norm": 0.018053723499178886, "learning_rate": 6.178574084427463e-05, "loss": 0.0115, "step": 3368 }, { "epoch": 2.40857908847185, "grad_norm": 0.013102248311042786, "learning_rate": 6.176147399895461e-05, "loss": 0.0099, "step": 3369 }, { "epoch": 2.409294012511171, "grad_norm": 0.013774948194622993, "learning_rate": 6.173720422056665e-05, "loss": 0.0119, "step": 3370 }, { "epoch": 2.409294012511171, "eval_loss": 0.009863738901913166, "eval_runtime": 4.5958, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 3370 }, { "epoch": 2.4100089365504918, "grad_norm": 0.016010990366339684, "learning_rate": 6.171293151516319e-05, "loss": 0.0079, "step": 3371 }, { "epoch": 2.4107238605898123, "grad_norm": 0.018678369000554085, "learning_rate": 6.16886558887973e-05, "loss": 0.0117, "step": 3372 }, { "epoch": 2.411438784629133, "grad_norm": 
0.01710657589137554, "learning_rate": 6.166437734752281e-05, "loss": 0.0122, "step": 3373 }, { "epoch": 2.412153708668454, "grad_norm": 0.02291407622396946, "learning_rate": 6.16400958973943e-05, "loss": 0.0127, "step": 3374 }, { "epoch": 2.4128686327077746, "grad_norm": 0.019036073237657547, "learning_rate": 6.161581154446708e-05, "loss": 0.0116, "step": 3375 }, { "epoch": 2.4128686327077746, "eval_loss": 0.009592990390956402, "eval_runtime": 4.6043, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 3375 }, { "epoch": 2.4135835567470956, "grad_norm": 0.019555917009711266, "learning_rate": 6.159152429479714e-05, "loss": 0.0153, "step": 3376 }, { "epoch": 2.4142984807864165, "grad_norm": 0.016339672729372978, "learning_rate": 6.156723415444123e-05, "loss": 0.0098, "step": 3377 }, { "epoch": 2.4150134048257375, "grad_norm": 0.016233092173933983, "learning_rate": 6.154294112945677e-05, "loss": 0.0154, "step": 3378 }, { "epoch": 2.415728328865058, "grad_norm": 0.015106016770005226, "learning_rate": 6.1518645225902e-05, "loss": 0.0081, "step": 3379 }, { "epoch": 2.416443252904379, "grad_norm": 0.022043349221348763, "learning_rate": 6.149434644983576e-05, "loss": 0.0089, "step": 3380 }, { "epoch": 2.416443252904379, "eval_loss": 0.009514921344816685, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 3380 }, { "epoch": 2.4171581769437, "grad_norm": 0.02364426478743553, "learning_rate": 6.147004480731769e-05, "loss": 0.0169, "step": 3381 }, { "epoch": 2.4178731009830203, "grad_norm": 0.018236972391605377, "learning_rate": 6.14457403044081e-05, "loss": 0.0109, "step": 3382 }, { "epoch": 2.4185880250223413, "grad_norm": 0.012169533409178257, "learning_rate": 6.142143294716806e-05, "loss": 0.0109, "step": 3383 }, { "epoch": 2.419302949061662, "grad_norm": 0.01875213347375393, "learning_rate": 6.139712274165928e-05, "loss": 0.0145, "step": 3384 }, { "epoch": 2.420017873100983, "grad_norm": 0.02597617171704769, "learning_rate": 6.137280969394425e-05, "loss": 0.0179, "step": 3385 }, { "epoch": 2.420017873100983, "eval_loss": 0.009544355794787407, "eval_runtime": 4.6184, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.815, "step": 3385 }, { "epoch": 2.4207327971403036, "grad_norm": 0.021262547001242638, "learning_rate": 6.134849381008613e-05, "loss": 0.0206, "step": 3386 }, { "epoch": 2.4214477211796246, "grad_norm": 0.020313594490289688, "learning_rate": 6.13241750961488e-05, "loss": 0.0129, "step": 3387 }, { "epoch": 2.4221626452189455, "grad_norm": 0.01497766375541687, "learning_rate": 6.129985355819684e-05, "loss": 0.0072, "step": 3388 }, { "epoch": 2.4228775692582665, "grad_norm": 0.01889696717262268, "learning_rate": 6.127552920229555e-05, "loss": 0.0133, "step": 3389 }, { "epoch": 2.423592493297587, "grad_norm": 0.020335815846920013, "learning_rate": 6.12512020345109e-05, "loss": 0.0131, "step": 3390 }, { "epoch": 2.423592493297587, "eval_loss": 0.009651973843574524, "eval_runtime": 4.6222, "eval_samples_per_second": 10.817, "eval_steps_per_second": 2.812, "step": 3390 }, { "epoch": 2.424307417336908, "grad_norm": 0.019402077421545982, "learning_rate": 6.12268720609096e-05, "loss": 0.0151, "step": 3391 }, { "epoch": 2.425022341376229, "grad_norm": 0.015210150741040707, "learning_rate": 6.120253928755904e-05, "loss": 0.0104, "step": 3392 }, { "epoch": 2.42573726541555, "grad_norm": 0.015726817771792412, "learning_rate": 6.117820372052728e-05, "loss": 0.0109, "step": 3393 }, { "epoch": 2.4264521894548703, 
"grad_norm": 0.018173905089497566, "learning_rate": 6.115386536588315e-05, "loss": 0.0092, "step": 3394 }, { "epoch": 2.4271671134941912, "grad_norm": 0.02479671686887741, "learning_rate": 6.11295242296961e-05, "loss": 0.0134, "step": 3395 }, { "epoch": 2.4271671134941912, "eval_loss": 0.00976383313536644, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3395 }, { "epoch": 2.427882037533512, "grad_norm": 0.017326192930340767, "learning_rate": 6.11051803180363e-05, "loss": 0.0101, "step": 3396 }, { "epoch": 2.4285969615728327, "grad_norm": 0.017149219289422035, "learning_rate": 6.108083363697465e-05, "loss": 0.0152, "step": 3397 }, { "epoch": 2.4293118856121536, "grad_norm": 0.018986837938427925, "learning_rate": 6.105648419258271e-05, "loss": 0.0128, "step": 3398 }, { "epoch": 2.4300268096514746, "grad_norm": 0.018708785995841026, "learning_rate": 6.103213199093267e-05, "loss": 0.0106, "step": 3399 }, { "epoch": 2.4307417336907955, "grad_norm": 0.015742896124720573, "learning_rate": 6.100777703809752e-05, "loss": 0.012, "step": 3400 }, { "epoch": 2.4307417336907955, "eval_loss": 0.00970021728426218, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3400 }, { "epoch": 2.431456657730116, "grad_norm": 0.01190438773483038, "learning_rate": 6.098341934015087e-05, "loss": 0.0101, "step": 3401 }, { "epoch": 2.432171581769437, "grad_norm": 0.013551713898777962, "learning_rate": 6.095905890316701e-05, "loss": 0.008, "step": 3402 }, { "epoch": 2.432886505808758, "grad_norm": 0.013467702083289623, "learning_rate": 6.0934695733220936e-05, "loss": 0.0098, "step": 3403 }, { "epoch": 2.433601429848079, "grad_norm": 0.012235189788043499, "learning_rate": 6.091032983638832e-05, "loss": 0.0079, "step": 3404 }, { "epoch": 2.4343163538873993, "grad_norm": 0.018229028210043907, "learning_rate": 6.088596121874551e-05, "loss": 0.012, "step": 3405 }, { "epoch": 2.4343163538873993, "eval_loss": 0.009772331453859806, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 3405 }, { "epoch": 2.4350312779267203, "grad_norm": 0.014894582331180573, "learning_rate": 6.086158988636953e-05, "loss": 0.0084, "step": 3406 }, { "epoch": 2.435746201966041, "grad_norm": 0.018844403326511383, "learning_rate": 6.083721584533809e-05, "loss": 0.0137, "step": 3407 }, { "epoch": 2.436461126005362, "grad_norm": 0.019582679495215416, "learning_rate": 6.081283910172956e-05, "loss": 0.0173, "step": 3408 }, { "epoch": 2.4371760500446826, "grad_norm": 0.018382998183369637, "learning_rate": 6.078845966162302e-05, "loss": 0.012, "step": 3409 }, { "epoch": 2.4378909740840036, "grad_norm": 0.01657530851662159, "learning_rate": 6.076407753109818e-05, "loss": 0.0092, "step": 3410 }, { "epoch": 2.4378909740840036, "eval_loss": 0.009701786562800407, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 3410 }, { "epoch": 2.4386058981233245, "grad_norm": 0.01972203515470028, "learning_rate": 6.073969271623543e-05, "loss": 0.013, "step": 3411 }, { "epoch": 2.439320822162645, "grad_norm": 0.020947791635990143, "learning_rate": 6.0715305223115856e-05, "loss": 0.0134, "step": 3412 }, { "epoch": 2.440035746201966, "grad_norm": 0.016873184591531754, "learning_rate": 6.0690915057821186e-05, "loss": 0.012, "step": 3413 }, { "epoch": 2.440750670241287, "grad_norm": 0.01756645366549492, "learning_rate": 6.066652222643381e-05, "loss": 0.0106, "step": 3414 }, { "epoch": 
2.441465594280608, "grad_norm": 0.02129620686173439, "learning_rate": 6.064212673503682e-05, "loss": 0.0116, "step": 3415 }, { "epoch": 2.441465594280608, "eval_loss": 0.00976303219795227, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3415 }, { "epoch": 2.4421805183199283, "grad_norm": 0.021420640870928764, "learning_rate": 6.061772858971392e-05, "loss": 0.0143, "step": 3416 }, { "epoch": 2.4428954423592493, "grad_norm": 0.016686882823705673, "learning_rate": 6.059332779654953e-05, "loss": 0.0145, "step": 3417 }, { "epoch": 2.44361036639857, "grad_norm": 0.019699474796652794, "learning_rate": 6.056892436162866e-05, "loss": 0.0146, "step": 3418 }, { "epoch": 2.444325290437891, "grad_norm": 0.022529814392328262, "learning_rate": 6.054451829103706e-05, "loss": 0.0127, "step": 3419 }, { "epoch": 2.4450402144772116, "grad_norm": 0.019233040511608124, "learning_rate": 6.052010959086109e-05, "loss": 0.017, "step": 3420 }, { "epoch": 2.4450402144772116, "eval_loss": 0.00963368359953165, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3420 }, { "epoch": 2.4457551385165326, "grad_norm": 0.01971440389752388, "learning_rate": 6.049569826718776e-05, "loss": 0.0113, "step": 3421 }, { "epoch": 2.4464700625558535, "grad_norm": 0.01614081673324108, "learning_rate": 6.047128432610476e-05, "loss": 0.0116, "step": 3422 }, { "epoch": 2.4471849865951745, "grad_norm": 0.014616294763982296, "learning_rate": 6.044686777370042e-05, "loss": 0.0104, "step": 3423 }, { "epoch": 2.447899910634495, "grad_norm": 0.019068460911512375, "learning_rate": 6.042244861606372e-05, "loss": 0.0204, "step": 3424 }, { "epoch": 2.448614834673816, "grad_norm": 0.016893452033400536, "learning_rate": 6.0398026859284294e-05, "loss": 0.0105, "step": 3425 }, { "epoch": 2.448614834673816, "eval_loss": 0.009648531675338745, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 3425 }, { "epoch": 2.449329758713137, "grad_norm": 0.024889735504984856, "learning_rate": 6.037360250945242e-05, "loss": 0.0193, "step": 3426 }, { "epoch": 2.4500446827524573, "grad_norm": 0.019148793071508408, "learning_rate": 6.034917557265902e-05, "loss": 0.0084, "step": 3427 }, { "epoch": 2.4507596067917783, "grad_norm": 0.016787320375442505, "learning_rate": 6.032474605499568e-05, "loss": 0.0164, "step": 3428 }, { "epoch": 2.4514745308310992, "grad_norm": 0.015665946528315544, "learning_rate": 6.030031396255462e-05, "loss": 0.0091, "step": 3429 }, { "epoch": 2.45218945487042, "grad_norm": 0.020273562520742416, "learning_rate": 6.027587930142866e-05, "loss": 0.0155, "step": 3430 }, { "epoch": 2.45218945487042, "eval_loss": 0.009786204434931278, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 3430 }, { "epoch": 2.4529043789097407, "grad_norm": 0.015022347681224346, "learning_rate": 6.0251442077711314e-05, "loss": 0.0134, "step": 3431 }, { "epoch": 2.4536193029490616, "grad_norm": 0.016127416864037514, "learning_rate": 6.022700229749676e-05, "loss": 0.012, "step": 3432 }, { "epoch": 2.4543342269883826, "grad_norm": 0.021690774708986282, "learning_rate": 6.020255996687973e-05, "loss": 0.0174, "step": 3433 }, { "epoch": 2.4550491510277035, "grad_norm": 0.013413789682090282, "learning_rate": 6.0178115091955645e-05, "loss": 0.0104, "step": 3434 }, { "epoch": 2.455764075067024, "grad_norm": 0.014026465825736523, "learning_rate": 6.0153667678820535e-05, "loss": 0.0126, "step": 
3435 }, { "epoch": 2.455764075067024, "eval_loss": 0.009933208115398884, "eval_runtime": 4.6031, "eval_samples_per_second": 10.862, "eval_steps_per_second": 2.824, "step": 3435 }, { "epoch": 2.456478999106345, "grad_norm": 0.01901358552277088, "learning_rate": 6.012921773357112e-05, "loss": 0.013, "step": 3436 }, { "epoch": 2.457193923145666, "grad_norm": 0.02045724354684353, "learning_rate": 6.0104765262304676e-05, "loss": 0.0117, "step": 3437 }, { "epoch": 2.457908847184987, "grad_norm": 0.02159615233540535, "learning_rate": 6.0080310271119135e-05, "loss": 0.0109, "step": 3438 }, { "epoch": 2.4586237712243073, "grad_norm": 0.01697409525513649, "learning_rate": 6.0055852766113095e-05, "loss": 0.0117, "step": 3439 }, { "epoch": 2.4593386952636282, "grad_norm": 0.01605617068707943, "learning_rate": 6.003139275338573e-05, "loss": 0.0097, "step": 3440 }, { "epoch": 2.4593386952636282, "eval_loss": 0.010002126917243004, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3440 }, { "epoch": 2.460053619302949, "grad_norm": 0.017724348232150078, "learning_rate": 6.0006930239036864e-05, "loss": 0.0137, "step": 3441 }, { "epoch": 2.4607685433422697, "grad_norm": 0.019545704126358032, "learning_rate": 5.9982465229166954e-05, "loss": 0.0137, "step": 3442 }, { "epoch": 2.4614834673815906, "grad_norm": 0.017232725396752357, "learning_rate": 5.9957997729877045e-05, "loss": 0.0083, "step": 3443 }, { "epoch": 2.4621983914209116, "grad_norm": 0.02455362305045128, "learning_rate": 5.993352774726885e-05, "loss": 0.0173, "step": 3444 }, { "epoch": 2.4629133154602325, "grad_norm": 0.015937859192490578, "learning_rate": 5.990905528744466e-05, "loss": 0.0106, "step": 3445 }, { "epoch": 2.4629133154602325, "eval_loss": 0.010002224706113338, "eval_runtime": 4.5819, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3445 }, { "epoch": 2.463628239499553, "grad_norm": 0.018792642280459404, "learning_rate": 5.988458035650739e-05, "loss": 0.011, "step": 3446 }, { "epoch": 2.464343163538874, "grad_norm": 0.021153537556529045, "learning_rate": 5.9860102960560595e-05, "loss": 0.0157, "step": 3447 }, { "epoch": 2.465058087578195, "grad_norm": 0.019189300015568733, "learning_rate": 5.983562310570844e-05, "loss": 0.0109, "step": 3448 }, { "epoch": 2.465773011617516, "grad_norm": 0.015236149542033672, "learning_rate": 5.9811140798055676e-05, "loss": 0.0116, "step": 3449 }, { "epoch": 2.4664879356568363, "grad_norm": 0.03388509526848793, "learning_rate": 5.978665604370767e-05, "loss": 0.0156, "step": 3450 }, { "epoch": 2.4664879356568363, "eval_loss": 0.009977494366466999, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 3450 }, { "epoch": 2.4672028596961573, "grad_norm": 0.01954769901931286, "learning_rate": 5.976216884877045e-05, "loss": 0.0103, "step": 3451 }, { "epoch": 2.467917783735478, "grad_norm": 0.017812075093388557, "learning_rate": 5.9737679219350584e-05, "loss": 0.0122, "step": 3452 }, { "epoch": 2.468632707774799, "grad_norm": 0.016709204763174057, "learning_rate": 5.971318716155529e-05, "loss": 0.0077, "step": 3453 }, { "epoch": 2.4693476318141196, "grad_norm": 0.015323394909501076, "learning_rate": 5.968869268149239e-05, "loss": 0.0099, "step": 3454 }, { "epoch": 2.4700625558534406, "grad_norm": 0.020438119769096375, "learning_rate": 5.966419578527027e-05, "loss": 0.0114, "step": 3455 }, { "epoch": 2.4700625558534406, "eval_loss": 0.010041872970759869, "eval_runtime": 4.582, 
"eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3455 }, { "epoch": 2.4707774798927615, "grad_norm": 0.019027957692742348, "learning_rate": 5.9639696478997985e-05, "loss": 0.0103, "step": 3456 }, { "epoch": 2.471492403932082, "grad_norm": 0.017779743298888206, "learning_rate": 5.961519476878513e-05, "loss": 0.0117, "step": 3457 }, { "epoch": 2.472207327971403, "grad_norm": 0.019004274159669876, "learning_rate": 5.9590690660741946e-05, "loss": 0.0154, "step": 3458 }, { "epoch": 2.472922252010724, "grad_norm": 0.021636338904500008, "learning_rate": 5.956618416097921e-05, "loss": 0.0121, "step": 3459 }, { "epoch": 2.473637176050045, "grad_norm": 0.019424673169851303, "learning_rate": 5.9541675275608364e-05, "loss": 0.0111, "step": 3460 }, { "epoch": 2.473637176050045, "eval_loss": 0.010223720222711563, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 3460 }, { "epoch": 2.4743521000893653, "grad_norm": 0.018691087141633034, "learning_rate": 5.951716401074143e-05, "loss": 0.0105, "step": 3461 }, { "epoch": 2.4750670241286863, "grad_norm": 0.028487956151366234, "learning_rate": 5.949265037249097e-05, "loss": 0.0147, "step": 3462 }, { "epoch": 2.4757819481680072, "grad_norm": 0.019114570692181587, "learning_rate": 5.946813436697021e-05, "loss": 0.0153, "step": 3463 }, { "epoch": 2.4764968722073277, "grad_norm": 0.017707299441099167, "learning_rate": 5.944361600029291e-05, "loss": 0.0108, "step": 3464 }, { "epoch": 2.4772117962466487, "grad_norm": 0.020124312490224838, "learning_rate": 5.941909527857348e-05, "loss": 0.0123, "step": 3465 }, { "epoch": 2.4772117962466487, "eval_loss": 0.010037915781140327, "eval_runtime": 4.5819, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3465 }, { "epoch": 2.4779267202859696, "grad_norm": 0.016201965510845184, "learning_rate": 5.9394572207926835e-05, "loss": 0.0185, "step": 3466 }, { "epoch": 2.4786416443252905, "grad_norm": 0.014470250345766544, "learning_rate": 5.937004679446854e-05, "loss": 0.007, "step": 3467 }, { "epoch": 2.4793565683646115, "grad_norm": 0.015046031214296818, "learning_rate": 5.934551904431473e-05, "loss": 0.0166, "step": 3468 }, { "epoch": 2.480071492403932, "grad_norm": 0.01298821996897459, "learning_rate": 5.932098896358212e-05, "loss": 0.0075, "step": 3469 }, { "epoch": 2.480786416443253, "grad_norm": 0.018033862113952637, "learning_rate": 5.9296456558388e-05, "loss": 0.0097, "step": 3470 }, { "epoch": 2.480786416443253, "eval_loss": 0.010007909499108791, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 3470 }, { "epoch": 2.481501340482574, "grad_norm": 0.016725976020097733, "learning_rate": 5.927192183485023e-05, "loss": 0.0127, "step": 3471 }, { "epoch": 2.4822162645218944, "grad_norm": 0.015829216688871384, "learning_rate": 5.924738479908728e-05, "loss": 0.0147, "step": 3472 }, { "epoch": 2.4829311885612153, "grad_norm": 0.014802739024162292, "learning_rate": 5.922284545721817e-05, "loss": 0.0121, "step": 3473 }, { "epoch": 2.4836461126005362, "grad_norm": 0.014283434487879276, "learning_rate": 5.919830381536249e-05, "loss": 0.0076, "step": 3474 }, { "epoch": 2.484361036639857, "grad_norm": 0.018074046820402145, "learning_rate": 5.917375987964044e-05, "loss": 0.015, "step": 3475 }, { "epoch": 2.484361036639857, "eval_loss": 0.01002482883632183, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3475 }, { "epoch": 2.4850759606791777, 
"grad_norm": 0.017207078635692596, "learning_rate": 5.914921365617276e-05, "loss": 0.0119, "step": 3476 }, { "epoch": 2.4857908847184986, "grad_norm": 0.015229638665914536, "learning_rate": 5.9124665151080785e-05, "loss": 0.013, "step": 3477 }, { "epoch": 2.4865058087578196, "grad_norm": 0.021182935684919357, "learning_rate": 5.910011437048637e-05, "loss": 0.0114, "step": 3478 }, { "epoch": 2.48722073279714, "grad_norm": 0.019293207675218582, "learning_rate": 5.9075561320511996e-05, "loss": 0.0142, "step": 3479 }, { "epoch": 2.487935656836461, "grad_norm": 0.021062985062599182, "learning_rate": 5.905100600728067e-05, "loss": 0.0105, "step": 3480 }, { "epoch": 2.487935656836461, "eval_loss": 0.01005937997251749, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 3480 }, { "epoch": 2.488650580875782, "grad_norm": 0.0144242774695158, "learning_rate": 5.9026448436916005e-05, "loss": 0.011, "step": 3481 }, { "epoch": 2.489365504915103, "grad_norm": 0.021668747067451477, "learning_rate": 5.9001888615542125e-05, "loss": 0.0164, "step": 3482 }, { "epoch": 2.490080428954424, "grad_norm": 0.01595638133585453, "learning_rate": 5.897732654928373e-05, "loss": 0.0131, "step": 3483 }, { "epoch": 2.4907953529937443, "grad_norm": 0.020938875153660774, "learning_rate": 5.8952762244266127e-05, "loss": 0.017, "step": 3484 }, { "epoch": 2.4915102770330653, "grad_norm": 0.019689641892910004, "learning_rate": 5.892819570661511e-05, "loss": 0.0175, "step": 3485 }, { "epoch": 2.4915102770330653, "eval_loss": 0.009995713829994202, "eval_runtime": 4.6046, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 3485 }, { "epoch": 2.492225201072386, "grad_norm": 0.016851546242833138, "learning_rate": 5.890362694245709e-05, "loss": 0.011, "step": 3486 }, { "epoch": 2.4929401251117067, "grad_norm": 0.015689700841903687, "learning_rate": 5.887905595791898e-05, "loss": 0.0129, "step": 3487 }, { "epoch": 2.4936550491510276, "grad_norm": 0.021521905437111855, "learning_rate": 5.8854482759128316e-05, "loss": 0.0142, "step": 3488 }, { "epoch": 2.4943699731903486, "grad_norm": 0.01819470338523388, "learning_rate": 5.882990735221312e-05, "loss": 0.0125, "step": 3489 }, { "epoch": 2.4950848972296695, "grad_norm": 0.01597260683774948, "learning_rate": 5.8805329743301976e-05, "loss": 0.0137, "step": 3490 }, { "epoch": 2.4950848972296695, "eval_loss": 0.009986396878957748, "eval_runtime": 4.5862, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 3490 }, { "epoch": 2.49579982126899, "grad_norm": 0.01721685193479061, "learning_rate": 5.878074993852405e-05, "loss": 0.009, "step": 3491 }, { "epoch": 2.496514745308311, "grad_norm": 0.017892388626933098, "learning_rate": 5.875616794400902e-05, "loss": 0.0094, "step": 3492 }, { "epoch": 2.497229669347632, "grad_norm": 0.019711239263415337, "learning_rate": 5.8731583765887156e-05, "loss": 0.014, "step": 3493 }, { "epoch": 2.4979445933869524, "grad_norm": 0.02416810765862465, "learning_rate": 5.8706997410289225e-05, "loss": 0.019, "step": 3494 }, { "epoch": 2.4986595174262733, "grad_norm": 0.025683537125587463, "learning_rate": 5.868240888334653e-05, "loss": 0.0197, "step": 3495 }, { "epoch": 2.4986595174262733, "eval_loss": 0.010187397710978985, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 3495 }, { "epoch": 2.4993744414655943, "grad_norm": 0.021041138097643852, "learning_rate": 5.865781819119096e-05, "loss": 0.0098, "step": 3496 }, { "epoch": 
2.5000893655049152, "grad_norm": 0.01913432776927948, "learning_rate": 5.863322533995495e-05, "loss": 0.015, "step": 3497 }, { "epoch": 2.500804289544236, "grad_norm": 0.02121681720018387, "learning_rate": 5.860863033577141e-05, "loss": 0.0149, "step": 3498 }, { "epoch": 2.5015192135835567, "grad_norm": 0.023808063939213753, "learning_rate": 5.858403318477384e-05, "loss": 0.0221, "step": 3499 }, { "epoch": 2.5022341376228776, "grad_norm": 0.016874536871910095, "learning_rate": 5.855943389309626e-05, "loss": 0.0109, "step": 3500 }, { "epoch": 2.5022341376228776, "eval_loss": 0.010328605771064758, "eval_runtime": 4.5921, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 3500 }, { "epoch": 2.5029490616621985, "grad_norm": 0.019658658653497696, "learning_rate": 5.8534832466873225e-05, "loss": 0.015, "step": 3501 }, { "epoch": 2.503663985701519, "grad_norm": 0.016742104664444923, "learning_rate": 5.851022891223982e-05, "loss": 0.0096, "step": 3502 }, { "epoch": 2.50437890974084, "grad_norm": 0.012133250944316387, "learning_rate": 5.848562323533164e-05, "loss": 0.0099, "step": 3503 }, { "epoch": 2.505093833780161, "grad_norm": 0.014404000714421272, "learning_rate": 5.846101544228488e-05, "loss": 0.0093, "step": 3504 }, { "epoch": 2.505808757819482, "grad_norm": 0.016077160835266113, "learning_rate": 5.843640553923617e-05, "loss": 0.0115, "step": 3505 }, { "epoch": 2.505808757819482, "eval_loss": 0.010135157965123653, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3505 }, { "epoch": 2.5065236818588024, "grad_norm": 0.019873319193720818, "learning_rate": 5.841179353232273e-05, "loss": 0.0105, "step": 3506 }, { "epoch": 2.5072386058981233, "grad_norm": 0.017132965847849846, "learning_rate": 5.838717942768226e-05, "loss": 0.0115, "step": 3507 }, { "epoch": 2.5079535299374442, "grad_norm": 0.01699494943022728, "learning_rate": 5.8362563231453035e-05, "loss": 0.0118, "step": 3508 }, { "epoch": 2.5086684539767647, "grad_norm": 0.015039406716823578, "learning_rate": 5.8337944949773825e-05, "loss": 0.013, "step": 3509 }, { "epoch": 2.5093833780160857, "grad_norm": 0.014556331560015678, "learning_rate": 5.831332458878391e-05, "loss": 0.0112, "step": 3510 }, { "epoch": 2.5093833780160857, "eval_loss": 0.01007898524403572, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 3510 }, { "epoch": 2.5100983020554066, "grad_norm": 0.018019551411271095, "learning_rate": 5.8288702154623095e-05, "loss": 0.0104, "step": 3511 }, { "epoch": 2.5108132260947276, "grad_norm": 0.019007589668035507, "learning_rate": 5.8264077653431716e-05, "loss": 0.0132, "step": 3512 }, { "epoch": 2.5115281501340485, "grad_norm": 0.019002562388777733, "learning_rate": 5.8239451091350595e-05, "loss": 0.0122, "step": 3513 }, { "epoch": 2.512243074173369, "grad_norm": 0.017620397731661797, "learning_rate": 5.82148224745211e-05, "loss": 0.0159, "step": 3514 }, { "epoch": 2.51295799821269, "grad_norm": 0.02146136946976185, "learning_rate": 5.819019180908509e-05, "loss": 0.0124, "step": 3515 }, { "epoch": 2.51295799821269, "eval_loss": 0.010165588930249214, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3515 }, { "epoch": 2.513672922252011, "grad_norm": 0.016548877581954002, "learning_rate": 5.8165559101184955e-05, "loss": 0.0101, "step": 3516 }, { "epoch": 2.5143878462913314, "grad_norm": 0.01386423222720623, "learning_rate": 5.814092435696358e-05, "loss": 0.0092, "step": 
3517 }, { "epoch": 2.5151027703306523, "grad_norm": 0.018445773050189018, "learning_rate": 5.8116287582564334e-05, "loss": 0.0146, "step": 3518 }, { "epoch": 2.5158176943699733, "grad_norm": 0.018669217824935913, "learning_rate": 5.809164878413114e-05, "loss": 0.0138, "step": 3519 }, { "epoch": 2.516532618409294, "grad_norm": 0.019198989495635033, "learning_rate": 5.8067007967808405e-05, "loss": 0.0104, "step": 3520 }, { "epoch": 2.516532618409294, "eval_loss": 0.010393330827355385, "eval_runtime": 4.5873, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 3520 }, { "epoch": 2.5172475424486147, "grad_norm": 0.017724812030792236, "learning_rate": 5.804236513974104e-05, "loss": 0.0146, "step": 3521 }, { "epoch": 2.5179624664879356, "grad_norm": 0.01789853163063526, "learning_rate": 5.8017720306074454e-05, "loss": 0.0118, "step": 3522 }, { "epoch": 2.5186773905272566, "grad_norm": 0.01878749392926693, "learning_rate": 5.799307347295454e-05, "loss": 0.0151, "step": 3523 }, { "epoch": 2.519392314566577, "grad_norm": 0.021202344447374344, "learning_rate": 5.7968424646527744e-05, "loss": 0.0171, "step": 3524 }, { "epoch": 2.520107238605898, "grad_norm": 0.015414373949170113, "learning_rate": 5.794377383294094e-05, "loss": 0.0089, "step": 3525 }, { "epoch": 2.520107238605898, "eval_loss": 0.010301931761205196, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 3525 }, { "epoch": 2.520822162645219, "grad_norm": 0.018814286217093468, "learning_rate": 5.791912103834154e-05, "loss": 0.0113, "step": 3526 }, { "epoch": 2.52153708668454, "grad_norm": 0.02109438180923462, "learning_rate": 5.789446626887742e-05, "loss": 0.0139, "step": 3527 }, { "epoch": 2.522252010723861, "grad_norm": 0.015207774937152863, "learning_rate": 5.786980953069702e-05, "loss": 0.01, "step": 3528 }, { "epoch": 2.5229669347631813, "grad_norm": 0.018458876758813858, "learning_rate": 5.784515082994917e-05, "loss": 0.012, "step": 3529 }, { "epoch": 2.5236818588025023, "grad_norm": 0.015563370659947395, "learning_rate": 5.782049017278326e-05, "loss": 0.0105, "step": 3530 }, { "epoch": 2.5236818588025023, "eval_loss": 0.010197172872722149, "eval_runtime": 4.5813, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3530 }, { "epoch": 2.524396782841823, "grad_norm": 0.01510550081729889, "learning_rate": 5.779582756534914e-05, "loss": 0.0117, "step": 3531 }, { "epoch": 2.5251117068811437, "grad_norm": 0.019614163786172867, "learning_rate": 5.777116301379717e-05, "loss": 0.0091, "step": 3532 }, { "epoch": 2.5258266309204647, "grad_norm": 0.012254537083208561, "learning_rate": 5.7746496524278174e-05, "loss": 0.0071, "step": 3533 }, { "epoch": 2.5265415549597856, "grad_norm": 0.023883704096078873, "learning_rate": 5.772182810294344e-05, "loss": 0.017, "step": 3534 }, { "epoch": 2.5272564789991065, "grad_norm": 0.020292742177844048, "learning_rate": 5.7697157755944775e-05, "loss": 0.0108, "step": 3535 }, { "epoch": 2.5272564789991065, "eval_loss": 0.009960038587450981, "eval_runtime": 4.5898, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 3535 }, { "epoch": 2.527971403038427, "grad_norm": 0.0187588669359684, "learning_rate": 5.767248548943446e-05, "loss": 0.0156, "step": 3536 }, { "epoch": 2.528686327077748, "grad_norm": 0.023695221170783043, "learning_rate": 5.764781130956525e-05, "loss": 0.0122, "step": 3537 }, { "epoch": 2.529401251117069, "grad_norm": 0.023351481184363365, "learning_rate": 5.762313522249036e-05, "loss": 
0.0103, "step": 3538 }, { "epoch": 2.5301161751563894, "grad_norm": 0.018328336998820305, "learning_rate": 5.7598457234363476e-05, "loss": 0.0093, "step": 3539 }, { "epoch": 2.5308310991957104, "grad_norm": 0.02068609744310379, "learning_rate": 5.757377735133882e-05, "loss": 0.0143, "step": 3540 }, { "epoch": 2.5308310991957104, "eval_loss": 0.00970041286200285, "eval_runtime": 4.621, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.813, "step": 3540 }, { "epoch": 2.5315460232350313, "grad_norm": 0.017650270834565163, "learning_rate": 5.754909557957103e-05, "loss": 0.0132, "step": 3541 }, { "epoch": 2.5322609472743522, "grad_norm": 0.013323877938091755, "learning_rate": 5.7524411925215224e-05, "loss": 0.0095, "step": 3542 }, { "epoch": 2.532975871313673, "grad_norm": 0.01318918727338314, "learning_rate": 5.7499726394426976e-05, "loss": 0.0107, "step": 3543 }, { "epoch": 2.5336907953529937, "grad_norm": 0.018731018528342247, "learning_rate": 5.747503899336237e-05, "loss": 0.0171, "step": 3544 }, { "epoch": 2.5344057193923146, "grad_norm": 0.016538523137569427, "learning_rate": 5.745034972817794e-05, "loss": 0.0142, "step": 3545 }, { "epoch": 2.5344057193923146, "eval_loss": 0.00967005081474781, "eval_runtime": 4.5849, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 3545 }, { "epoch": 2.535120643431635, "grad_norm": 0.018686026334762573, "learning_rate": 5.7425658605030654e-05, "loss": 0.0103, "step": 3546 }, { "epoch": 2.535835567470956, "grad_norm": 0.019754508510231972, "learning_rate": 5.7400965630077966e-05, "loss": 0.0146, "step": 3547 }, { "epoch": 2.536550491510277, "grad_norm": 0.015965700149536133, "learning_rate": 5.73762708094778e-05, "loss": 0.0091, "step": 3548 }, { "epoch": 2.537265415549598, "grad_norm": 0.015056990087032318, "learning_rate": 5.7351574149388555e-05, "loss": 0.0078, "step": 3549 }, { "epoch": 2.537980339588919, "grad_norm": 0.016744662076234818, "learning_rate": 5.732687565596904e-05, "loss": 0.0105, "step": 3550 }, { "epoch": 2.537980339588919, "eval_loss": 0.009834264405071735, "eval_runtime": 4.6107, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.82, "step": 3550 }, { "epoch": 2.5386952636282394, "grad_norm": 0.016611026600003242, "learning_rate": 5.730217533537853e-05, "loss": 0.0129, "step": 3551 }, { "epoch": 2.5394101876675603, "grad_norm": 0.01803259737789631, "learning_rate": 5.72774731937768e-05, "loss": 0.0083, "step": 3552 }, { "epoch": 2.5401251117068813, "grad_norm": 0.021304497495293617, "learning_rate": 5.725276923732405e-05, "loss": 0.0213, "step": 3553 }, { "epoch": 2.5408400357462018, "grad_norm": 0.019326023757457733, "learning_rate": 5.722806347218095e-05, "loss": 0.0136, "step": 3554 }, { "epoch": 2.5415549597855227, "grad_norm": 0.011779394000768661, "learning_rate": 5.720335590450857e-05, "loss": 0.0062, "step": 3555 }, { "epoch": 2.5415549597855227, "eval_loss": 0.009845789521932602, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 3555 }, { "epoch": 2.5422698838248436, "grad_norm": 0.0190576259046793, "learning_rate": 5.7178646540468495e-05, "loss": 0.0154, "step": 3556 }, { "epoch": 2.5429848078641646, "grad_norm": 0.01564040221273899, "learning_rate": 5.7153935386222715e-05, "loss": 0.0085, "step": 3557 }, { "epoch": 2.5436997319034855, "grad_norm": 0.018993815407156944, "learning_rate": 5.7129222447933684e-05, "loss": 0.0141, "step": 3558 }, { "epoch": 2.544414655942806, "grad_norm": 0.02334866300225258, "learning_rate": 
5.710450773176428e-05, "loss": 0.0165, "step": 3559 }, { "epoch": 2.545129579982127, "grad_norm": 0.014683226123452187, "learning_rate": 5.707979124387788e-05, "loss": 0.0108, "step": 3560 }, { "epoch": 2.545129579982127, "eval_loss": 0.00991287361830473, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3560 }, { "epoch": 2.5458445040214475, "grad_norm": 0.01732531189918518, "learning_rate": 5.705507299043822e-05, "loss": 0.0161, "step": 3561 }, { "epoch": 2.5465594280607684, "grad_norm": 0.01732584834098816, "learning_rate": 5.703035297760956e-05, "loss": 0.0138, "step": 3562 }, { "epoch": 2.5472743521000893, "grad_norm": 0.01803402602672577, "learning_rate": 5.700563121155651e-05, "loss": 0.0121, "step": 3563 }, { "epoch": 2.5479892761394103, "grad_norm": 0.01961555890738964, "learning_rate": 5.6980907698444206e-05, "loss": 0.011, "step": 3564 }, { "epoch": 2.5487042001787312, "grad_norm": 0.01698184758424759, "learning_rate": 5.695618244443818e-05, "loss": 0.0084, "step": 3565 }, { "epoch": 2.5487042001787312, "eval_loss": 0.009838726371526718, "eval_runtime": 4.6139, "eval_samples_per_second": 10.837, "eval_steps_per_second": 2.818, "step": 3565 }, { "epoch": 2.5494191242180517, "grad_norm": 0.017451032996177673, "learning_rate": 5.693145545570439e-05, "loss": 0.012, "step": 3566 }, { "epoch": 2.5501340482573727, "grad_norm": 0.020952828228473663, "learning_rate": 5.690672673840921e-05, "loss": 0.0138, "step": 3567 }, { "epoch": 2.5508489722966936, "grad_norm": 0.019665561616420746, "learning_rate": 5.688199629871952e-05, "loss": 0.0095, "step": 3568 }, { "epoch": 2.551563896336014, "grad_norm": 0.01642211526632309, "learning_rate": 5.6857264142802535e-05, "loss": 0.0118, "step": 3569 }, { "epoch": 2.552278820375335, "grad_norm": 0.01725701056420803, "learning_rate": 5.683253027682597e-05, "loss": 0.0102, "step": 3570 }, { "epoch": 2.552278820375335, "eval_loss": 0.009783279150724411, "eval_runtime": 4.5882, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 3570 }, { "epoch": 2.552993744414656, "grad_norm": 0.019369741901755333, "learning_rate": 5.680779470695791e-05, "loss": 0.0083, "step": 3571 }, { "epoch": 2.553708668453977, "grad_norm": 0.018769899383187294, "learning_rate": 5.678305743936692e-05, "loss": 0.0112, "step": 3572 }, { "epoch": 2.5544235924932974, "grad_norm": 0.016634538769721985, "learning_rate": 5.675831848022195e-05, "loss": 0.0117, "step": 3573 }, { "epoch": 2.5551385165326184, "grad_norm": 0.017604121938347816, "learning_rate": 5.6733577835692374e-05, "loss": 0.0107, "step": 3574 }, { "epoch": 2.5558534405719393, "grad_norm": 0.016330348327755928, "learning_rate": 5.670883551194803e-05, "loss": 0.0098, "step": 3575 }, { "epoch": 2.5558534405719393, "eval_loss": 0.009748725220561028, "eval_runtime": 4.6176, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.815, "step": 3575 }, { "epoch": 2.55656836461126, "grad_norm": 0.018252486363053322, "learning_rate": 5.66840915151591e-05, "loss": 0.0156, "step": 3576 }, { "epoch": 2.5572832886505807, "grad_norm": 0.01912430115044117, "learning_rate": 5.665934585149626e-05, "loss": 0.0135, "step": 3577 }, { "epoch": 2.5579982126899017, "grad_norm": 0.01513961423188448, "learning_rate": 5.663459852713055e-05, "loss": 0.0116, "step": 3578 }, { "epoch": 2.5587131367292226, "grad_norm": 0.01818014122545719, "learning_rate": 5.660984954823342e-05, "loss": 0.014, "step": 3579 }, { "epoch": 2.5594280607685436, "grad_norm": 0.016291329637169838, 
"learning_rate": 5.6585098920976785e-05, "loss": 0.0076, "step": 3580 }, { "epoch": 2.5594280607685436, "eval_loss": 0.009794091805815697, "eval_runtime": 4.6109, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.819, "step": 3580 }, { "epoch": 2.560142984807864, "grad_norm": 0.020286403596401215, "learning_rate": 5.656034665153294e-05, "loss": 0.0147, "step": 3581 }, { "epoch": 2.560857908847185, "grad_norm": 0.019734598696231842, "learning_rate": 5.653559274607455e-05, "loss": 0.0123, "step": 3582 }, { "epoch": 2.561572832886506, "grad_norm": 0.028509529307484627, "learning_rate": 5.6510837210774746e-05, "loss": 0.0197, "step": 3583 }, { "epoch": 2.5622877569258264, "grad_norm": 0.015357032418251038, "learning_rate": 5.648608005180707e-05, "loss": 0.0133, "step": 3584 }, { "epoch": 2.5630026809651474, "grad_norm": 0.021490972489118576, "learning_rate": 5.6461321275345405e-05, "loss": 0.0136, "step": 3585 }, { "epoch": 2.5630026809651474, "eval_loss": 0.009757224470376968, "eval_runtime": 4.5903, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 3585 }, { "epoch": 2.5637176050044683, "grad_norm": 0.015702730044722557, "learning_rate": 5.64365608875641e-05, "loss": 0.0078, "step": 3586 }, { "epoch": 2.5644325290437893, "grad_norm": 0.020602749660611153, "learning_rate": 5.641179889463788e-05, "loss": 0.011, "step": 3587 }, { "epoch": 2.5651474530831098, "grad_norm": 0.020175818353891373, "learning_rate": 5.6387035302741864e-05, "loss": 0.0139, "step": 3588 }, { "epoch": 2.5658623771224307, "grad_norm": 0.019049881026148796, "learning_rate": 5.63622701180516e-05, "loss": 0.0109, "step": 3589 }, { "epoch": 2.5665773011617516, "grad_norm": 0.013467571698129177, "learning_rate": 5.6337503346742995e-05, "loss": 0.0095, "step": 3590 }, { "epoch": 2.5665773011617516, "eval_loss": 0.009805697947740555, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 3590 }, { "epoch": 2.567292225201072, "grad_norm": 0.017252733930945396, "learning_rate": 5.631273499499236e-05, "loss": 0.0084, "step": 3591 }, { "epoch": 2.568007149240393, "grad_norm": 0.02046464942395687, "learning_rate": 5.6287965068976425e-05, "loss": 0.015, "step": 3592 }, { "epoch": 2.568722073279714, "grad_norm": 0.018521279096603394, "learning_rate": 5.62631935748723e-05, "loss": 0.0083, "step": 3593 }, { "epoch": 2.569436997319035, "grad_norm": 0.018549978733062744, "learning_rate": 5.6238420518857474e-05, "loss": 0.0115, "step": 3594 }, { "epoch": 2.570151921358356, "grad_norm": 0.017956461757421494, "learning_rate": 5.621364590710981e-05, "loss": 0.0119, "step": 3595 }, { "epoch": 2.570151921358356, "eval_loss": 0.009891853667795658, "eval_runtime": 4.5862, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 3595 }, { "epoch": 2.5708668453976764, "grad_norm": 0.025391001254320145, "learning_rate": 5.618886974580761e-05, "loss": 0.0215, "step": 3596 }, { "epoch": 2.5715817694369973, "grad_norm": 0.020184624940156937, "learning_rate": 5.6164092041129546e-05, "loss": 0.016, "step": 3597 }, { "epoch": 2.5722966934763183, "grad_norm": 0.021665731444954872, "learning_rate": 5.613931279925465e-05, "loss": 0.0173, "step": 3598 }, { "epoch": 2.573011617515639, "grad_norm": 0.01348867267370224, "learning_rate": 5.6114532026362356e-05, "loss": 0.0074, "step": 3599 }, { "epoch": 2.5737265415549597, "grad_norm": 0.02003348432481289, "learning_rate": 5.6089749728632446e-05, "loss": 0.0137, "step": 3600 }, { "epoch": 2.5737265415549597, 
"eval_loss": 0.009791860356926918, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 3600 }, { "epoch": 2.5744414655942807, "grad_norm": 0.018992537632584572, "learning_rate": 5.606496591224516e-05, "loss": 0.0087, "step": 3601 }, { "epoch": 2.5751563896336016, "grad_norm": 0.019080743193626404, "learning_rate": 5.604018058338104e-05, "loss": 0.0137, "step": 3602 }, { "epoch": 2.575871313672922, "grad_norm": 0.016177577897906303, "learning_rate": 5.6015393748221024e-05, "loss": 0.0085, "step": 3603 }, { "epoch": 2.576586237712243, "grad_norm": 0.020405350252985954, "learning_rate": 5.5990605412946464e-05, "loss": 0.0185, "step": 3604 }, { "epoch": 2.577301161751564, "grad_norm": 0.025122297927737236, "learning_rate": 5.596581558373904e-05, "loss": 0.0172, "step": 3605 }, { "epoch": 2.577301161751564, "eval_loss": 0.009770983830094337, "eval_runtime": 4.5895, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 3605 }, { "epoch": 2.5780160857908845, "grad_norm": 0.017446396872401237, "learning_rate": 5.594102426678082e-05, "loss": 0.0085, "step": 3606 }, { "epoch": 2.5787310098302054, "grad_norm": 0.024099184200167656, "learning_rate": 5.591623146825423e-05, "loss": 0.0153, "step": 3607 }, { "epoch": 2.5794459338695264, "grad_norm": 0.018504120409488678, "learning_rate": 5.5891437194342114e-05, "loss": 0.0106, "step": 3608 }, { "epoch": 2.5801608579088473, "grad_norm": 0.025876890867948532, "learning_rate": 5.586664145122764e-05, "loss": 0.0159, "step": 3609 }, { "epoch": 2.5808757819481682, "grad_norm": 0.017959048971533775, "learning_rate": 5.584184424509434e-05, "loss": 0.0094, "step": 3610 }, { "epoch": 2.5808757819481682, "eval_loss": 0.009920342825353146, "eval_runtime": 4.6091, "eval_samples_per_second": 10.848, "eval_steps_per_second": 2.821, "step": 3610 }, { "epoch": 2.5815907059874887, "grad_norm": 0.01753164827823639, "learning_rate": 5.581704558212615e-05, "loss": 0.0099, "step": 3611 }, { "epoch": 2.5823056300268097, "grad_norm": 0.02753392979502678, "learning_rate": 5.57922454685073e-05, "loss": 0.0208, "step": 3612 }, { "epoch": 2.5830205540661306, "grad_norm": 0.01827365905046463, "learning_rate": 5.5767443910422455e-05, "loss": 0.0164, "step": 3613 }, { "epoch": 2.583735478105451, "grad_norm": 0.01673763245344162, "learning_rate": 5.574264091405661e-05, "loss": 0.0131, "step": 3614 }, { "epoch": 2.584450402144772, "grad_norm": 0.013868700712919235, "learning_rate": 5.57178364855951e-05, "loss": 0.0095, "step": 3615 }, { "epoch": 2.584450402144772, "eval_loss": 0.009930268861353397, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 3615 }, { "epoch": 2.585165326184093, "grad_norm": 0.019804468378424644, "learning_rate": 5.569303063122364e-05, "loss": 0.0116, "step": 3616 }, { "epoch": 2.585880250223414, "grad_norm": 0.016557831317186356, "learning_rate": 5.566822335712832e-05, "loss": 0.0141, "step": 3617 }, { "epoch": 2.5865951742627344, "grad_norm": 0.01370214018970728, "learning_rate": 5.564341466949553e-05, "loss": 0.0121, "step": 3618 }, { "epoch": 2.5873100983020554, "grad_norm": 0.014599219895899296, "learning_rate": 5.561860457451207e-05, "loss": 0.0154, "step": 3619 }, { "epoch": 2.5880250223413763, "grad_norm": 0.015908993780612946, "learning_rate": 5.559379307836503e-05, "loss": 0.0127, "step": 3620 }, { "epoch": 2.5880250223413763, "eval_loss": 0.009797872044146061, "eval_runtime": 4.5872, "eval_samples_per_second": 10.9, 
"eval_steps_per_second": 2.834, "step": 3620 }, { "epoch": 2.588739946380697, "grad_norm": 0.017824312672019005, "learning_rate": 5.556898018724191e-05, "loss": 0.0165, "step": 3621 }, { "epoch": 2.5894548704200178, "grad_norm": 0.017058156430721283, "learning_rate": 5.554416590733054e-05, "loss": 0.0093, "step": 3622 }, { "epoch": 2.5901697944593387, "grad_norm": 0.013529292307794094, "learning_rate": 5.5519350244819057e-05, "loss": 0.0083, "step": 3623 }, { "epoch": 2.5908847184986596, "grad_norm": 0.022954823449254036, "learning_rate": 5.549453320589597e-05, "loss": 0.0116, "step": 3624 }, { "epoch": 2.5915996425379806, "grad_norm": 0.0214945450425148, "learning_rate": 5.546971479675017e-05, "loss": 0.0101, "step": 3625 }, { "epoch": 2.5915996425379806, "eval_loss": 0.009793068282306194, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 3625 }, { "epoch": 2.592314566577301, "grad_norm": 0.01899578608572483, "learning_rate": 5.544489502357084e-05, "loss": 0.0102, "step": 3626 }, { "epoch": 2.593029490616622, "grad_norm": 0.020324287936091423, "learning_rate": 5.5420073892547484e-05, "loss": 0.0127, "step": 3627 }, { "epoch": 2.593744414655943, "grad_norm": 0.01747136376798153, "learning_rate": 5.539525140987003e-05, "loss": 0.0115, "step": 3628 }, { "epoch": 2.5944593386952635, "grad_norm": 0.020057834684848785, "learning_rate": 5.537042758172866e-05, "loss": 0.0088, "step": 3629 }, { "epoch": 2.5951742627345844, "grad_norm": 0.02446053922176361, "learning_rate": 5.534560241431392e-05, "loss": 0.0158, "step": 3630 }, { "epoch": 2.5951742627345844, "eval_loss": 0.009865433908998966, "eval_runtime": 4.606, "eval_samples_per_second": 10.855, "eval_steps_per_second": 2.822, "step": 3630 }, { "epoch": 2.5958891867739053, "grad_norm": 0.020393501967191696, "learning_rate": 5.532077591381672e-05, "loss": 0.0075, "step": 3631 }, { "epoch": 2.5966041108132263, "grad_norm": 0.023244177922606468, "learning_rate": 5.529594808642824e-05, "loss": 0.0174, "step": 3632 }, { "epoch": 2.597319034852547, "grad_norm": 0.018165210261940956, "learning_rate": 5.527111893834004e-05, "loss": 0.0134, "step": 3633 }, { "epoch": 2.5980339588918677, "grad_norm": 0.024112829938530922, "learning_rate": 5.524628847574401e-05, "loss": 0.0145, "step": 3634 }, { "epoch": 2.5987488829311887, "grad_norm": 0.02168179303407669, "learning_rate": 5.522145670483233e-05, "loss": 0.0118, "step": 3635 }, { "epoch": 2.5987488829311887, "eval_loss": 0.00984195526689291, "eval_runtime": 4.6052, "eval_samples_per_second": 10.857, "eval_steps_per_second": 2.823, "step": 3635 }, { "epoch": 2.599463806970509, "grad_norm": 0.01511990837752819, "learning_rate": 5.519662363179754e-05, "loss": 0.0082, "step": 3636 }, { "epoch": 2.60017873100983, "grad_norm": 0.021410802379250526, "learning_rate": 5.5171789262832496e-05, "loss": 0.0165, "step": 3637 }, { "epoch": 2.600893655049151, "grad_norm": 0.019150041043758392, "learning_rate": 5.514695360413037e-05, "loss": 0.0164, "step": 3638 }, { "epoch": 2.601608579088472, "grad_norm": 0.015402458608150482, "learning_rate": 5.5122116661884646e-05, "loss": 0.0108, "step": 3639 }, { "epoch": 2.602323503127793, "grad_norm": 0.015448440797626972, "learning_rate": 5.509727844228917e-05, "loss": 0.008, "step": 3640 }, { "epoch": 2.602323503127793, "eval_loss": 0.009657224640250206, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3640 }, { "epoch": 2.6030384271671134, "grad_norm": 0.02169787511229515, 
"learning_rate": 5.507243895153807e-05, "loss": 0.0113, "step": 3641 }, { "epoch": 2.6037533512064344, "grad_norm": 0.013738666661083698, "learning_rate": 5.504759819582581e-05, "loss": 0.0116, "step": 3642 }, { "epoch": 2.6044682752457553, "grad_norm": 0.014071610756218433, "learning_rate": 5.502275618134715e-05, "loss": 0.0079, "step": 3643 }, { "epoch": 2.605183199285076, "grad_norm": 0.022182727232575417, "learning_rate": 5.4997912914297145e-05, "loss": 0.0255, "step": 3644 }, { "epoch": 2.6058981233243967, "grad_norm": 0.017825355753302574, "learning_rate": 5.497306840087124e-05, "loss": 0.0157, "step": 3645 }, { "epoch": 2.6058981233243967, "eval_loss": 0.009526352398097515, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3645 }, { "epoch": 2.6066130473637177, "grad_norm": 0.014245634898543358, "learning_rate": 5.4948222647265116e-05, "loss": 0.0087, "step": 3646 }, { "epoch": 2.6073279714030386, "grad_norm": 0.0194789320230484, "learning_rate": 5.492337565967479e-05, "loss": 0.0128, "step": 3647 }, { "epoch": 2.608042895442359, "grad_norm": 0.018353857100009918, "learning_rate": 5.489852744429659e-05, "loss": 0.0113, "step": 3648 }, { "epoch": 2.60875781948168, "grad_norm": 0.01668408326804638, "learning_rate": 5.487367800732714e-05, "loss": 0.0128, "step": 3649 }, { "epoch": 2.609472743521001, "grad_norm": 0.019776375964283943, "learning_rate": 5.4848827354963396e-05, "loss": 0.0113, "step": 3650 }, { "epoch": 2.609472743521001, "eval_loss": 0.009506037458777428, "eval_runtime": 4.5896, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 3650 }, { "epoch": 2.6101876675603215, "grad_norm": 0.01822618953883648, "learning_rate": 5.482397549340256e-05, "loss": 0.0082, "step": 3651 }, { "epoch": 2.6109025915996424, "grad_norm": 0.01683836244046688, "learning_rate": 5.4799122428842185e-05, "loss": 0.0107, "step": 3652 }, { "epoch": 2.6116175156389634, "grad_norm": 0.01754283346235752, "learning_rate": 5.477426816748014e-05, "loss": 0.0117, "step": 3653 }, { "epoch": 2.6123324396782843, "grad_norm": 0.011084221303462982, "learning_rate": 5.474941271551453e-05, "loss": 0.0064, "step": 3654 }, { "epoch": 2.6130473637176053, "grad_norm": 0.026071395725011826, "learning_rate": 5.47245560791438e-05, "loss": 0.0162, "step": 3655 }, { "epoch": 2.6130473637176053, "eval_loss": 0.009657188318669796, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 3655 }, { "epoch": 2.6137622877569258, "grad_norm": 0.017331063747406006, "learning_rate": 5.469969826456667e-05, "loss": 0.0087, "step": 3656 }, { "epoch": 2.6144772117962467, "grad_norm": 0.018337858840823174, "learning_rate": 5.467483927798217e-05, "loss": 0.0112, "step": 3657 }, { "epoch": 2.615192135835567, "grad_norm": 0.024812130257487297, "learning_rate": 5.464997912558963e-05, "loss": 0.0119, "step": 3658 }, { "epoch": 2.615907059874888, "grad_norm": 0.014692067168653011, "learning_rate": 5.462511781358866e-05, "loss": 0.01, "step": 3659 }, { "epoch": 2.616621983914209, "grad_norm": 0.022502830252051353, "learning_rate": 5.460025534817911e-05, "loss": 0.014, "step": 3660 }, { "epoch": 2.616621983914209, "eval_loss": 0.009865226224064827, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 3660 }, { "epoch": 2.61733690795353, "grad_norm": 0.022578410804271698, "learning_rate": 5.457539173556121e-05, "loss": 0.0125, "step": 3661 }, { "epoch": 2.618051831992851, "grad_norm": 
0.021808432415127754, "learning_rate": 5.45505269819354e-05, "loss": 0.0123, "step": 3662 }, { "epoch": 2.6187667560321715, "grad_norm": 0.022117773070931435, "learning_rate": 5.4525661093502475e-05, "loss": 0.0127, "step": 3663 }, { "epoch": 2.6194816800714924, "grad_norm": 0.016008099541068077, "learning_rate": 5.450079407646342e-05, "loss": 0.0084, "step": 3664 }, { "epoch": 2.6201966041108133, "grad_norm": 0.020802093669772148, "learning_rate": 5.44759259370196e-05, "loss": 0.0192, "step": 3665 }, { "epoch": 2.6201966041108133, "eval_loss": 0.010034097358584404, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 3665 }, { "epoch": 2.620911528150134, "grad_norm": 0.022768516093492508, "learning_rate": 5.4451056681372606e-05, "loss": 0.0162, "step": 3666 }, { "epoch": 2.621626452189455, "grad_norm": 0.0233808271586895, "learning_rate": 5.442618631572428e-05, "loss": 0.0157, "step": 3667 }, { "epoch": 2.6223413762287757, "grad_norm": 0.020047755911946297, "learning_rate": 5.4401314846276806e-05, "loss": 0.0111, "step": 3668 }, { "epoch": 2.6230563002680967, "grad_norm": 0.01622384786605835, "learning_rate": 5.437644227923261e-05, "loss": 0.0096, "step": 3669 }, { "epoch": 2.6237712243074176, "grad_norm": 0.023330083116889, "learning_rate": 5.435156862079439e-05, "loss": 0.0173, "step": 3670 }, { "epoch": 2.6237712243074176, "eval_loss": 0.009890497662127018, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3670 }, { "epoch": 2.624486148346738, "grad_norm": 0.018357956781983376, "learning_rate": 5.4326693877165125e-05, "loss": 0.011, "step": 3671 }, { "epoch": 2.625201072386059, "grad_norm": 0.01707778498530388, "learning_rate": 5.4301818054548046e-05, "loss": 0.0091, "step": 3672 }, { "epoch": 2.6259159964253795, "grad_norm": 0.019555214792490005, "learning_rate": 5.427694115914669e-05, "loss": 0.0148, "step": 3673 }, { "epoch": 2.6266309204647005, "grad_norm": 0.015194660983979702, "learning_rate": 5.425206319716483e-05, "loss": 0.0099, "step": 3674 }, { "epoch": 2.6273458445040214, "grad_norm": 0.020993394777178764, "learning_rate": 5.422718417480651e-05, "loss": 0.02, "step": 3675 }, { "epoch": 2.6273458445040214, "eval_loss": 0.00992986187338829, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 3675 }, { "epoch": 2.6280607685433424, "grad_norm": 0.016972769051790237, "learning_rate": 5.4202304098276034e-05, "loss": 0.0129, "step": 3676 }, { "epoch": 2.6287756925826633, "grad_norm": 0.02343299426138401, "learning_rate": 5.417742297377801e-05, "loss": 0.0098, "step": 3677 }, { "epoch": 2.629490616621984, "grad_norm": 0.018698077648878098, "learning_rate": 5.415254080751725e-05, "loss": 0.0108, "step": 3678 }, { "epoch": 2.6302055406613047, "grad_norm": 0.017802143469452858, "learning_rate": 5.412765760569886e-05, "loss": 0.0098, "step": 3679 }, { "epoch": 2.6309204647006257, "grad_norm": 0.020190848037600517, "learning_rate": 5.410277337452817e-05, "loss": 0.011, "step": 3680 }, { "epoch": 2.6309204647006257, "eval_loss": 0.009768341667950153, "eval_runtime": 4.5874, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 3680 }, { "epoch": 2.631635388739946, "grad_norm": 0.01846250332891941, "learning_rate": 5.407788812021082e-05, "loss": 0.014, "step": 3681 }, { "epoch": 2.632350312779267, "grad_norm": 0.02388993836939335, "learning_rate": 5.405300184895268e-05, "loss": 0.0095, "step": 3682 }, { "epoch": 2.633065236818588, 
"grad_norm": 0.01926545798778534, "learning_rate": 5.4028114566959853e-05, "loss": 0.0143, "step": 3683 }, { "epoch": 2.633780160857909, "grad_norm": 0.01574675925076008, "learning_rate": 5.400322628043869e-05, "loss": 0.0126, "step": 3684 }, { "epoch": 2.63449508489723, "grad_norm": 0.021531620994210243, "learning_rate": 5.397833699559587e-05, "loss": 0.0115, "step": 3685 }, { "epoch": 2.63449508489723, "eval_loss": 0.009589532390236855, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 3685 }, { "epoch": 2.6352100089365504, "grad_norm": 0.022794419899582863, "learning_rate": 5.395344671863823e-05, "loss": 0.0125, "step": 3686 }, { "epoch": 2.6359249329758714, "grad_norm": 0.023499278351664543, "learning_rate": 5.39285554557729e-05, "loss": 0.015, "step": 3687 }, { "epoch": 2.636639857015192, "grad_norm": 0.01878574676811695, "learning_rate": 5.3903663213207235e-05, "loss": 0.0107, "step": 3688 }, { "epoch": 2.637354781054513, "grad_norm": 0.025075184181332588, "learning_rate": 5.387876999714885e-05, "loss": 0.0159, "step": 3689 }, { "epoch": 2.6380697050938338, "grad_norm": 0.01564769633114338, "learning_rate": 5.385387581380561e-05, "loss": 0.0135, "step": 3690 }, { "epoch": 2.6380697050938338, "eval_loss": 0.009448870085179806, "eval_runtime": 4.6162, "eval_samples_per_second": 10.831, "eval_steps_per_second": 2.816, "step": 3690 }, { "epoch": 2.6387846291331547, "grad_norm": 0.01389281265437603, "learning_rate": 5.382898066938559e-05, "loss": 0.0109, "step": 3691 }, { "epoch": 2.6394995531724756, "grad_norm": 0.013740493915975094, "learning_rate": 5.3804084570097116e-05, "loss": 0.007, "step": 3692 }, { "epoch": 2.640214477211796, "grad_norm": 0.017708489671349525, "learning_rate": 5.377918752214878e-05, "loss": 0.0116, "step": 3693 }, { "epoch": 2.640929401251117, "grad_norm": 0.016468247398734093, "learning_rate": 5.375428953174939e-05, "loss": 0.0119, "step": 3694 }, { "epoch": 2.641644325290438, "grad_norm": 0.01804787665605545, "learning_rate": 5.3729390605107964e-05, "loss": 0.0115, "step": 3695 }, { "epoch": 2.641644325290438, "eval_loss": 0.009630408138036728, "eval_runtime": 4.5873, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 3695 }, { "epoch": 2.6423592493297585, "grad_norm": 0.0135718435049057, "learning_rate": 5.37044907484338e-05, "loss": 0.0099, "step": 3696 }, { "epoch": 2.6430741733690795, "grad_norm": 0.015487894415855408, "learning_rate": 5.367958996793641e-05, "loss": 0.0083, "step": 3697 }, { "epoch": 2.6437890974084004, "grad_norm": 0.02050042897462845, "learning_rate": 5.3654688269825524e-05, "loss": 0.0182, "step": 3698 }, { "epoch": 2.6445040214477213, "grad_norm": 0.018230173736810684, "learning_rate": 5.362978566031112e-05, "loss": 0.012, "step": 3699 }, { "epoch": 2.645218945487042, "grad_norm": 0.018377263098955154, "learning_rate": 5.360488214560336e-05, "loss": 0.008, "step": 3700 }, { "epoch": 2.645218945487042, "eval_loss": 0.009585614316165447, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 3700 }, { "epoch": 2.645933869526363, "grad_norm": 0.01765885204076767, "learning_rate": 5.35799777319127e-05, "loss": 0.0107, "step": 3701 }, { "epoch": 2.6466487935656837, "grad_norm": 0.019745638594031334, "learning_rate": 5.3555072425449784e-05, "loss": 0.0161, "step": 3702 }, { "epoch": 2.647363717605004, "grad_norm": 0.01804669015109539, "learning_rate": 5.353016623242545e-05, "loss": 0.012, "step": 3703 }, { "epoch": 
2.648078641644325, "grad_norm": 0.019633762538433075, "learning_rate": 5.3505259159050804e-05, "loss": 0.013, "step": 3704 }, { "epoch": 2.648793565683646, "grad_norm": 0.01989658549427986, "learning_rate": 5.348035121153716e-05, "loss": 0.0146, "step": 3705 }, { "epoch": 2.648793565683646, "eval_loss": 0.00968841277062893, "eval_runtime": 4.5998, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 3705 }, { "epoch": 2.649508489722967, "grad_norm": 0.021446673199534416, "learning_rate": 5.345544239609604e-05, "loss": 0.0118, "step": 3706 }, { "epoch": 2.650223413762288, "grad_norm": 0.019596491008996964, "learning_rate": 5.343053271893919e-05, "loss": 0.0121, "step": 3707 }, { "epoch": 2.6509383378016085, "grad_norm": 0.014640039764344692, "learning_rate": 5.340562218627854e-05, "loss": 0.0082, "step": 3708 }, { "epoch": 2.6516532618409294, "grad_norm": 0.02010459266602993, "learning_rate": 5.338071080432631e-05, "loss": 0.0156, "step": 3709 }, { "epoch": 2.6523681858802504, "grad_norm": 0.016572359949350357, "learning_rate": 5.3355798579294834e-05, "loss": 0.0086, "step": 3710 }, { "epoch": 2.6523681858802504, "eval_loss": 0.009728018194437027, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 3710 }, { "epoch": 2.653083109919571, "grad_norm": 0.020264217630028725, "learning_rate": 5.333088551739675e-05, "loss": 0.0126, "step": 3711 }, { "epoch": 2.653798033958892, "grad_norm": 0.017769789323210716, "learning_rate": 5.330597162484481e-05, "loss": 0.0093, "step": 3712 }, { "epoch": 2.6545129579982127, "grad_norm": 0.019003620371222496, "learning_rate": 5.3281056907852054e-05, "loss": 0.0108, "step": 3713 }, { "epoch": 2.6552278820375337, "grad_norm": 0.01712159812450409, "learning_rate": 5.325614137263169e-05, "loss": 0.0083, "step": 3714 }, { "epoch": 2.655942806076854, "grad_norm": 0.017875492572784424, "learning_rate": 5.3231225025397146e-05, "loss": 0.0095, "step": 3715 }, { "epoch": 2.655942806076854, "eval_loss": 0.00965079665184021, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3715 }, { "epoch": 2.656657730116175, "grad_norm": 0.018540315330028534, "learning_rate": 5.320630787236203e-05, "loss": 0.0158, "step": 3716 }, { "epoch": 2.657372654155496, "grad_norm": 0.019182033836841583, "learning_rate": 5.3181389919740164e-05, "loss": 0.0107, "step": 3717 }, { "epoch": 2.6580875781948166, "grad_norm": 0.018499331548810005, "learning_rate": 5.315647117374556e-05, "loss": 0.0103, "step": 3718 }, { "epoch": 2.6588025022341375, "grad_norm": 0.01724480465054512, "learning_rate": 5.313155164059247e-05, "loss": 0.0116, "step": 3719 }, { "epoch": 2.6595174262734584, "grad_norm": 0.01503397524356842, "learning_rate": 5.310663132649526e-05, "loss": 0.0096, "step": 3720 }, { "epoch": 2.6595174262734584, "eval_loss": 0.009629864245653152, "eval_runtime": 4.6136, "eval_samples_per_second": 10.838, "eval_steps_per_second": 2.818, "step": 3720 }, { "epoch": 2.6602323503127794, "grad_norm": 0.013882444240152836, "learning_rate": 5.308171023766859e-05, "loss": 0.0093, "step": 3721 }, { "epoch": 2.6609472743521003, "grad_norm": 0.018950236961245537, "learning_rate": 5.305678838032725e-05, "loss": 0.0144, "step": 3722 }, { "epoch": 2.661662198391421, "grad_norm": 0.019667666405439377, "learning_rate": 5.3031865760686206e-05, "loss": 0.0111, "step": 3723 }, { "epoch": 2.6623771224307418, "grad_norm": 0.019436607137322426, "learning_rate": 5.300694238496067e-05, "loss": 0.0163, 
"step": 3724 }, { "epoch": 2.6630920464700627, "grad_norm": 0.014687875285744667, "learning_rate": 5.2982018259366006e-05, "loss": 0.0101, "step": 3725 }, { "epoch": 2.6630920464700627, "eval_loss": 0.009629910811781883, "eval_runtime": 4.5923, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 3725 }, { "epoch": 2.663806970509383, "grad_norm": 0.020263655111193657, "learning_rate": 5.2957093390117784e-05, "loss": 0.012, "step": 3726 }, { "epoch": 2.664521894548704, "grad_norm": 0.016921555623412132, "learning_rate": 5.293216778343174e-05, "loss": 0.008, "step": 3727 }, { "epoch": 2.665236818588025, "grad_norm": 0.019538499414920807, "learning_rate": 5.290724144552379e-05, "loss": 0.0143, "step": 3728 }, { "epoch": 2.665951742627346, "grad_norm": 0.018216164782643318, "learning_rate": 5.288231438261008e-05, "loss": 0.0081, "step": 3729 }, { "epoch": 2.6666666666666665, "grad_norm": 0.016554025933146477, "learning_rate": 5.285738660090688e-05, "loss": 0.011, "step": 3730 }, { "epoch": 2.6666666666666665, "eval_loss": 0.009693811647593975, "eval_runtime": 4.5908, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 3730 }, { "epoch": 2.6673815907059875, "grad_norm": 0.017948955297470093, "learning_rate": 5.283245810663068e-05, "loss": 0.0107, "step": 3731 }, { "epoch": 2.6680965147453084, "grad_norm": 0.01827697455883026, "learning_rate": 5.280752890599809e-05, "loss": 0.0079, "step": 3732 }, { "epoch": 2.668811438784629, "grad_norm": 0.014364410191774368, "learning_rate": 5.2782599005226e-05, "loss": 0.007, "step": 3733 }, { "epoch": 2.66952636282395, "grad_norm": 0.026612676680088043, "learning_rate": 5.275766841053136e-05, "loss": 0.0193, "step": 3734 }, { "epoch": 2.670241286863271, "grad_norm": 0.0250239260494709, "learning_rate": 5.273273712813135e-05, "loss": 0.0106, "step": 3735 }, { "epoch": 2.670241286863271, "eval_loss": 0.009753533639013767, "eval_runtime": 4.6392, "eval_samples_per_second": 10.778, "eval_steps_per_second": 2.802, "step": 3735 }, { "epoch": 2.6709562109025917, "grad_norm": 0.015204629860818386, "learning_rate": 5.270780516424333e-05, "loss": 0.0095, "step": 3736 }, { "epoch": 2.6716711349419127, "grad_norm": 0.018932469189167023, "learning_rate": 5.2682872525084804e-05, "loss": 0.0087, "step": 3737 }, { "epoch": 2.672386058981233, "grad_norm": 0.01783701218664646, "learning_rate": 5.2657939216873465e-05, "loss": 0.0064, "step": 3738 }, { "epoch": 2.673100983020554, "grad_norm": 0.018393579870462418, "learning_rate": 5.263300524582717e-05, "loss": 0.0109, "step": 3739 }, { "epoch": 2.673815907059875, "grad_norm": 0.01629531942307949, "learning_rate": 5.260807061816389e-05, "loss": 0.0095, "step": 3740 }, { "epoch": 2.673815907059875, "eval_loss": 0.009724516421556473, "eval_runtime": 4.593, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 3740 }, { "epoch": 2.6745308310991955, "grad_norm": 0.026652880012989044, "learning_rate": 5.2583135340101865e-05, "loss": 0.0089, "step": 3741 }, { "epoch": 2.6752457551385165, "grad_norm": 0.020730163902044296, "learning_rate": 5.255819941785941e-05, "loss": 0.0147, "step": 3742 }, { "epoch": 2.6759606791778374, "grad_norm": 0.02055187337100506, "learning_rate": 5.253326285765502e-05, "loss": 0.0189, "step": 3743 }, { "epoch": 2.6766756032171584, "grad_norm": 0.016159119084477425, "learning_rate": 5.250832566570736e-05, "loss": 0.0079, "step": 3744 }, { "epoch": 2.677390527256479, "grad_norm": 0.019492406398057938, "learning_rate": 5.2483387848235264e-05, 
"loss": 0.0104, "step": 3745 }, { "epoch": 2.677390527256479, "eval_loss": 0.009766340255737305, "eval_runtime": 4.5958, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 3745 }, { "epoch": 2.6781054512958, "grad_norm": 0.013391838409006596, "learning_rate": 5.2458449411457685e-05, "loss": 0.0069, "step": 3746 }, { "epoch": 2.6788203753351207, "grad_norm": 0.022921906784176826, "learning_rate": 5.243351036159378e-05, "loss": 0.0099, "step": 3747 }, { "epoch": 2.6795352993744412, "grad_norm": 0.018953654915094376, "learning_rate": 5.240857070486279e-05, "loss": 0.0125, "step": 3748 }, { "epoch": 2.680250223413762, "grad_norm": 0.023485351353883743, "learning_rate": 5.2383630447484185e-05, "loss": 0.014, "step": 3749 }, { "epoch": 2.680965147453083, "grad_norm": 0.022454965859651566, "learning_rate": 5.235868959567755e-05, "loss": 0.013, "step": 3750 }, { "epoch": 2.680965147453083, "eval_loss": 0.009726064279675484, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 3750 }, { "epoch": 2.681680071492404, "grad_norm": 0.021823931485414505, "learning_rate": 5.233374815566257e-05, "loss": 0.0188, "step": 3751 }, { "epoch": 2.682394995531725, "grad_norm": 0.01701224595308304, "learning_rate": 5.230880613365918e-05, "loss": 0.0138, "step": 3752 }, { "epoch": 2.6831099195710455, "grad_norm": 0.013715523295104504, "learning_rate": 5.228386353588737e-05, "loss": 0.0061, "step": 3753 }, { "epoch": 2.6838248436103664, "grad_norm": 0.02017127349972725, "learning_rate": 5.2258920368567336e-05, "loss": 0.0152, "step": 3754 }, { "epoch": 2.6845397676496874, "grad_norm": 0.019682958722114563, "learning_rate": 5.2233976637919347e-05, "loss": 0.0093, "step": 3755 }, { "epoch": 2.6845397676496874, "eval_loss": 0.009664520621299744, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 3755 }, { "epoch": 2.685254691689008, "grad_norm": 0.01690613105893135, "learning_rate": 5.2209032350163876e-05, "loss": 0.0138, "step": 3756 }, { "epoch": 2.685969615728329, "grad_norm": 0.02007170207798481, "learning_rate": 5.2184087511521515e-05, "loss": 0.0134, "step": 3757 }, { "epoch": 2.6866845397676498, "grad_norm": 0.01624143496155739, "learning_rate": 5.215914212821298e-05, "loss": 0.0116, "step": 3758 }, { "epoch": 2.6873994638069707, "grad_norm": 0.017766833305358887, "learning_rate": 5.213419620645914e-05, "loss": 0.0163, "step": 3759 }, { "epoch": 2.688114387846291, "grad_norm": 0.014993246644735336, "learning_rate": 5.2109249752480983e-05, "loss": 0.0117, "step": 3760 }, { "epoch": 2.688114387846291, "eval_loss": 0.009577550925314426, "eval_runtime": 4.5795, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 3760 }, { "epoch": 2.688829311885612, "grad_norm": 0.016266079619526863, "learning_rate": 5.208430277249965e-05, "loss": 0.0104, "step": 3761 }, { "epoch": 2.689544235924933, "grad_norm": 0.018403440713882446, "learning_rate": 5.205935527273638e-05, "loss": 0.0133, "step": 3762 }, { "epoch": 2.6902591599642536, "grad_norm": 0.018268940970301628, "learning_rate": 5.2034407259412585e-05, "loss": 0.0143, "step": 3763 }, { "epoch": 2.6909740840035745, "grad_norm": 0.01859382353723049, "learning_rate": 5.200945873874978e-05, "loss": 0.0102, "step": 3764 }, { "epoch": 2.6916890080428955, "grad_norm": 0.025131607428193092, "learning_rate": 5.198450971696959e-05, "loss": 0.0181, "step": 3765 }, { "epoch": 2.6916890080428955, "eval_loss": 0.009544231928884983, "eval_runtime": 
4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 3765 }, { "epoch": 2.6924039320822164, "grad_norm": 0.01988280937075615, "learning_rate": 5.19595602002938e-05, "loss": 0.0151, "step": 3766 }, { "epoch": 2.6931188561215373, "grad_norm": 0.020331090316176414, "learning_rate": 5.19346101949443e-05, "loss": 0.0144, "step": 3767 }, { "epoch": 2.693833780160858, "grad_norm": 0.01569848693907261, "learning_rate": 5.19096597071431e-05, "loss": 0.0069, "step": 3768 }, { "epoch": 2.694548704200179, "grad_norm": 0.018387161195278168, "learning_rate": 5.188470874311234e-05, "loss": 0.0105, "step": 3769 }, { "epoch": 2.6952636282394993, "grad_norm": 0.01769012212753296, "learning_rate": 5.1859757309074276e-05, "loss": 0.0125, "step": 3770 }, { "epoch": 2.6952636282394993, "eval_loss": 0.009488814510405064, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 3770 }, { "epoch": 2.69597855227882, "grad_norm": 0.0193592831492424, "learning_rate": 5.1834805411251275e-05, "loss": 0.011, "step": 3771 }, { "epoch": 2.696693476318141, "grad_norm": 0.0174420028924942, "learning_rate": 5.180985305586581e-05, "loss": 0.0101, "step": 3772 }, { "epoch": 2.697408400357462, "grad_norm": 0.01703731343150139, "learning_rate": 5.17849002491405e-05, "loss": 0.008, "step": 3773 }, { "epoch": 2.698123324396783, "grad_norm": 0.018321000039577484, "learning_rate": 5.175994699729806e-05, "loss": 0.0087, "step": 3774 }, { "epoch": 2.6988382484361035, "grad_norm": 0.01744784228503704, "learning_rate": 5.1734993306561294e-05, "loss": 0.0132, "step": 3775 }, { "epoch": 2.6988382484361035, "eval_loss": 0.00938432291150093, "eval_runtime": 4.6125, "eval_samples_per_second": 10.84, "eval_steps_per_second": 2.818, "step": 3775 }, { "epoch": 2.6995531724754245, "grad_norm": 0.015164489857852459, "learning_rate": 5.171003918315316e-05, "loss": 0.0079, "step": 3776 }, { "epoch": 2.7002680965147454, "grad_norm": 0.022572491317987442, "learning_rate": 5.1685084633296665e-05, "loss": 0.0215, "step": 3777 }, { "epoch": 2.700983020554066, "grad_norm": 0.019507506862282753, "learning_rate": 5.1660129663214996e-05, "loss": 0.0089, "step": 3778 }, { "epoch": 2.701697944593387, "grad_norm": 0.021243546158075333, "learning_rate": 5.163517427913139e-05, "loss": 0.0088, "step": 3779 }, { "epoch": 2.702412868632708, "grad_norm": 0.015979304909706116, "learning_rate": 5.161021848726919e-05, "loss": 0.0075, "step": 3780 }, { "epoch": 2.702412868632708, "eval_loss": 0.009254221804440022, "eval_runtime": 4.6081, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 3780 }, { "epoch": 2.7031277926720287, "grad_norm": 0.023356564342975616, "learning_rate": 5.158526229385187e-05, "loss": 0.0126, "step": 3781 }, { "epoch": 2.7038427167113497, "grad_norm": 0.03409131243824959, "learning_rate": 5.1560305705102984e-05, "loss": 0.0211, "step": 3782 }, { "epoch": 2.70455764075067, "grad_norm": 0.02345988154411316, "learning_rate": 5.153534872724618e-05, "loss": 0.0195, "step": 3783 }, { "epoch": 2.705272564789991, "grad_norm": 0.018983962014317513, "learning_rate": 5.151039136650521e-05, "loss": 0.0147, "step": 3784 }, { "epoch": 2.7059874888293116, "grad_norm": 0.016225313767790794, "learning_rate": 5.148543362910393e-05, "loss": 0.0101, "step": 3785 }, { "epoch": 2.7059874888293116, "eval_loss": 0.009084280580282211, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3785 }, { "epoch": 2.7067024128686326, 
"grad_norm": 0.019807737320661545, "learning_rate": 5.1460475521266296e-05, "loss": 0.0171, "step": 3786 }, { "epoch": 2.7074173369079535, "grad_norm": 0.02115590125322342, "learning_rate": 5.143551704921632e-05, "loss": 0.0155, "step": 3787 }, { "epoch": 2.7081322609472744, "grad_norm": 0.01960868202149868, "learning_rate": 5.141055821917814e-05, "loss": 0.0096, "step": 3788 }, { "epoch": 2.7088471849865954, "grad_norm": 0.018575910478830338, "learning_rate": 5.1385599037375954e-05, "loss": 0.0159, "step": 3789 }, { "epoch": 2.709562109025916, "grad_norm": 0.013721480034291744, "learning_rate": 5.136063951003409e-05, "loss": 0.0086, "step": 3790 }, { "epoch": 2.709562109025916, "eval_loss": 0.009154216386377811, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3790 }, { "epoch": 2.710277033065237, "grad_norm": 0.016338324174284935, "learning_rate": 5.133567964337693e-05, "loss": 0.0139, "step": 3791 }, { "epoch": 2.7109919571045578, "grad_norm": 0.016158048063516617, "learning_rate": 5.131071944362893e-05, "loss": 0.0112, "step": 3792 }, { "epoch": 2.7117068811438783, "grad_norm": 0.01940775290131569, "learning_rate": 5.128575891701467e-05, "loss": 0.0125, "step": 3793 }, { "epoch": 2.712421805183199, "grad_norm": 0.016048897057771683, "learning_rate": 5.126079806975878e-05, "loss": 0.0101, "step": 3794 }, { "epoch": 2.71313672922252, "grad_norm": 0.02251175232231617, "learning_rate": 5.123583690808596e-05, "loss": 0.0118, "step": 3795 }, { "epoch": 2.71313672922252, "eval_loss": 0.009209773503243923, "eval_runtime": 4.58, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 3795 }, { "epoch": 2.713851653261841, "grad_norm": 0.01891530491411686, "learning_rate": 5.1210875438221026e-05, "loss": 0.0094, "step": 3796 }, { "epoch": 2.714566577301162, "grad_norm": 0.01835547760128975, "learning_rate": 5.118591366638885e-05, "loss": 0.0118, "step": 3797 }, { "epoch": 2.7152815013404825, "grad_norm": 0.026020321995019913, "learning_rate": 5.116095159881438e-05, "loss": 0.0258, "step": 3798 }, { "epoch": 2.7159964253798035, "grad_norm": 0.016904795542359352, "learning_rate": 5.113598924172264e-05, "loss": 0.0123, "step": 3799 }, { "epoch": 2.716711349419124, "grad_norm": 0.021272439509630203, "learning_rate": 5.111102660133873e-05, "loss": 0.0095, "step": 3800 }, { "epoch": 2.716711349419124, "eval_loss": 0.009238913655281067, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3800 }, { "epoch": 2.717426273458445, "grad_norm": 0.016306433826684952, "learning_rate": 5.1086063683887785e-05, "loss": 0.0103, "step": 3801 }, { "epoch": 2.718141197497766, "grad_norm": 0.023634059354662895, "learning_rate": 5.106110049559507e-05, "loss": 0.0161, "step": 3802 }, { "epoch": 2.718856121537087, "grad_norm": 0.01425158604979515, "learning_rate": 5.1036137042685885e-05, "loss": 0.0091, "step": 3803 }, { "epoch": 2.7195710455764077, "grad_norm": 0.016604043543338776, "learning_rate": 5.101117333138558e-05, "loss": 0.0125, "step": 3804 }, { "epoch": 2.720285969615728, "grad_norm": 0.02499604970216751, "learning_rate": 5.0986209367919604e-05, "loss": 0.0117, "step": 3805 }, { "epoch": 2.720285969615728, "eval_loss": 0.009207794442772865, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 3805 }, { "epoch": 2.721000893655049, "grad_norm": 0.017966633662581444, "learning_rate": 5.096124515851344e-05, "loss": 0.0079, "step": 3806 }, { "epoch": 
2.72171581769437, "grad_norm": 0.015173396095633507, "learning_rate": 5.0936280709392656e-05, "loss": 0.0104, "step": 3807 }, { "epoch": 2.7224307417336906, "grad_norm": 0.017661824822425842, "learning_rate": 5.091131602678286e-05, "loss": 0.0078, "step": 3808 }, { "epoch": 2.7231456657730115, "grad_norm": 0.022825896739959717, "learning_rate": 5.088635111690974e-05, "loss": 0.0132, "step": 3809 }, { "epoch": 2.7238605898123325, "grad_norm": 0.018762629479169846, "learning_rate": 5.086138598599901e-05, "loss": 0.0128, "step": 3810 }, { "epoch": 2.7238605898123325, "eval_loss": 0.00916528794914484, "eval_runtime": 4.5927, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 3810 }, { "epoch": 2.7245755138516534, "grad_norm": 0.015971781685948372, "learning_rate": 5.083642064027646e-05, "loss": 0.0076, "step": 3811 }, { "epoch": 2.7252904378909744, "grad_norm": 0.015346592292189598, "learning_rate": 5.081145508596794e-05, "loss": 0.0115, "step": 3812 }, { "epoch": 2.726005361930295, "grad_norm": 0.0175656508654356, "learning_rate": 5.078648932929933e-05, "loss": 0.0081, "step": 3813 }, { "epoch": 2.726720285969616, "grad_norm": 0.015483231283724308, "learning_rate": 5.076152337649658e-05, "loss": 0.0112, "step": 3814 }, { "epoch": 2.7274352100089363, "grad_norm": 0.016186952590942383, "learning_rate": 5.073655723378569e-05, "loss": 0.0123, "step": 3815 }, { "epoch": 2.7274352100089363, "eval_loss": 0.009222869761288166, "eval_runtime": 4.5978, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.827, "step": 3815 }, { "epoch": 2.7281501340482572, "grad_norm": 0.02462827041745186, "learning_rate": 5.071159090739266e-05, "loss": 0.0189, "step": 3816 }, { "epoch": 2.728865058087578, "grad_norm": 0.02152990736067295, "learning_rate": 5.0686624403543615e-05, "loss": 0.0149, "step": 3817 }, { "epoch": 2.729579982126899, "grad_norm": 0.017898431047797203, "learning_rate": 5.066165772846467e-05, "loss": 0.01, "step": 3818 }, { "epoch": 2.73029490616622, "grad_norm": 0.018210161477327347, "learning_rate": 5.063669088838201e-05, "loss": 0.0117, "step": 3819 }, { "epoch": 2.7310098302055406, "grad_norm": 0.02066907100379467, "learning_rate": 5.061172388952183e-05, "loss": 0.0189, "step": 3820 }, { "epoch": 2.7310098302055406, "eval_loss": 0.0091583002358675, "eval_runtime": 4.6219, "eval_samples_per_second": 10.818, "eval_steps_per_second": 2.813, "step": 3820 }, { "epoch": 2.7317247542448615, "grad_norm": 0.02109644003212452, "learning_rate": 5.058675673811037e-05, "loss": 0.013, "step": 3821 }, { "epoch": 2.7324396782841824, "grad_norm": 0.01881418190896511, "learning_rate": 5.0561789440373965e-05, "loss": 0.0083, "step": 3822 }, { "epoch": 2.733154602323503, "grad_norm": 0.01607435941696167, "learning_rate": 5.05368220025389e-05, "loss": 0.0112, "step": 3823 }, { "epoch": 2.733869526362824, "grad_norm": 0.019893480464816093, "learning_rate": 5.051185443083156e-05, "loss": 0.0149, "step": 3824 }, { "epoch": 2.734584450402145, "grad_norm": 0.022337239235639572, "learning_rate": 5.048688673147831e-05, "loss": 0.0127, "step": 3825 }, { "epoch": 2.734584450402145, "eval_loss": 0.00910879671573639, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 3825 }, { "epoch": 2.7352993744414658, "grad_norm": 0.01689186692237854, "learning_rate": 5.046191891070562e-05, "loss": 0.0111, "step": 3826 }, { "epoch": 2.7360142984807863, "grad_norm": 0.017710736021399498, "learning_rate": 5.043695097473992e-05, "loss": 0.0084, "step": 3827 }, { 
"epoch": 2.736729222520107, "grad_norm": 0.023508157581090927, "learning_rate": 5.041198292980768e-05, "loss": 0.0093, "step": 3828 }, { "epoch": 2.737444146559428, "grad_norm": 0.019687924534082413, "learning_rate": 5.038701478213545e-05, "loss": 0.0105, "step": 3829 }, { "epoch": 2.7381590705987486, "grad_norm": 0.014612278901040554, "learning_rate": 5.0362046537949746e-05, "loss": 0.0089, "step": 3830 }, { "epoch": 2.7381590705987486, "eval_loss": 0.009276868775486946, "eval_runtime": 4.619, "eval_samples_per_second": 10.825, "eval_steps_per_second": 2.814, "step": 3830 }, { "epoch": 2.7388739946380696, "grad_norm": 0.024367311969399452, "learning_rate": 5.033707820347715e-05, "loss": 0.0083, "step": 3831 }, { "epoch": 2.7395889186773905, "grad_norm": 0.021317949518561363, "learning_rate": 5.0312109784944226e-05, "loss": 0.0165, "step": 3832 }, { "epoch": 2.7403038427167115, "grad_norm": 0.013961168937385082, "learning_rate": 5.028714128857758e-05, "loss": 0.0113, "step": 3833 }, { "epoch": 2.7410187667560324, "grad_norm": 0.03114381991326809, "learning_rate": 5.026217272060386e-05, "loss": 0.0195, "step": 3834 }, { "epoch": 2.741733690795353, "grad_norm": 0.0201139934360981, "learning_rate": 5.02372040872497e-05, "loss": 0.0115, "step": 3835 }, { "epoch": 2.741733690795353, "eval_loss": 0.009430834092199802, "eval_runtime": 4.5864, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 3835 }, { "epoch": 2.742448614834674, "grad_norm": 0.0220752265304327, "learning_rate": 5.0212235394741744e-05, "loss": 0.0167, "step": 3836 }, { "epoch": 2.7431635388739948, "grad_norm": 0.023988941684365273, "learning_rate": 5.018726664930667e-05, "loss": 0.0134, "step": 3837 }, { "epoch": 2.7438784629133153, "grad_norm": 0.02007012441754341, "learning_rate": 5.016229785717119e-05, "loss": 0.0112, "step": 3838 }, { "epoch": 2.744593386952636, "grad_norm": 0.017453709617257118, "learning_rate": 5.013732902456199e-05, "loss": 0.015, "step": 3839 }, { "epoch": 2.745308310991957, "grad_norm": 0.018901903182268143, "learning_rate": 5.011236015770576e-05, "loss": 0.0105, "step": 3840 }, { "epoch": 2.745308310991957, "eval_loss": 0.009229745715856552, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 3840 }, { "epoch": 2.746023235031278, "grad_norm": 0.016763960942626, "learning_rate": 5.008739126282924e-05, "loss": 0.0071, "step": 3841 }, { "epoch": 2.7467381590705986, "grad_norm": 0.02216479927301407, "learning_rate": 5.0062422346159154e-05, "loss": 0.0103, "step": 3842 }, { "epoch": 2.7474530831099195, "grad_norm": 0.011704271659255028, "learning_rate": 5.003745341392224e-05, "loss": 0.0063, "step": 3843 }, { "epoch": 2.7481680071492405, "grad_norm": 0.0210255216807127, "learning_rate": 5.001248447234521e-05, "loss": 0.0131, "step": 3844 }, { "epoch": 2.748882931188561, "grad_norm": 0.020625051110982895, "learning_rate": 4.99875155276548e-05, "loss": 0.0077, "step": 3845 }, { "epoch": 2.748882931188561, "eval_loss": 0.009265463799238205, "eval_runtime": 4.6041, "eval_samples_per_second": 10.86, "eval_steps_per_second": 2.824, "step": 3845 }, { "epoch": 2.749597855227882, "grad_norm": 0.023474426940083504, "learning_rate": 4.996254658607778e-05, "loss": 0.0112, "step": 3846 }, { "epoch": 2.750312779267203, "grad_norm": 0.018780337646603584, "learning_rate": 4.993757765384085e-05, "loss": 0.0137, "step": 3847 }, { "epoch": 2.751027703306524, "grad_norm": 0.01871374249458313, "learning_rate": 4.991260873717077e-05, "loss": 0.0075, "step": 
3848 }, { "epoch": 2.7517426273458447, "grad_norm": 0.020619196817278862, "learning_rate": 4.9887639842294245e-05, "loss": 0.0153, "step": 3849 }, { "epoch": 2.7524575513851652, "grad_norm": 0.01333692204207182, "learning_rate": 4.986267097543803e-05, "loss": 0.0087, "step": 3850 }, { "epoch": 2.7524575513851652, "eval_loss": 0.009401841089129448, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 3850 }, { "epoch": 2.753172475424486, "grad_norm": 0.018696842715144157, "learning_rate": 4.983770214282882e-05, "loss": 0.0153, "step": 3851 }, { "epoch": 2.753887399463807, "grad_norm": 0.01512330211699009, "learning_rate": 4.981273335069333e-05, "loss": 0.0112, "step": 3852 }, { "epoch": 2.7546023235031276, "grad_norm": 0.018348630517721176, "learning_rate": 4.9787764605258275e-05, "loss": 0.0119, "step": 3853 }, { "epoch": 2.7553172475424486, "grad_norm": 0.014307725243270397, "learning_rate": 4.976279591275033e-05, "loss": 0.0066, "step": 3854 }, { "epoch": 2.7560321715817695, "grad_norm": 0.019777216017246246, "learning_rate": 4.973782727939616e-05, "loss": 0.009, "step": 3855 }, { "epoch": 2.7560321715817695, "eval_loss": 0.009344739839434624, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 3855 }, { "epoch": 2.7567470956210904, "grad_norm": 0.017632482573390007, "learning_rate": 4.971285871142242e-05, "loss": 0.0092, "step": 3856 }, { "epoch": 2.757462019660411, "grad_norm": 0.01866850256919861, "learning_rate": 4.9687890215055785e-05, "loss": 0.0098, "step": 3857 }, { "epoch": 2.758176943699732, "grad_norm": 0.013521358370780945, "learning_rate": 4.9662921796522854e-05, "loss": 0.0085, "step": 3858 }, { "epoch": 2.758891867739053, "grad_norm": 0.020736491307616234, "learning_rate": 4.963795346205025e-05, "loss": 0.0119, "step": 3859 }, { "epoch": 2.7596067917783733, "grad_norm": 0.026105066761374474, "learning_rate": 4.961298521786456e-05, "loss": 0.0243, "step": 3860 }, { "epoch": 2.7596067917783733, "eval_loss": 0.009486103430390358, "eval_runtime": 4.6211, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.813, "step": 3860 }, { "epoch": 2.7603217158176943, "grad_norm": 0.02450053207576275, "learning_rate": 4.958801707019233e-05, "loss": 0.0108, "step": 3861 }, { "epoch": 2.761036639857015, "grad_norm": 0.01900411583483219, "learning_rate": 4.95630490252601e-05, "loss": 0.0133, "step": 3862 }, { "epoch": 2.761751563896336, "grad_norm": 0.01539779081940651, "learning_rate": 4.95380810892944e-05, "loss": 0.0126, "step": 3863 }, { "epoch": 2.762466487935657, "grad_norm": 0.02001391537487507, "learning_rate": 4.95131132685217e-05, "loss": 0.0137, "step": 3864 }, { "epoch": 2.7631814119749776, "grad_norm": 0.017679525539278984, "learning_rate": 4.948814556916845e-05, "loss": 0.0105, "step": 3865 }, { "epoch": 2.7631814119749776, "eval_loss": 0.009649134241044521, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3865 }, { "epoch": 2.7638963360142985, "grad_norm": 0.01784096285700798, "learning_rate": 4.946317799746111e-05, "loss": 0.0084, "step": 3866 }, { "epoch": 2.7646112600536195, "grad_norm": 0.01578163355588913, "learning_rate": 4.9438210559626047e-05, "loss": 0.0096, "step": 3867 }, { "epoch": 2.76532618409294, "grad_norm": 0.023700175806879997, "learning_rate": 4.941324326188963e-05, "loss": 0.0134, "step": 3868 }, { "epoch": 2.766041108132261, "grad_norm": 0.011371271684765816, "learning_rate": 4.938827611047818e-05, "loss": 
0.005, "step": 3869 }, { "epoch": 2.766756032171582, "grad_norm": 0.024858340620994568, "learning_rate": 4.9363309111618e-05, "loss": 0.016, "step": 3870 }, { "epoch": 2.766756032171582, "eval_loss": 0.0094479164108634, "eval_runtime": 4.5783, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 3870 }, { "epoch": 2.7674709562109028, "grad_norm": 0.01648438535630703, "learning_rate": 4.9338342271535334e-05, "loss": 0.0098, "step": 3871 }, { "epoch": 2.7681858802502233, "grad_norm": 0.015594891272485256, "learning_rate": 4.9313375596456396e-05, "loss": 0.0068, "step": 3872 }, { "epoch": 2.768900804289544, "grad_norm": 0.017993133515119553, "learning_rate": 4.9288409092607344e-05, "loss": 0.0142, "step": 3873 }, { "epoch": 2.769615728328865, "grad_norm": 0.019615668803453445, "learning_rate": 4.926344276621434e-05, "loss": 0.0105, "step": 3874 }, { "epoch": 2.7703306523681857, "grad_norm": 0.020018698647618294, "learning_rate": 4.923847662350344e-05, "loss": 0.0085, "step": 3875 }, { "epoch": 2.7703306523681857, "eval_loss": 0.009403793141245842, "eval_runtime": 4.6044, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 3875 }, { "epoch": 2.7710455764075066, "grad_norm": 0.021913139149546623, "learning_rate": 4.921351067070068e-05, "loss": 0.0148, "step": 3876 }, { "epoch": 2.7717605004468275, "grad_norm": 0.020335828885436058, "learning_rate": 4.918854491403207e-05, "loss": 0.0095, "step": 3877 }, { "epoch": 2.7724754244861485, "grad_norm": 0.016397904604673386, "learning_rate": 4.9163579359723536e-05, "loss": 0.009, "step": 3878 }, { "epoch": 2.7731903485254694, "grad_norm": 0.020344069227576256, "learning_rate": 4.9138614014000996e-05, "loss": 0.0098, "step": 3879 }, { "epoch": 2.77390527256479, "grad_norm": 0.022363528609275818, "learning_rate": 4.911364888309027e-05, "loss": 0.0111, "step": 3880 }, { "epoch": 2.77390527256479, "eval_loss": 0.00936218537390232, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 3880 }, { "epoch": 2.774620196604111, "grad_norm": 0.019262345507740974, "learning_rate": 4.908868397321714e-05, "loss": 0.0152, "step": 3881 }, { "epoch": 2.775335120643432, "grad_norm": 0.025166504085063934, "learning_rate": 4.906371929060734e-05, "loss": 0.0162, "step": 3882 }, { "epoch": 2.7760500446827523, "grad_norm": 0.0184169989079237, "learning_rate": 4.9038754841486565e-05, "loss": 0.0152, "step": 3883 }, { "epoch": 2.7767649687220732, "grad_norm": 0.018190966919064522, "learning_rate": 4.901379063208041e-05, "loss": 0.009, "step": 3884 }, { "epoch": 2.777479892761394, "grad_norm": 0.019668204709887505, "learning_rate": 4.8988826668614434e-05, "loss": 0.0146, "step": 3885 }, { "epoch": 2.777479892761394, "eval_loss": 0.0094678346067667, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 3885 }, { "epoch": 2.778194816800715, "grad_norm": 0.016313988715410233, "learning_rate": 4.8963862957314126e-05, "loss": 0.0169, "step": 3886 }, { "epoch": 2.7789097408400356, "grad_norm": 0.01973392628133297, "learning_rate": 4.893889950440494e-05, "loss": 0.0101, "step": 3887 }, { "epoch": 2.7796246648793566, "grad_norm": 0.02039514109492302, "learning_rate": 4.891393631611223e-05, "loss": 0.0087, "step": 3888 }, { "epoch": 2.7803395889186775, "grad_norm": 0.022421594709157944, "learning_rate": 4.8888973398661286e-05, "loss": 0.0099, "step": 3889 }, { "epoch": 2.781054512957998, "grad_norm": 0.014119441621005535, "learning_rate": 
4.8864010758277365e-05, "loss": 0.007, "step": 3890 }, { "epoch": 2.781054512957998, "eval_loss": 0.009443639777600765, "eval_runtime": 4.609, "eval_samples_per_second": 10.848, "eval_steps_per_second": 2.821, "step": 3890 }, { "epoch": 2.781769436997319, "grad_norm": 0.020765384659171104, "learning_rate": 4.8839048401185625e-05, "loss": 0.015, "step": 3891 }, { "epoch": 2.78248436103664, "grad_norm": 0.019747484475374222, "learning_rate": 4.881408633361116e-05, "loss": 0.0161, "step": 3892 }, { "epoch": 2.783199285075961, "grad_norm": 0.01692448928952217, "learning_rate": 4.878912456177898e-05, "loss": 0.0076, "step": 3893 }, { "epoch": 2.7839142091152818, "grad_norm": 0.018859948962926865, "learning_rate": 4.8764163091914054e-05, "loss": 0.0096, "step": 3894 }, { "epoch": 2.7846291331546023, "grad_norm": 0.01720239408314228, "learning_rate": 4.8739201930241254e-05, "loss": 0.0129, "step": 3895 }, { "epoch": 2.7846291331546023, "eval_loss": 0.009473703801631927, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 3895 }, { "epoch": 2.785344057193923, "grad_norm": 0.026363002136349678, "learning_rate": 4.871424108298536e-05, "loss": 0.0149, "step": 3896 }, { "epoch": 2.7860589812332437, "grad_norm": 0.016414251178503036, "learning_rate": 4.8689280556371084e-05, "loss": 0.0118, "step": 3897 }, { "epoch": 2.7867739052725646, "grad_norm": 0.020446671172976494, "learning_rate": 4.866432035662309e-05, "loss": 0.0093, "step": 3898 }, { "epoch": 2.7874888293118856, "grad_norm": 0.01609877124428749, "learning_rate": 4.863936048996593e-05, "loss": 0.0085, "step": 3899 }, { "epoch": 2.7882037533512065, "grad_norm": 0.0184469074010849, "learning_rate": 4.8614400962624044e-05, "loss": 0.0118, "step": 3900 }, { "epoch": 2.7882037533512065, "eval_loss": 0.0095703499391675, "eval_runtime": 4.5756, "eval_samples_per_second": 10.927, "eval_steps_per_second": 2.841, "step": 3900 }, { "epoch": 2.7889186773905275, "grad_norm": 0.027004214003682137, "learning_rate": 4.858944178082188e-05, "loss": 0.0167, "step": 3901 }, { "epoch": 2.789633601429848, "grad_norm": 0.0173471849411726, "learning_rate": 4.8564482950783685e-05, "loss": 0.0139, "step": 3902 }, { "epoch": 2.790348525469169, "grad_norm": 0.01607557386159897, "learning_rate": 4.853952447873371e-05, "loss": 0.0104, "step": 3903 }, { "epoch": 2.79106344950849, "grad_norm": 0.018609020859003067, "learning_rate": 4.851456637089607e-05, "loss": 0.0099, "step": 3904 }, { "epoch": 2.7917783735478103, "grad_norm": 0.020678216591477394, "learning_rate": 4.84896086334948e-05, "loss": 0.0092, "step": 3905 }, { "epoch": 2.7917783735478103, "eval_loss": 0.009670381434261799, "eval_runtime": 4.6338, "eval_samples_per_second": 10.79, "eval_steps_per_second": 2.805, "step": 3905 }, { "epoch": 2.7924932975871313, "grad_norm": 0.019118472933769226, "learning_rate": 4.846465127275383e-05, "loss": 0.0129, "step": 3906 }, { "epoch": 2.793208221626452, "grad_norm": 0.018458571285009384, "learning_rate": 4.843969429489703e-05, "loss": 0.0094, "step": 3907 }, { "epoch": 2.793923145665773, "grad_norm": 0.020388389006257057, "learning_rate": 4.841473770614814e-05, "loss": 0.0147, "step": 3908 }, { "epoch": 2.794638069705094, "grad_norm": 0.0159930232912302, "learning_rate": 4.8389781512730825e-05, "loss": 0.0078, "step": 3909 }, { "epoch": 2.7953529937444146, "grad_norm": 0.017404334619641304, "learning_rate": 4.836482572086862e-05, "loss": 0.0075, "step": 3910 }, { "epoch": 2.7953529937444146, "eval_loss": 0.009795456193387508, 
"eval_runtime": 4.5921, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 3910 }, { "epoch": 2.7960679177837355, "grad_norm": 0.014033527113497257, "learning_rate": 4.833987033678501e-05, "loss": 0.007, "step": 3911 }, { "epoch": 2.796782841823056, "grad_norm": 0.019141362980008125, "learning_rate": 4.831491536670334e-05, "loss": 0.0174, "step": 3912 }, { "epoch": 2.797497765862377, "grad_norm": 0.015530292876064777, "learning_rate": 4.828996081684685e-05, "loss": 0.01, "step": 3913 }, { "epoch": 2.798212689901698, "grad_norm": 0.02018721029162407, "learning_rate": 4.826500669343871e-05, "loss": 0.0139, "step": 3914 }, { "epoch": 2.798927613941019, "grad_norm": 0.01937420293688774, "learning_rate": 4.824005300270196e-05, "loss": 0.0121, "step": 3915 }, { "epoch": 2.798927613941019, "eval_loss": 0.009790102951228619, "eval_runtime": 4.6126, "eval_samples_per_second": 10.84, "eval_steps_per_second": 2.818, "step": 3915 }, { "epoch": 2.79964253798034, "grad_norm": 0.017918424680829048, "learning_rate": 4.8215099750859514e-05, "loss": 0.01, "step": 3916 }, { "epoch": 2.8003574620196603, "grad_norm": 0.020256198942661285, "learning_rate": 4.819014694413419e-05, "loss": 0.0149, "step": 3917 }, { "epoch": 2.8010723860589812, "grad_norm": 0.018786298111081123, "learning_rate": 4.8165194588748744e-05, "loss": 0.0084, "step": 3918 }, { "epoch": 2.801787310098302, "grad_norm": 0.021834764629602432, "learning_rate": 4.814024269092574e-05, "loss": 0.016, "step": 3919 }, { "epoch": 2.8025022341376227, "grad_norm": 0.019045671448111534, "learning_rate": 4.8115291256887677e-05, "loss": 0.0075, "step": 3920 }, { "epoch": 2.8025022341376227, "eval_loss": 0.009703021496534348, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 3920 }, { "epoch": 2.8032171581769436, "grad_norm": 0.01590283028781414, "learning_rate": 4.809034029285691e-05, "loss": 0.0068, "step": 3921 }, { "epoch": 2.8039320822162646, "grad_norm": 0.020855870097875595, "learning_rate": 4.806538980505572e-05, "loss": 0.0139, "step": 3922 }, { "epoch": 2.8046470062555855, "grad_norm": 0.02462116815149784, "learning_rate": 4.80404397997062e-05, "loss": 0.0194, "step": 3923 }, { "epoch": 2.8053619302949064, "grad_norm": 0.01930852234363556, "learning_rate": 4.8015490283030415e-05, "loss": 0.01, "step": 3924 }, { "epoch": 2.806076854334227, "grad_norm": 0.018476784229278564, "learning_rate": 4.7990541261250236e-05, "loss": 0.0104, "step": 3925 }, { "epoch": 2.806076854334227, "eval_loss": 0.009669611230492592, "eval_runtime": 4.607, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 3925 }, { "epoch": 2.806791778373548, "grad_norm": 0.01589636504650116, "learning_rate": 4.796559274058741e-05, "loss": 0.0072, "step": 3926 }, { "epoch": 2.8075067024128684, "grad_norm": 0.018024330958724022, "learning_rate": 4.794064472726362e-05, "loss": 0.0091, "step": 3927 }, { "epoch": 2.8082216264521893, "grad_norm": 0.027901574969291687, "learning_rate": 4.791569722750036e-05, "loss": 0.023, "step": 3928 }, { "epoch": 2.8089365504915103, "grad_norm": 0.01645221747457981, "learning_rate": 4.789075024751903e-05, "loss": 0.0092, "step": 3929 }, { "epoch": 2.809651474530831, "grad_norm": 0.016241826117038727, "learning_rate": 4.7865803793540865e-05, "loss": 0.0081, "step": 3930 }, { "epoch": 2.809651474530831, "eval_loss": 0.0096663236618042, "eval_runtime": 4.5983, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 3930 }, { "epoch": 
2.810366398570152, "grad_norm": 0.01822345331311226, "learning_rate": 4.7840857871787025e-05, "loss": 0.0109, "step": 3931 }, { "epoch": 2.8110813226094726, "grad_norm": 0.019138872623443604, "learning_rate": 4.78159124884785e-05, "loss": 0.0148, "step": 3932 }, { "epoch": 2.8117962466487936, "grad_norm": 0.02128869667649269, "learning_rate": 4.7790967649836135e-05, "loss": 0.0103, "step": 3933 }, { "epoch": 2.8125111706881145, "grad_norm": 0.021117407828569412, "learning_rate": 4.7766023362080665e-05, "loss": 0.0131, "step": 3934 }, { "epoch": 2.813226094727435, "grad_norm": 0.021429071202874184, "learning_rate": 4.774107963143268e-05, "loss": 0.0091, "step": 3935 }, { "epoch": 2.813226094727435, "eval_loss": 0.009765686467289925, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3935 }, { "epoch": 2.813941018766756, "grad_norm": 0.02028771862387657, "learning_rate": 4.7716136464112636e-05, "loss": 0.0135, "step": 3936 }, { "epoch": 2.814655942806077, "grad_norm": 0.017489800229668617, "learning_rate": 4.769119386634083e-05, "loss": 0.01, "step": 3937 }, { "epoch": 2.815370866845398, "grad_norm": 0.018108589574694633, "learning_rate": 4.766625184433743e-05, "loss": 0.0111, "step": 3938 }, { "epoch": 2.8160857908847188, "grad_norm": 0.021554987877607346, "learning_rate": 4.764131040432248e-05, "loss": 0.012, "step": 3939 }, { "epoch": 2.8168007149240393, "grad_norm": 0.02217131480574608, "learning_rate": 4.761636955251583e-05, "loss": 0.013, "step": 3940 }, { "epoch": 2.8168007149240393, "eval_loss": 0.009718680754303932, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 3940 }, { "epoch": 2.81751563896336, "grad_norm": 0.020759014412760735, "learning_rate": 4.759142929513722e-05, "loss": 0.0111, "step": 3941 }, { "epoch": 2.8182305630026807, "grad_norm": 0.021256454288959503, "learning_rate": 4.756648963840624e-05, "loss": 0.0193, "step": 3942 }, { "epoch": 2.8189454870420017, "grad_norm": 0.020557068288326263, "learning_rate": 4.7541550588542326e-05, "loss": 0.01, "step": 3943 }, { "epoch": 2.8196604110813226, "grad_norm": 0.018082087859511375, "learning_rate": 4.751661215176476e-05, "loss": 0.0138, "step": 3944 }, { "epoch": 2.8203753351206435, "grad_norm": 0.016153227537870407, "learning_rate": 4.749167433429264e-05, "loss": 0.0109, "step": 3945 }, { "epoch": 2.8203753351206435, "eval_loss": 0.00949089601635933, "eval_runtime": 4.6043, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 3945 }, { "epoch": 2.8210902591599645, "grad_norm": 0.01982036605477333, "learning_rate": 4.7466737142344976e-05, "loss": 0.0089, "step": 3946 }, { "epoch": 2.821805183199285, "grad_norm": 0.016982708126306534, "learning_rate": 4.7441800582140595e-05, "loss": 0.0083, "step": 3947 }, { "epoch": 2.822520107238606, "grad_norm": 0.01850713975727558, "learning_rate": 4.741686465989813e-05, "loss": 0.0119, "step": 3948 }, { "epoch": 2.823235031277927, "grad_norm": 0.017909707501530647, "learning_rate": 4.739192938183611e-05, "loss": 0.0134, "step": 3949 }, { "epoch": 2.8239499553172474, "grad_norm": 0.022292902693152428, "learning_rate": 4.7366994754172846e-05, "loss": 0.0151, "step": 3950 }, { "epoch": 2.8239499553172474, "eval_loss": 0.00931947585195303, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3950 }, { "epoch": 2.8246648793565683, "grad_norm": 0.01673286221921444, "learning_rate": 4.734206078312654e-05, "loss": 0.0101, "step": 
3951 }, { "epoch": 2.8253798033958892, "grad_norm": 0.02098608948290348, "learning_rate": 4.731712747491521e-05, "loss": 0.0136, "step": 3952 }, { "epoch": 2.82609472743521, "grad_norm": 0.019623465836048126, "learning_rate": 4.729219483575669e-05, "loss": 0.011, "step": 3953 }, { "epoch": 2.8268096514745307, "grad_norm": 0.020740611478686333, "learning_rate": 4.7267262871868655e-05, "loss": 0.0133, "step": 3954 }, { "epoch": 2.8275245755138516, "grad_norm": 0.018707724288105965, "learning_rate": 4.724233158946866e-05, "loss": 0.0083, "step": 3955 }, { "epoch": 2.8275245755138516, "eval_loss": 0.009461352601647377, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 3955 }, { "epoch": 2.8282394995531726, "grad_norm": 0.02096658945083618, "learning_rate": 4.721740099477402e-05, "loss": 0.0139, "step": 3956 }, { "epoch": 2.828954423592493, "grad_norm": 0.02067185379564762, "learning_rate": 4.7192471094001914e-05, "loss": 0.0119, "step": 3957 }, { "epoch": 2.829669347631814, "grad_norm": 0.01710454188287258, "learning_rate": 4.716754189336933e-05, "loss": 0.0121, "step": 3958 }, { "epoch": 2.830384271671135, "grad_norm": 0.022055430337786674, "learning_rate": 4.7142613399093126e-05, "loss": 0.0118, "step": 3959 }, { "epoch": 2.831099195710456, "grad_norm": 0.023539027199149132, "learning_rate": 4.711768561738993e-05, "loss": 0.0151, "step": 3960 }, { "epoch": 2.831099195710456, "eval_loss": 0.009485156275331974, "eval_runtime": 4.6436, "eval_samples_per_second": 10.768, "eval_steps_per_second": 2.8, "step": 3960 }, { "epoch": 2.831814119749777, "grad_norm": 0.020301828160881996, "learning_rate": 4.709275855447621e-05, "loss": 0.0066, "step": 3961 }, { "epoch": 2.8325290437890973, "grad_norm": 0.0141847999766469, "learning_rate": 4.7067832216568284e-05, "loss": 0.0074, "step": 3962 }, { "epoch": 2.8332439678284183, "grad_norm": 0.023888060823082924, "learning_rate": 4.7042906609882234e-05, "loss": 0.0142, "step": 3963 }, { "epoch": 2.833958891867739, "grad_norm": 0.02042084001004696, "learning_rate": 4.701798174063401e-05, "loss": 0.0113, "step": 3964 }, { "epoch": 2.8346738159070597, "grad_norm": 0.014445292763411999, "learning_rate": 4.699305761503934e-05, "loss": 0.007, "step": 3965 }, { "epoch": 2.8346738159070597, "eval_loss": 0.009535209275782108, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 3965 }, { "epoch": 2.8353887399463806, "grad_norm": 0.01620856113731861, "learning_rate": 4.696813423931381e-05, "loss": 0.0082, "step": 3966 }, { "epoch": 2.8361036639857016, "grad_norm": 0.016529187560081482, "learning_rate": 4.6943211619672755e-05, "loss": 0.0079, "step": 3967 }, { "epoch": 2.8368185880250225, "grad_norm": 0.023070139810442924, "learning_rate": 4.6918289762331405e-05, "loss": 0.0156, "step": 3968 }, { "epoch": 2.837533512064343, "grad_norm": 0.022031549364328384, "learning_rate": 4.6893368673504736e-05, "loss": 0.0157, "step": 3969 }, { "epoch": 2.838248436103664, "grad_norm": 0.02814461849629879, "learning_rate": 4.6868448359407536e-05, "loss": 0.0093, "step": 3970 }, { "epoch": 2.838248436103664, "eval_loss": 0.009302706457674503, "eval_runtime": 4.5873, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 3970 }, { "epoch": 2.838963360142985, "grad_norm": 0.019791603088378906, "learning_rate": 4.684352882625444e-05, "loss": 0.0122, "step": 3971 }, { "epoch": 2.8396782841823054, "grad_norm": 0.01631128042936325, "learning_rate": 4.681861008025985e-05, "loss": 
0.0098, "step": 3972 }, { "epoch": 2.8403932082216263, "grad_norm": 0.018682342022657394, "learning_rate": 4.679369212763799e-05, "loss": 0.0078, "step": 3973 }, { "epoch": 2.8411081322609473, "grad_norm": 0.018916264176368713, "learning_rate": 4.676877497460286e-05, "loss": 0.008, "step": 3974 }, { "epoch": 2.841823056300268, "grad_norm": 0.017629079520702362, "learning_rate": 4.674385862736831e-05, "loss": 0.0106, "step": 3975 }, { "epoch": 2.841823056300268, "eval_loss": 0.00922340713441372, "eval_runtime": 4.5783, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 3975 }, { "epoch": 2.842537980339589, "grad_norm": 0.015501925721764565, "learning_rate": 4.671894309214796e-05, "loss": 0.0084, "step": 3976 }, { "epoch": 2.8432529043789097, "grad_norm": 0.019562946632504463, "learning_rate": 4.669402837515521e-05, "loss": 0.0104, "step": 3977 }, { "epoch": 2.8439678284182306, "grad_norm": 0.016538580879569054, "learning_rate": 4.666911448260327e-05, "loss": 0.0081, "step": 3978 }, { "epoch": 2.8446827524575515, "grad_norm": 0.016803381964564323, "learning_rate": 4.664420142070518e-05, "loss": 0.0097, "step": 3979 }, { "epoch": 2.845397676496872, "grad_norm": 0.02262107841670513, "learning_rate": 4.661928919567371e-05, "loss": 0.0155, "step": 3980 }, { "epoch": 2.845397676496872, "eval_loss": 0.009256232529878616, "eval_runtime": 4.5786, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 3980 }, { "epoch": 2.846112600536193, "grad_norm": 0.01421007513999939, "learning_rate": 4.6594377813721465e-05, "loss": 0.0068, "step": 3981 }, { "epoch": 2.846827524575514, "grad_norm": 0.025246387347579002, "learning_rate": 4.656946728106083e-05, "loss": 0.0162, "step": 3982 }, { "epoch": 2.847542448614835, "grad_norm": 0.020132659003138542, "learning_rate": 4.6544557603903975e-05, "loss": 0.0133, "step": 3983 }, { "epoch": 2.8482573726541554, "grad_norm": 0.019619682803750038, "learning_rate": 4.6519648788462854e-05, "loss": 0.0109, "step": 3984 }, { "epoch": 2.8489722966934763, "grad_norm": 0.014775858260691166, "learning_rate": 4.64947408409492e-05, "loss": 0.0061, "step": 3985 }, { "epoch": 2.8489722966934763, "eval_loss": 0.009221899323165417, "eval_runtime": 4.5844, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 3985 }, { "epoch": 2.8496872207327972, "grad_norm": 0.020264621824026108, "learning_rate": 4.6469833767574566e-05, "loss": 0.0146, "step": 3986 }, { "epoch": 2.8504021447721177, "grad_norm": 0.030836738646030426, "learning_rate": 4.644492757455025e-05, "loss": 0.0113, "step": 3987 }, { "epoch": 2.8511170688114387, "grad_norm": 0.01818966120481491, "learning_rate": 4.6420022268087317e-05, "loss": 0.0094, "step": 3988 }, { "epoch": 2.8518319928507596, "grad_norm": 0.030421510338783264, "learning_rate": 4.6395117854396644e-05, "loss": 0.0116, "step": 3989 }, { "epoch": 2.8525469168900806, "grad_norm": 0.018771788105368614, "learning_rate": 4.637021433968889e-05, "loss": 0.0089, "step": 3990 }, { "epoch": 2.8525469168900806, "eval_loss": 0.009191556833684444, "eval_runtime": 4.5794, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 3990 }, { "epoch": 2.8532618409294015, "grad_norm": 0.026379764080047607, "learning_rate": 4.6345311730174474e-05, "loss": 0.0166, "step": 3991 }, { "epoch": 2.853976764968722, "grad_norm": 0.019520262256264687, "learning_rate": 4.632041003206359e-05, "loss": 0.014, "step": 3992 }, { "epoch": 2.854691689008043, "grad_norm": 0.01923801749944687, "learning_rate": 
4.62955092515662e-05, "loss": 0.0088, "step": 3993 }, { "epoch": 2.855406613047364, "grad_norm": 0.019235344603657722, "learning_rate": 4.6270609394892034e-05, "loss": 0.0077, "step": 3994 }, { "epoch": 2.8561215370866844, "grad_norm": 0.026581231504678726, "learning_rate": 4.6245710468250626e-05, "loss": 0.0196, "step": 3995 }, { "epoch": 2.8561215370866844, "eval_loss": 0.009182777255773544, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 3995 }, { "epoch": 2.8568364611260053, "grad_norm": 0.026729408651590347, "learning_rate": 4.622081247785123e-05, "loss": 0.0117, "step": 3996 }, { "epoch": 2.8575513851653263, "grad_norm": 0.018554693087935448, "learning_rate": 4.6195915429902896e-05, "loss": 0.0145, "step": 3997 }, { "epoch": 2.858266309204647, "grad_norm": 0.014894416555762291, "learning_rate": 4.617101933061442e-05, "loss": 0.0079, "step": 3998 }, { "epoch": 2.8589812332439677, "grad_norm": 0.015185097232460976, "learning_rate": 4.6146124186194404e-05, "loss": 0.0081, "step": 3999 }, { "epoch": 2.8596961572832886, "grad_norm": 0.019466277211904526, "learning_rate": 4.612123000285116e-05, "loss": 0.0142, "step": 4000 }, { "epoch": 2.8596961572832886, "eval_loss": 0.00924843642860651, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4000 }, { "epoch": 2.8604110813226096, "grad_norm": 0.021970732137560844, "learning_rate": 4.609633678679278e-05, "loss": 0.014, "step": 4001 }, { "epoch": 2.86112600536193, "grad_norm": 0.01788434199988842, "learning_rate": 4.607144454422711e-05, "loss": 0.0065, "step": 4002 }, { "epoch": 2.861840929401251, "grad_norm": 0.01468585804104805, "learning_rate": 4.604655328136178e-05, "loss": 0.0093, "step": 4003 }, { "epoch": 2.862555853440572, "grad_norm": 0.015341033227741718, "learning_rate": 4.602166300440414e-05, "loss": 0.0101, "step": 4004 }, { "epoch": 2.863270777479893, "grad_norm": 0.019973192363977432, "learning_rate": 4.599677371956132e-05, "loss": 0.0141, "step": 4005 }, { "epoch": 2.863270777479893, "eval_loss": 0.009161983616650105, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 4005 }, { "epoch": 2.863985701519214, "grad_norm": 0.01703028567135334, "learning_rate": 4.597188543304017e-05, "loss": 0.0102, "step": 4006 }, { "epoch": 2.8647006255585343, "grad_norm": 0.018129665404558182, "learning_rate": 4.594699815104735e-05, "loss": 0.0105, "step": 4007 }, { "epoch": 2.8654155495978553, "grad_norm": 0.022998662665486336, "learning_rate": 4.59221118797892e-05, "loss": 0.0117, "step": 4008 }, { "epoch": 2.866130473637176, "grad_norm": 0.015768520534038544, "learning_rate": 4.5897226625471845e-05, "loss": 0.0072, "step": 4009 }, { "epoch": 2.8668453976764967, "grad_norm": 0.020583709701895714, "learning_rate": 4.587234239430117e-05, "loss": 0.0124, "step": 4010 }, { "epoch": 2.8668453976764967, "eval_loss": 0.009271523915231228, "eval_runtime": 4.5904, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 4010 }, { "epoch": 2.8675603217158177, "grad_norm": 0.026273876428604126, "learning_rate": 4.584745919248275e-05, "loss": 0.0207, "step": 4011 }, { "epoch": 2.8682752457551386, "grad_norm": 0.020790856331586838, "learning_rate": 4.582257702622199e-05, "loss": 0.0104, "step": 4012 }, { "epoch": 2.8689901697944595, "grad_norm": 0.015834184363484383, "learning_rate": 4.5797695901723964e-05, "loss": 0.0109, "step": 4013 }, { "epoch": 2.86970509383378, "grad_norm": 
0.021269768476486206, "learning_rate": 4.57728158251935e-05, "loss": 0.0135, "step": 4014 }, { "epoch": 2.870420017873101, "grad_norm": 0.025934303179383278, "learning_rate": 4.5747936802835176e-05, "loss": 0.014, "step": 4015 }, { "epoch": 2.870420017873101, "eval_loss": 0.009421600960195065, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4015 }, { "epoch": 2.871134941912422, "grad_norm": 0.021600810810923576, "learning_rate": 4.5723058840853315e-05, "loss": 0.01, "step": 4016 }, { "epoch": 2.8718498659517424, "grad_norm": 0.025763381272554398, "learning_rate": 4.569818194545196e-05, "loss": 0.0098, "step": 4017 }, { "epoch": 2.8725647899910633, "grad_norm": 0.020147422328591347, "learning_rate": 4.567330612283488e-05, "loss": 0.0087, "step": 4018 }, { "epoch": 2.8732797140303843, "grad_norm": 0.019808495417237282, "learning_rate": 4.564843137920561e-05, "loss": 0.0084, "step": 4019 }, { "epoch": 2.8739946380697052, "grad_norm": 0.0213484987616539, "learning_rate": 4.56235577207674e-05, "loss": 0.0105, "step": 4020 }, { "epoch": 2.8739946380697052, "eval_loss": 0.009377069771289825, "eval_runtime": 4.654, "eval_samples_per_second": 10.744, "eval_steps_per_second": 2.793, "step": 4020 }, { "epoch": 2.874709562109026, "grad_norm": 0.019090251997113228, "learning_rate": 4.5598685153723205e-05, "loss": 0.0111, "step": 4021 }, { "epoch": 2.8754244861483467, "grad_norm": 0.028119701892137527, "learning_rate": 4.5573813684275725e-05, "loss": 0.0155, "step": 4022 }, { "epoch": 2.8761394101876676, "grad_norm": 0.026336459442973137, "learning_rate": 4.554894331862741e-05, "loss": 0.0135, "step": 4023 }, { "epoch": 2.876854334226988, "grad_norm": 0.018873970955610275, "learning_rate": 4.5524074062980405e-05, "loss": 0.0113, "step": 4024 }, { "epoch": 2.877569258266309, "grad_norm": 0.02051805891096592, "learning_rate": 4.5499205923536584e-05, "loss": 0.0095, "step": 4025 }, { "epoch": 2.877569258266309, "eval_loss": 0.00931282714009285, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 4025 }, { "epoch": 2.87828418230563, "grad_norm": 0.018822936341166496, "learning_rate": 4.5474338906497537e-05, "loss": 0.01, "step": 4026 }, { "epoch": 2.878999106344951, "grad_norm": 0.01443392038345337, "learning_rate": 4.54494730180646e-05, "loss": 0.0064, "step": 4027 }, { "epoch": 2.879714030384272, "grad_norm": 0.01514608133584261, "learning_rate": 4.5424608264438815e-05, "loss": 0.0098, "step": 4028 }, { "epoch": 2.8804289544235924, "grad_norm": 0.020645510405302048, "learning_rate": 4.5399744651820915e-05, "loss": 0.0104, "step": 4029 }, { "epoch": 2.8811438784629133, "grad_norm": 0.015199822373688221, "learning_rate": 4.5374882186411375e-05, "loss": 0.0071, "step": 4030 }, { "epoch": 2.8811438784629133, "eval_loss": 0.009355227462947369, "eval_runtime": 4.5779, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 4030 }, { "epoch": 2.8818588025022343, "grad_norm": 0.02483302168548107, "learning_rate": 4.5350020874410386e-05, "loss": 0.0134, "step": 4031 }, { "epoch": 2.8825737265415547, "grad_norm": 0.018640130758285522, "learning_rate": 4.5325160722017845e-05, "loss": 0.0096, "step": 4032 }, { "epoch": 2.8832886505808757, "grad_norm": 0.022942136973142624, "learning_rate": 4.5300301735433334e-05, "loss": 0.0156, "step": 4033 }, { "epoch": 2.8840035746201966, "grad_norm": 0.019569532945752144, "learning_rate": 4.5275443920856213e-05, "loss": 0.0141, "step": 4034 }, { "epoch": 
2.8847184986595176, "grad_norm": 0.018083246424794197, "learning_rate": 4.5250587284485476e-05, "loss": 0.0119, "step": 4035 }, { "epoch": 2.8847184986595176, "eval_loss": 0.009184056892991066, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 4035 }, { "epoch": 2.8854334226988385, "grad_norm": 0.01530284620821476, "learning_rate": 4.5225731832519866e-05, "loss": 0.011, "step": 4036 }, { "epoch": 2.886148346738159, "grad_norm": 0.011465176939964294, "learning_rate": 4.520087757115781e-05, "loss": 0.0054, "step": 4037 }, { "epoch": 2.88686327077748, "grad_norm": 0.023071978241205215, "learning_rate": 4.517602450659746e-05, "loss": 0.0238, "step": 4038 }, { "epoch": 2.8875781948168004, "grad_norm": 0.022295814007520676, "learning_rate": 4.515117264503662e-05, "loss": 0.023, "step": 4039 }, { "epoch": 2.8882931188561214, "grad_norm": 0.024720653891563416, "learning_rate": 4.512632199267286e-05, "loss": 0.013, "step": 4040 }, { "epoch": 2.8882931188561214, "eval_loss": 0.009322117082774639, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 4040 }, { "epoch": 2.8890080428954423, "grad_norm": 0.019516320899128914, "learning_rate": 4.5101472555703424e-05, "loss": 0.0162, "step": 4041 }, { "epoch": 2.8897229669347633, "grad_norm": 0.017681272700428963, "learning_rate": 4.507662434032522e-05, "loss": 0.0094, "step": 4042 }, { "epoch": 2.890437890974084, "grad_norm": 0.016746263951063156, "learning_rate": 4.505177735273489e-05, "loss": 0.0087, "step": 4043 }, { "epoch": 2.8911528150134047, "grad_norm": 0.02063927985727787, "learning_rate": 4.502693159912877e-05, "loss": 0.0203, "step": 4044 }, { "epoch": 2.8918677390527256, "grad_norm": 0.02021694742143154, "learning_rate": 4.5002087085702867e-05, "loss": 0.0164, "step": 4045 }, { "epoch": 2.8918677390527256, "eval_loss": 0.009427938610315323, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 4045 }, { "epoch": 2.8925826630920466, "grad_norm": 0.019351134076714516, "learning_rate": 4.497724381865286e-05, "loss": 0.0095, "step": 4046 }, { "epoch": 2.893297587131367, "grad_norm": 0.02686150185763836, "learning_rate": 4.49524018041742e-05, "loss": 0.0144, "step": 4047 }, { "epoch": 2.894012511170688, "grad_norm": 0.015707438811659813, "learning_rate": 4.492756104846193e-05, "loss": 0.007, "step": 4048 }, { "epoch": 2.894727435210009, "grad_norm": 0.01951874978840351, "learning_rate": 4.4902721557710837e-05, "loss": 0.0134, "step": 4049 }, { "epoch": 2.89544235924933, "grad_norm": 0.018479397520422935, "learning_rate": 4.487788333811536e-05, "loss": 0.0081, "step": 4050 }, { "epoch": 2.89544235924933, "eval_loss": 0.009486679919064045, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4050 }, { "epoch": 2.896157283288651, "grad_norm": 0.02095634862780571, "learning_rate": 4.4853046395869655e-05, "loss": 0.0105, "step": 4051 }, { "epoch": 2.8968722073279713, "grad_norm": 0.02646881341934204, "learning_rate": 4.482821073716753e-05, "loss": 0.0173, "step": 4052 }, { "epoch": 2.8975871313672923, "grad_norm": 0.012154079042375088, "learning_rate": 4.4803376368202484e-05, "loss": 0.005, "step": 4053 }, { "epoch": 2.898302055406613, "grad_norm": 0.01934860087931156, "learning_rate": 4.477854329516769e-05, "loss": 0.0095, "step": 4054 }, { "epoch": 2.8990169794459337, "grad_norm": 0.02205142006278038, "learning_rate": 4.4753711524255994e-05, "loss": 0.0107, "step": 4055 
}, { "epoch": 2.8990169794459337, "eval_loss": 0.009467744268476963, "eval_runtime": 4.5823, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4055 }, { "epoch": 2.8997319034852547, "grad_norm": 0.018474403768777847, "learning_rate": 4.472888106165995e-05, "loss": 0.0072, "step": 4056 }, { "epoch": 2.9004468275245756, "grad_norm": 0.019231697544455528, "learning_rate": 4.4704051913571767e-05, "loss": 0.0094, "step": 4057 }, { "epoch": 2.9011617515638966, "grad_norm": 0.01967509835958481, "learning_rate": 4.467922408618329e-05, "loss": 0.0085, "step": 4058 }, { "epoch": 2.901876675603217, "grad_norm": 0.018039481714367867, "learning_rate": 4.465439758568608e-05, "loss": 0.0147, "step": 4059 }, { "epoch": 2.902591599642538, "grad_norm": 0.021168945357203484, "learning_rate": 4.462957241827135e-05, "loss": 0.0103, "step": 4060 }, { "epoch": 2.902591599642538, "eval_loss": 0.009411572478711605, "eval_runtime": 4.6194, "eval_samples_per_second": 10.824, "eval_steps_per_second": 2.814, "step": 4060 }, { "epoch": 2.903306523681859, "grad_norm": 0.025572672486305237, "learning_rate": 4.460474859012998e-05, "loss": 0.012, "step": 4061 }, { "epoch": 2.9040214477211794, "grad_norm": 0.020501255989074707, "learning_rate": 4.457992610745252e-05, "loss": 0.0075, "step": 4062 }, { "epoch": 2.9047363717605004, "grad_norm": 0.02515609748661518, "learning_rate": 4.455510497642918e-05, "loss": 0.0142, "step": 4063 }, { "epoch": 2.9054512957998213, "grad_norm": 0.016588233411312103, "learning_rate": 4.453028520324984e-05, "loss": 0.0084, "step": 4064 }, { "epoch": 2.9061662198391423, "grad_norm": 0.020565170794725418, "learning_rate": 4.450546679410404e-05, "loss": 0.0134, "step": 4065 }, { "epoch": 2.9061662198391423, "eval_loss": 0.009331113658845425, "eval_runtime": 4.5765, "eval_samples_per_second": 10.925, "eval_steps_per_second": 2.841, "step": 4065 }, { "epoch": 2.9068811438784627, "grad_norm": 0.026046698912978172, "learning_rate": 4.448064975518096e-05, "loss": 0.0127, "step": 4066 }, { "epoch": 2.9075960679177837, "grad_norm": 0.022838793694972992, "learning_rate": 4.445583409266947e-05, "loss": 0.0124, "step": 4067 }, { "epoch": 2.9083109919571046, "grad_norm": 0.01632099226117134, "learning_rate": 4.44310198127581e-05, "loss": 0.0084, "step": 4068 }, { "epoch": 2.909025915996425, "grad_norm": 0.01720251515507698, "learning_rate": 4.440620692163498e-05, "loss": 0.0103, "step": 4069 }, { "epoch": 2.909740840035746, "grad_norm": 0.027761666104197502, "learning_rate": 4.438139542548794e-05, "loss": 0.0169, "step": 4070 }, { "epoch": 2.909740840035746, "eval_loss": 0.009308157488703728, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4070 }, { "epoch": 2.910455764075067, "grad_norm": 0.021762333810329437, "learning_rate": 4.435658533050448e-05, "loss": 0.0164, "step": 4071 }, { "epoch": 2.911170688114388, "grad_norm": 0.021341176703572273, "learning_rate": 4.43317766428717e-05, "loss": 0.0123, "step": 4072 }, { "epoch": 2.911885612153709, "grad_norm": 0.02612178586423397, "learning_rate": 4.4306969368776376e-05, "loss": 0.0173, "step": 4073 }, { "epoch": 2.9126005361930294, "grad_norm": 0.020595841109752655, "learning_rate": 4.428216351440492e-05, "loss": 0.0132, "step": 4074 }, { "epoch": 2.9133154602323503, "grad_norm": 0.019136346876621246, "learning_rate": 4.4257359085943414e-05, "loss": 0.0156, "step": 4075 }, { "epoch": 2.9133154602323503, "eval_loss": 0.009278818033635616, "eval_runtime": 4.5849, 
"eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4075 }, { "epoch": 2.9140303842716713, "grad_norm": 0.02302454598248005, "learning_rate": 4.423255608957757e-05, "loss": 0.0119, "step": 4076 }, { "epoch": 2.9147453083109918, "grad_norm": 0.014921791851520538, "learning_rate": 4.420775453149273e-05, "loss": 0.0081, "step": 4077 }, { "epoch": 2.9154602323503127, "grad_norm": 0.017377479001879692, "learning_rate": 4.4182954417873867e-05, "loss": 0.0104, "step": 4078 }, { "epoch": 2.9161751563896336, "grad_norm": 0.017156658694148064, "learning_rate": 4.4158155754905656e-05, "loss": 0.0085, "step": 4079 }, { "epoch": 2.9168900804289546, "grad_norm": 0.02055978775024414, "learning_rate": 4.4133358548772364e-05, "loss": 0.0131, "step": 4080 }, { "epoch": 2.9168900804289546, "eval_loss": 0.009318161755800247, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4080 }, { "epoch": 2.917605004468275, "grad_norm": 0.013213838450610638, "learning_rate": 4.410856280565789e-05, "loss": 0.0095, "step": 4081 }, { "epoch": 2.918319928507596, "grad_norm": 0.019927704706788063, "learning_rate": 4.4083768531745775e-05, "loss": 0.0088, "step": 4082 }, { "epoch": 2.919034852546917, "grad_norm": 0.017245078459382057, "learning_rate": 4.40589757332192e-05, "loss": 0.0078, "step": 4083 }, { "epoch": 2.9197497765862375, "grad_norm": 0.020503323525190353, "learning_rate": 4.4034184416260975e-05, "loss": 0.0123, "step": 4084 }, { "epoch": 2.9204647006255584, "grad_norm": 0.025176867842674255, "learning_rate": 4.4009394587053554e-05, "loss": 0.0206, "step": 4085 }, { "epoch": 2.9204647006255584, "eval_loss": 0.009399720467627048, "eval_runtime": 4.5823, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4085 }, { "epoch": 2.9211796246648793, "grad_norm": 0.020706893876194954, "learning_rate": 4.398460625177899e-05, "loss": 0.0114, "step": 4086 }, { "epoch": 2.9218945487042003, "grad_norm": 0.017067143693566322, "learning_rate": 4.395981941661897e-05, "loss": 0.009, "step": 4087 }, { "epoch": 2.9226094727435212, "grad_norm": 0.01940217614173889, "learning_rate": 4.393503408775485e-05, "loss": 0.0154, "step": 4088 }, { "epoch": 2.9233243967828417, "grad_norm": 0.0184321291744709, "learning_rate": 4.391025027136756e-05, "loss": 0.0114, "step": 4089 }, { "epoch": 2.9240393208221627, "grad_norm": 0.02079356275498867, "learning_rate": 4.388546797363766e-05, "loss": 0.0113, "step": 4090 }, { "epoch": 2.9240393208221627, "eval_loss": 0.009337745606899261, "eval_runtime": 4.5919, "eval_samples_per_second": 10.889, "eval_steps_per_second": 2.831, "step": 4090 }, { "epoch": 2.9247542448614836, "grad_norm": 0.02157442644238472, "learning_rate": 4.386068720074536e-05, "loss": 0.0207, "step": 4091 }, { "epoch": 2.925469168900804, "grad_norm": 0.02172680012881756, "learning_rate": 4.383590795887046e-05, "loss": 0.0148, "step": 4092 }, { "epoch": 2.926184092940125, "grad_norm": 0.016780763864517212, "learning_rate": 4.38111302541924e-05, "loss": 0.0073, "step": 4093 }, { "epoch": 2.926899016979446, "grad_norm": 0.020590856671333313, "learning_rate": 4.3786354092890206e-05, "loss": 0.0142, "step": 4094 }, { "epoch": 2.927613941018767, "grad_norm": 0.023433735594153404, "learning_rate": 4.376157948114256e-05, "loss": 0.0142, "step": 4095 }, { "epoch": 2.927613941018767, "eval_loss": 0.009385247714817524, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 4095 }, { "epoch": 2.9283288650580874, 
"grad_norm": 0.015642227604985237, "learning_rate": 4.373680642512773e-05, "loss": 0.0077, "step": 4096 }, { "epoch": 2.9290437890974084, "grad_norm": 0.021872663870453835, "learning_rate": 4.371203493102359e-05, "loss": 0.0159, "step": 4097 }, { "epoch": 2.9297587131367293, "grad_norm": 0.02150583453476429, "learning_rate": 4.3687265005007654e-05, "loss": 0.011, "step": 4098 }, { "epoch": 2.93047363717605, "grad_norm": 0.025683404877781868, "learning_rate": 4.366249665325703e-05, "loss": 0.0157, "step": 4099 }, { "epoch": 2.9311885612153707, "grad_norm": 0.02220023423433304, "learning_rate": 4.3637729881948394e-05, "loss": 0.012, "step": 4100 }, { "epoch": 2.9311885612153707, "eval_loss": 0.009296510368585587, "eval_runtime": 4.5825, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4100 }, { "epoch": 2.9319034852546917, "grad_norm": 0.020007889717817307, "learning_rate": 4.361296469725813e-05, "loss": 0.0103, "step": 4101 }, { "epoch": 2.9326184092940126, "grad_norm": 0.021182378754019737, "learning_rate": 4.358820110536212e-05, "loss": 0.0118, "step": 4102 }, { "epoch": 2.9333333333333336, "grad_norm": 0.02322842739522457, "learning_rate": 4.35634391124359e-05, "loss": 0.0138, "step": 4103 }, { "epoch": 2.934048257372654, "grad_norm": 0.016023827716708183, "learning_rate": 4.35386787246546e-05, "loss": 0.0074, "step": 4104 }, { "epoch": 2.934763181411975, "grad_norm": 0.02007250115275383, "learning_rate": 4.351391994819294e-05, "loss": 0.0121, "step": 4105 }, { "epoch": 2.934763181411975, "eval_loss": 0.00925239734351635, "eval_runtime": 4.5857, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4105 }, { "epoch": 2.935478105451296, "grad_norm": 0.01981605403125286, "learning_rate": 4.3489162789225266e-05, "loss": 0.0082, "step": 4106 }, { "epoch": 2.9361930294906164, "grad_norm": 0.019079815596342087, "learning_rate": 4.346440725392546e-05, "loss": 0.0112, "step": 4107 }, { "epoch": 2.9369079535299374, "grad_norm": 0.01820473186671734, "learning_rate": 4.343965334846708e-05, "loss": 0.0069, "step": 4108 }, { "epoch": 2.9376228775692583, "grad_norm": 0.019862813875079155, "learning_rate": 4.341490107902323e-05, "loss": 0.0111, "step": 4109 }, { "epoch": 2.9383378016085793, "grad_norm": 0.028598181903362274, "learning_rate": 4.339015045176659e-05, "loss": 0.0107, "step": 4110 }, { "epoch": 2.9383378016085793, "eval_loss": 0.009368419647216797, "eval_runtime": 4.5951, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 4110 }, { "epoch": 2.9390527256478998, "grad_norm": 0.026065733283758163, "learning_rate": 4.336540147286946e-05, "loss": 0.016, "step": 4111 }, { "epoch": 2.9397676496872207, "grad_norm": 0.01946023292839527, "learning_rate": 4.334065414850375e-05, "loss": 0.0087, "step": 4112 }, { "epoch": 2.9404825737265416, "grad_norm": 0.021196408197283745, "learning_rate": 4.3315908484840905e-05, "loss": 0.0121, "step": 4113 }, { "epoch": 2.941197497765862, "grad_norm": 0.019067304208874702, "learning_rate": 4.329116448805198e-05, "loss": 0.0096, "step": 4114 }, { "epoch": 2.941912421805183, "grad_norm": 0.015126046724617481, "learning_rate": 4.326642216430763e-05, "loss": 0.0069, "step": 4115 }, { "epoch": 2.941912421805183, "eval_loss": 0.00933968834578991, "eval_runtime": 4.6191, "eval_samples_per_second": 10.825, "eval_steps_per_second": 2.814, "step": 4115 }, { "epoch": 2.942627345844504, "grad_norm": 0.021942703053355217, "learning_rate": 4.324168151977807e-05, "loss": 0.0113, "step": 4116 }, { "epoch": 
2.943342269883825, "grad_norm": 0.022929325699806213, "learning_rate": 4.321694256063311e-05, "loss": 0.0158, "step": 4117 }, { "epoch": 2.944057193923146, "grad_norm": 0.01964411325752735, "learning_rate": 4.3192205293042104e-05, "loss": 0.0087, "step": 4118 }, { "epoch": 2.9447721179624664, "grad_norm": 0.023028235882520676, "learning_rate": 4.316746972317406e-05, "loss": 0.0135, "step": 4119 }, { "epoch": 2.9454870420017873, "grad_norm": 0.019652526825666428, "learning_rate": 4.3142735857197483e-05, "loss": 0.0084, "step": 4120 }, { "epoch": 2.9454870420017873, "eval_loss": 0.009256185963749886, "eval_runtime": 4.6181, "eval_samples_per_second": 10.827, "eval_steps_per_second": 2.815, "step": 4120 }, { "epoch": 2.9462019660411083, "grad_norm": 0.021463952958583832, "learning_rate": 4.311800370128051e-05, "loss": 0.0123, "step": 4121 }, { "epoch": 2.946916890080429, "grad_norm": 0.02259257435798645, "learning_rate": 4.309327326159078e-05, "loss": 0.0104, "step": 4122 }, { "epoch": 2.9476318141197497, "grad_norm": 0.022536812350153923, "learning_rate": 4.3068544544295616e-05, "loss": 0.0125, "step": 4123 }, { "epoch": 2.9483467381590707, "grad_norm": 0.02179431915283203, "learning_rate": 4.304381755556182e-05, "loss": 0.0118, "step": 4124 }, { "epoch": 2.9490616621983916, "grad_norm": 0.023109344765543938, "learning_rate": 4.301909230155579e-05, "loss": 0.0075, "step": 4125 }, { "epoch": 2.9490616621983916, "eval_loss": 0.00937414076179266, "eval_runtime": 4.6008, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 4125 }, { "epoch": 2.949776586237712, "grad_norm": 0.02166130766272545, "learning_rate": 4.29943687884435e-05, "loss": 0.0123, "step": 4126 }, { "epoch": 2.950491510277033, "grad_norm": 0.023679377511143684, "learning_rate": 4.296964702239046e-05, "loss": 0.0153, "step": 4127 }, { "epoch": 2.951206434316354, "grad_norm": 0.025442760437726974, "learning_rate": 4.2944927009561786e-05, "loss": 0.0156, "step": 4128 }, { "epoch": 2.9519213583556745, "grad_norm": 0.017397325485944748, "learning_rate": 4.292020875612214e-05, "loss": 0.0094, "step": 4129 }, { "epoch": 2.9526362823949954, "grad_norm": 0.01798865757882595, "learning_rate": 4.2895492268235725e-05, "loss": 0.0098, "step": 4130 }, { "epoch": 2.9526362823949954, "eval_loss": 0.009339657612144947, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 4130 }, { "epoch": 2.9533512064343164, "grad_norm": 0.014761867001652718, "learning_rate": 4.2870777552066334e-05, "loss": 0.0093, "step": 4131 }, { "epoch": 2.9540661304736373, "grad_norm": 0.022372804582118988, "learning_rate": 4.28460646137773e-05, "loss": 0.0208, "step": 4132 }, { "epoch": 2.9547810545129582, "grad_norm": 0.013142753392457962, "learning_rate": 4.282135345953152e-05, "loss": 0.008, "step": 4133 }, { "epoch": 2.9554959785522787, "grad_norm": 0.020591866225004196, "learning_rate": 4.279664409549144e-05, "loss": 0.0126, "step": 4134 }, { "epoch": 2.9562109025915997, "grad_norm": 0.017533812671899796, "learning_rate": 4.277193652781906e-05, "loss": 0.0107, "step": 4135 }, { "epoch": 2.9562109025915997, "eval_loss": 0.009322415105998516, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4135 }, { "epoch": 2.9569258266309206, "grad_norm": 0.014322835952043533, "learning_rate": 4.274723076267595e-05, "loss": 0.0074, "step": 4136 }, { "epoch": 2.957640750670241, "grad_norm": 0.020083099603652954, "learning_rate": 4.272252680622321e-05, "loss": 0.0159, 
"step": 4137 }, { "epoch": 2.958355674709562, "grad_norm": 0.017285378649830818, "learning_rate": 4.2697824664621484e-05, "loss": 0.0137, "step": 4138 }, { "epoch": 2.959070598748883, "grad_norm": 0.020605113357305527, "learning_rate": 4.267312434403099e-05, "loss": 0.0147, "step": 4139 }, { "epoch": 2.959785522788204, "grad_norm": 0.01478940062224865, "learning_rate": 4.264842585061147e-05, "loss": 0.0117, "step": 4140 }, { "epoch": 2.959785522788204, "eval_loss": 0.009315062314271927, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 4140 }, { "epoch": 2.9605004468275244, "grad_norm": 0.015672391280531883, "learning_rate": 4.262372919052221e-05, "loss": 0.0105, "step": 4141 }, { "epoch": 2.9612153708668454, "grad_norm": 0.018279295414686203, "learning_rate": 4.259903436992204e-05, "loss": 0.0144, "step": 4142 }, { "epoch": 2.9619302949061663, "grad_norm": 0.013824786990880966, "learning_rate": 4.2574341394969365e-05, "loss": 0.0055, "step": 4143 }, { "epoch": 2.962645218945487, "grad_norm": 0.021533312276005745, "learning_rate": 4.254965027182206e-05, "loss": 0.009, "step": 4144 }, { "epoch": 2.9633601429848078, "grad_norm": 0.019075661897659302, "learning_rate": 4.252496100663762e-05, "loss": 0.009, "step": 4145 }, { "epoch": 2.9633601429848078, "eval_loss": 0.009406158700585365, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4145 }, { "epoch": 2.9640750670241287, "grad_norm": 0.02010565996170044, "learning_rate": 4.250027360557302e-05, "loss": 0.0121, "step": 4146 }, { "epoch": 2.9647899910634496, "grad_norm": 0.020640412345528603, "learning_rate": 4.247558807478478e-05, "loss": 0.0099, "step": 4147 }, { "epoch": 2.9655049151027706, "grad_norm": 0.022241029888391495, "learning_rate": 4.245090442042897e-05, "loss": 0.0073, "step": 4148 }, { "epoch": 2.966219839142091, "grad_norm": 0.02228499762713909, "learning_rate": 4.2426222648661176e-05, "loss": 0.0114, "step": 4149 }, { "epoch": 2.966934763181412, "grad_norm": 0.017516303807497025, "learning_rate": 4.240154276563653e-05, "loss": 0.0093, "step": 4150 }, { "epoch": 2.966934763181412, "eval_loss": 0.009450871497392654, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 4150 }, { "epoch": 2.9676496872207325, "grad_norm": 0.017703190445899963, "learning_rate": 4.2376864777509654e-05, "loss": 0.0086, "step": 4151 }, { "epoch": 2.9683646112600535, "grad_norm": 0.01592276059091091, "learning_rate": 4.235218869043476e-05, "loss": 0.0072, "step": 4152 }, { "epoch": 2.9690795352993744, "grad_norm": 0.024926260113716125, "learning_rate": 4.2327514510565544e-05, "loss": 0.0129, "step": 4153 }, { "epoch": 2.9697944593386953, "grad_norm": 0.01991606131196022, "learning_rate": 4.2302842244055236e-05, "loss": 0.0102, "step": 4154 }, { "epoch": 2.9705093833780163, "grad_norm": 0.02559712529182434, "learning_rate": 4.227817189705657e-05, "loss": 0.0078, "step": 4155 }, { "epoch": 2.9705093833780163, "eval_loss": 0.00945497490465641, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4155 }, { "epoch": 2.971224307417337, "grad_norm": 0.0192166306078434, "learning_rate": 4.2253503475721844e-05, "loss": 0.0091, "step": 4156 }, { "epoch": 2.9719392314566577, "grad_norm": 0.018036440014839172, "learning_rate": 4.222883698620284e-05, "loss": 0.0089, "step": 4157 }, { "epoch": 2.9726541554959787, "grad_norm": 0.0182360727339983, "learning_rate": 
4.2204172434650866e-05, "loss": 0.0133, "step": 4158 }, { "epoch": 2.973369079535299, "grad_norm": 0.019610179588198662, "learning_rate": 4.217950982721675e-05, "loss": 0.0129, "step": 4159 }, { "epoch": 2.97408400357462, "grad_norm": 0.016360780224204063, "learning_rate": 4.215484917005085e-05, "loss": 0.0106, "step": 4160 }, { "epoch": 2.97408400357462, "eval_loss": 0.009396161884069443, "eval_runtime": 4.6028, "eval_samples_per_second": 10.863, "eval_steps_per_second": 2.824, "step": 4160 }, { "epoch": 2.974798927613941, "grad_norm": 0.022767173126339912, "learning_rate": 4.213019046930301e-05, "loss": 0.013, "step": 4161 }, { "epoch": 2.975513851653262, "grad_norm": 0.02483181841671467, "learning_rate": 4.210553373112259e-05, "loss": 0.0196, "step": 4162 }, { "epoch": 2.976228775692583, "grad_norm": 0.017638420686125755, "learning_rate": 4.2080878961658486e-05, "loss": 0.0147, "step": 4163 }, { "epoch": 2.9769436997319034, "grad_norm": 0.02503577619791031, "learning_rate": 4.2056226167059085e-05, "loss": 0.0119, "step": 4164 }, { "epoch": 2.9776586237712244, "grad_norm": 0.014528445899486542, "learning_rate": 4.203157535347229e-05, "loss": 0.0058, "step": 4165 }, { "epoch": 2.9776586237712244, "eval_loss": 0.009452412836253643, "eval_runtime": 4.5878, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 4165 }, { "epoch": 2.978373547810545, "grad_norm": 0.02039092592895031, "learning_rate": 4.200692652704545e-05, "loss": 0.0108, "step": 4166 }, { "epoch": 2.979088471849866, "grad_norm": 0.021910913288593292, "learning_rate": 4.198227969392555e-05, "loss": 0.0137, "step": 4167 }, { "epoch": 2.9798033958891867, "grad_norm": 0.019015895202755928, "learning_rate": 4.195763486025895e-05, "loss": 0.0089, "step": 4168 }, { "epoch": 2.9805183199285077, "grad_norm": 0.016089744865894318, "learning_rate": 4.1932992032191594e-05, "loss": 0.0071, "step": 4169 }, { "epoch": 2.9812332439678286, "grad_norm": 0.01657823473215103, "learning_rate": 4.190835121586887e-05, "loss": 0.0081, "step": 4170 }, { "epoch": 2.9812332439678286, "eval_loss": 0.009344993159174919, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4170 }, { "epoch": 2.981948168007149, "grad_norm": 0.022203465923666954, "learning_rate": 4.188371241743567e-05, "loss": 0.0103, "step": 4171 }, { "epoch": 2.98266309204647, "grad_norm": 0.023685438558459282, "learning_rate": 4.185907564303644e-05, "loss": 0.0114, "step": 4172 }, { "epoch": 2.983378016085791, "grad_norm": 0.02113027684390545, "learning_rate": 4.1834440898815056e-05, "loss": 0.0134, "step": 4173 }, { "epoch": 2.9840929401251115, "grad_norm": 0.019143003970384598, "learning_rate": 4.180980819091492e-05, "loss": 0.015, "step": 4174 }, { "epoch": 2.9848078641644324, "grad_norm": 0.01846488192677498, "learning_rate": 4.178517752547891e-05, "loss": 0.0073, "step": 4175 }, { "epoch": 2.9848078641644324, "eval_loss": 0.009251091629266739, "eval_runtime": 4.5844, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 4175 }, { "epoch": 2.9855227882037534, "grad_norm": 0.01751372031867504, "learning_rate": 4.1760548908649416e-05, "loss": 0.0114, "step": 4176 }, { "epoch": 2.9862377122430743, "grad_norm": 0.01966174878180027, "learning_rate": 4.17359223465683e-05, "loss": 0.0155, "step": 4177 }, { "epoch": 2.9869526362823953, "grad_norm": 0.020546432584524155, "learning_rate": 4.1711297845376917e-05, "loss": 0.0102, "step": 4178 }, { "epoch": 2.9876675603217158, "grad_norm": 0.016927147284150124, 
"learning_rate": 4.168667541121609e-05, "loss": 0.0076, "step": 4179 }, { "epoch": 2.9883824843610367, "grad_norm": 0.01725955866277218, "learning_rate": 4.166205505022618e-05, "loss": 0.0121, "step": 4180 }, { "epoch": 2.9883824843610367, "eval_loss": 0.009502245113253593, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 4180 }, { "epoch": 2.989097408400357, "grad_norm": 0.0231107696890831, "learning_rate": 4.163743676854697e-05, "loss": 0.0146, "step": 4181 }, { "epoch": 2.989812332439678, "grad_norm": 0.019069872796535492, "learning_rate": 4.161282057231776e-05, "loss": 0.0104, "step": 4182 }, { "epoch": 2.990527256478999, "grad_norm": 0.021371737122535706, "learning_rate": 4.1588206467677294e-05, "loss": 0.0138, "step": 4183 }, { "epoch": 2.99124218051832, "grad_norm": 0.02176518552005291, "learning_rate": 4.1563594460763846e-05, "loss": 0.0083, "step": 4184 }, { "epoch": 2.991957104557641, "grad_norm": 0.019786017015576363, "learning_rate": 4.153898455771514e-05, "loss": 0.0101, "step": 4185 }, { "epoch": 2.991957104557641, "eval_loss": 0.009479643777012825, "eval_runtime": 4.6367, "eval_samples_per_second": 10.783, "eval_steps_per_second": 2.804, "step": 4185 }, { "epoch": 2.9926720285969615, "grad_norm": 0.022599970921874046, "learning_rate": 4.151437676466836e-05, "loss": 0.0103, "step": 4186 }, { "epoch": 2.9933869526362824, "grad_norm": 0.016784608364105225, "learning_rate": 4.148977108776021e-05, "loss": 0.0067, "step": 4187 }, { "epoch": 2.9941018766756033, "grad_norm": 0.02280685491859913, "learning_rate": 4.146516753312677e-05, "loss": 0.0147, "step": 4188 }, { "epoch": 2.994816800714924, "grad_norm": 0.015338772907853127, "learning_rate": 4.144056610690374e-05, "loss": 0.0066, "step": 4189 }, { "epoch": 2.995531724754245, "grad_norm": 0.014092134311795235, "learning_rate": 4.141596681522617e-05, "loss": 0.0066, "step": 4190 }, { "epoch": 2.995531724754245, "eval_loss": 0.009395349770784378, "eval_runtime": 4.5903, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 4190 }, { "epoch": 2.9962466487935657, "grad_norm": 0.019677462056279182, "learning_rate": 4.13913696642286e-05, "loss": 0.0108, "step": 4191 }, { "epoch": 2.9969615728328867, "grad_norm": 0.01666819117963314, "learning_rate": 4.136677466004506e-05, "loss": 0.009, "step": 4192 }, { "epoch": 2.997676496872207, "grad_norm": 0.01786665990948677, "learning_rate": 4.134218180880905e-05, "loss": 0.0117, "step": 4193 }, { "epoch": 2.998391420911528, "grad_norm": 0.016621900722384453, "learning_rate": 4.131759111665349e-05, "loss": 0.0069, "step": 4194 }, { "epoch": 2.999106344950849, "grad_norm": 0.019217083230614662, "learning_rate": 4.1293002589710794e-05, "loss": 0.0125, "step": 4195 }, { "epoch": 2.999106344950849, "eval_loss": 0.009364133700728416, "eval_runtime": 4.596, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 4195 }, { "epoch": 2.9998212689901695, "grad_norm": 0.02339281141757965, "learning_rate": 4.1268416234112856e-05, "loss": 0.0086, "step": 4196 }, { "epoch": 3.0005361930294905, "grad_norm": 0.026480259373784065, "learning_rate": 4.1243832055990986e-05, "loss": 0.0143, "step": 4197 }, { "epoch": 3.0012511170688114, "grad_norm": 0.019079750403761864, "learning_rate": 4.121925006147597e-05, "loss": 0.0076, "step": 4198 }, { "epoch": 3.0019660411081324, "grad_norm": 0.03606313839554787, "learning_rate": 4.119467025669803e-05, "loss": 0.0086, "step": 4199 }, { "epoch": 3.002680965147453, "grad_norm": 
0.02885003387928009, "learning_rate": 4.11700926477869e-05, "loss": 0.0114, "step": 4200 }, { "epoch": 3.002680965147453, "eval_loss": 0.009450279176235199, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 4200 }, { "epoch": 3.003395889186774, "grad_norm": 0.0187782421708107, "learning_rate": 4.1145517240871695e-05, "loss": 0.0096, "step": 4201 }, { "epoch": 3.0041108132260947, "grad_norm": 0.015348988585174084, "learning_rate": 4.1120944042081025e-05, "loss": 0.0061, "step": 4202 }, { "epoch": 3.0048257372654157, "grad_norm": 0.01864839531481266, "learning_rate": 4.109637305754293e-05, "loss": 0.0067, "step": 4203 }, { "epoch": 3.005540661304736, "grad_norm": 0.01620553806424141, "learning_rate": 4.1071804293384904e-05, "loss": 0.0079, "step": 4204 }, { "epoch": 3.006255585344057, "grad_norm": 0.019680028781294823, "learning_rate": 4.10472377557339e-05, "loss": 0.0061, "step": 4205 }, { "epoch": 3.006255585344057, "eval_loss": 0.009695898741483688, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 4205 }, { "epoch": 3.006970509383378, "grad_norm": 0.024634748697280884, "learning_rate": 4.102267345071629e-05, "loss": 0.007, "step": 4206 }, { "epoch": 3.007685433422699, "grad_norm": 0.023954778909683228, "learning_rate": 4.09981113844579e-05, "loss": 0.0118, "step": 4207 }, { "epoch": 3.0084003574620195, "grad_norm": 0.026115721091628075, "learning_rate": 4.097355156308402e-05, "loss": 0.0074, "step": 4208 }, { "epoch": 3.0091152815013404, "grad_norm": 0.020988324657082558, "learning_rate": 4.0948993992719346e-05, "loss": 0.0063, "step": 4209 }, { "epoch": 3.0098302055406614, "grad_norm": 0.02322893962264061, "learning_rate": 4.092443867948801e-05, "loss": 0.0091, "step": 4210 }, { "epoch": 3.0098302055406614, "eval_loss": 0.009840661659836769, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4210 }, { "epoch": 3.0105451295799823, "grad_norm": 0.02197432331740856, "learning_rate": 4.0899885629513636e-05, "loss": 0.0066, "step": 4211 }, { "epoch": 3.011260053619303, "grad_norm": 0.026239894330501556, "learning_rate": 4.087533484891922e-05, "loss": 0.0093, "step": 4212 }, { "epoch": 3.0119749776586238, "grad_norm": 0.020737774670124054, "learning_rate": 4.0850786343827235e-05, "loss": 0.0094, "step": 4213 }, { "epoch": 3.0126899016979447, "grad_norm": 0.018864985555410385, "learning_rate": 4.082624012035956e-05, "loss": 0.0081, "step": 4214 }, { "epoch": 3.013404825737265, "grad_norm": 0.019377680495381355, "learning_rate": 4.080169618463752e-05, "loss": 0.0083, "step": 4215 }, { "epoch": 3.013404825737265, "eval_loss": 0.009775602258741856, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4215 }, { "epoch": 3.014119749776586, "grad_norm": 0.015472108498215675, "learning_rate": 4.0777154542781844e-05, "loss": 0.0056, "step": 4216 }, { "epoch": 3.014834673815907, "grad_norm": 0.019116438925266266, "learning_rate": 4.075261520091273e-05, "loss": 0.0062, "step": 4217 }, { "epoch": 3.015549597855228, "grad_norm": 0.024950675666332245, "learning_rate": 4.072807816514978e-05, "loss": 0.0074, "step": 4218 }, { "epoch": 3.0162645218945485, "grad_norm": 0.023081446066498756, "learning_rate": 4.070354344161201e-05, "loss": 0.007, "step": 4219 }, { "epoch": 3.0169794459338695, "grad_norm": 0.017340483143925667, "learning_rate": 4.067901103641789e-05, "loss": 0.0086, "step": 4220 }, { "epoch": 
3.0169794459338695, "eval_loss": 0.009748578071594238, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4220 }, { "epoch": 3.0176943699731904, "grad_norm": 0.02354114130139351, "learning_rate": 4.065448095568527e-05, "loss": 0.0106, "step": 4221 }, { "epoch": 3.0184092940125113, "grad_norm": 0.02315167337656021, "learning_rate": 4.062995320553147e-05, "loss": 0.0088, "step": 4222 }, { "epoch": 3.019124218051832, "grad_norm": 0.01490732841193676, "learning_rate": 4.060542779207317e-05, "loss": 0.006, "step": 4223 }, { "epoch": 3.019839142091153, "grad_norm": 0.023217422887682915, "learning_rate": 4.058090472142654e-05, "loss": 0.0135, "step": 4224 }, { "epoch": 3.0205540661304737, "grad_norm": 0.02969435788691044, "learning_rate": 4.0556383999707096e-05, "loss": 0.0089, "step": 4225 }, { "epoch": 3.0205540661304737, "eval_loss": 0.009743588045239449, "eval_runtime": 4.6103, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 4225 }, { "epoch": 3.0212689901697947, "grad_norm": 0.025598784908652306, "learning_rate": 4.053186563302981e-05, "loss": 0.0089, "step": 4226 }, { "epoch": 3.021983914209115, "grad_norm": 0.02582564949989319, "learning_rate": 4.0507349627509036e-05, "loss": 0.0129, "step": 4227 }, { "epoch": 3.022698838248436, "grad_norm": 0.02182547003030777, "learning_rate": 4.0482835989258596e-05, "loss": 0.0051, "step": 4228 }, { "epoch": 3.023413762287757, "grad_norm": 0.026572002097964287, "learning_rate": 4.045832472439165e-05, "loss": 0.0165, "step": 4229 }, { "epoch": 3.0241286863270775, "grad_norm": 0.02098054438829422, "learning_rate": 4.043381583902081e-05, "loss": 0.0089, "step": 4230 }, { "epoch": 3.0241286863270775, "eval_loss": 0.009719674475491047, "eval_runtime": 4.5875, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 4230 }, { "epoch": 3.0248436103663985, "grad_norm": 0.02607264742255211, "learning_rate": 4.040930933925808e-05, "loss": 0.01, "step": 4231 }, { "epoch": 3.0255585344057194, "grad_norm": 0.029421884566545486, "learning_rate": 4.038480523121488e-05, "loss": 0.0065, "step": 4232 }, { "epoch": 3.0262734584450404, "grad_norm": 0.02418660931289196, "learning_rate": 4.036030352100201e-05, "loss": 0.0116, "step": 4233 }, { "epoch": 3.026988382484361, "grad_norm": 0.025699812918901443, "learning_rate": 4.033580421472973e-05, "loss": 0.0127, "step": 4234 }, { "epoch": 3.027703306523682, "grad_norm": 0.024757787585258484, "learning_rate": 4.031130731850762e-05, "loss": 0.008, "step": 4235 }, { "epoch": 3.027703306523682, "eval_loss": 0.009821576997637749, "eval_runtime": 4.6077, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 4235 }, { "epoch": 3.0284182305630027, "grad_norm": 0.028672775253653526, "learning_rate": 4.028681283844471e-05, "loss": 0.0101, "step": 4236 }, { "epoch": 3.0291331546023237, "grad_norm": 0.02001885510981083, "learning_rate": 4.026232078064942e-05, "loss": 0.0089, "step": 4237 }, { "epoch": 3.029848078641644, "grad_norm": 0.02235858328640461, "learning_rate": 4.023783115122957e-05, "loss": 0.0078, "step": 4238 }, { "epoch": 3.030563002680965, "grad_norm": 0.022704903036355972, "learning_rate": 4.021334395629234e-05, "loss": 0.0098, "step": 4239 }, { "epoch": 3.031277926720286, "grad_norm": 0.02196570858359337, "learning_rate": 4.018885920194434e-05, "loss": 0.0092, "step": 4240 }, { "epoch": 3.031277926720286, "eval_loss": 0.009909252636134624, "eval_runtime": 4.6408, "eval_samples_per_second": 10.774, 
"eval_steps_per_second": 2.801, "step": 4240 }, { "epoch": 3.031992850759607, "grad_norm": 0.019807105883955956, "learning_rate": 4.016437689429157e-05, "loss": 0.0087, "step": 4241 }, { "epoch": 3.0327077747989275, "grad_norm": 0.018971221521496773, "learning_rate": 4.013989703943941e-05, "loss": 0.0067, "step": 4242 }, { "epoch": 3.0334226988382484, "grad_norm": 0.02643708884716034, "learning_rate": 4.0115419643492615e-05, "loss": 0.0089, "step": 4243 }, { "epoch": 3.0341376228775694, "grad_norm": 0.01742468774318695, "learning_rate": 4.0090944712555353e-05, "loss": 0.0056, "step": 4244 }, { "epoch": 3.03485254691689, "grad_norm": 0.022690968587994576, "learning_rate": 4.006647225273116e-05, "loss": 0.0068, "step": 4245 }, { "epoch": 3.03485254691689, "eval_loss": 0.010046632960438728, "eval_runtime": 4.5959, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 4245 }, { "epoch": 3.035567470956211, "grad_norm": 0.026995399966835976, "learning_rate": 4.004200227012297e-05, "loss": 0.0128, "step": 4246 }, { "epoch": 3.0362823949955318, "grad_norm": 0.03278697654604912, "learning_rate": 4.001753477083305e-05, "loss": 0.0111, "step": 4247 }, { "epoch": 3.0369973190348527, "grad_norm": 0.01754939928650856, "learning_rate": 3.999306976096315e-05, "loss": 0.0055, "step": 4248 }, { "epoch": 3.037712243074173, "grad_norm": 0.023243330419063568, "learning_rate": 3.996860724661429e-05, "loss": 0.0124, "step": 4249 }, { "epoch": 3.038427167113494, "grad_norm": 0.022887500002980232, "learning_rate": 3.994414723388693e-05, "loss": 0.008, "step": 4250 }, { "epoch": 3.038427167113494, "eval_loss": 0.009960631839931011, "eval_runtime": 4.6, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 4250 }, { "epoch": 3.039142091152815, "grad_norm": 0.027850095182657242, "learning_rate": 3.9919689728880884e-05, "loss": 0.009, "step": 4251 }, { "epoch": 3.039857015192136, "grad_norm": 0.02328716404736042, "learning_rate": 3.989523473769535e-05, "loss": 0.0095, "step": 4252 }, { "epoch": 3.0405719392314565, "grad_norm": 0.02420770563185215, "learning_rate": 3.987078226642891e-05, "loss": 0.0078, "step": 4253 }, { "epoch": 3.0412868632707775, "grad_norm": 0.024674393236637115, "learning_rate": 3.9846332321179484e-05, "loss": 0.0093, "step": 4254 }, { "epoch": 3.0420017873100984, "grad_norm": 0.019670626148581505, "learning_rate": 3.9821884908044366e-05, "loss": 0.0087, "step": 4255 }, { "epoch": 3.0420017873100984, "eval_loss": 0.009913301095366478, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4255 }, { "epoch": 3.042716711349419, "grad_norm": 0.02297901175916195, "learning_rate": 3.979744003312027e-05, "loss": 0.0071, "step": 4256 }, { "epoch": 3.04343163538874, "grad_norm": 0.01951507478952408, "learning_rate": 3.9772997702503247e-05, "loss": 0.0074, "step": 4257 }, { "epoch": 3.044146559428061, "grad_norm": 0.03255302459001541, "learning_rate": 3.974855792228868e-05, "loss": 0.0159, "step": 4258 }, { "epoch": 3.0448614834673817, "grad_norm": 0.02479768916964531, "learning_rate": 3.972412069857136e-05, "loss": 0.0083, "step": 4259 }, { "epoch": 3.045576407506702, "grad_norm": 0.023161500692367554, "learning_rate": 3.969968603744539e-05, "loss": 0.0077, "step": 4260 }, { "epoch": 3.045576407506702, "eval_loss": 0.009783695451915264, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4260 }, { "epoch": 3.046291331546023, "grad_norm": 0.021049970760941505, "learning_rate": 
3.967525394500432e-05, "loss": 0.0082, "step": 4261 }, { "epoch": 3.047006255585344, "grad_norm": 0.021125154569745064, "learning_rate": 3.9650824427340986e-05, "loss": 0.0077, "step": 4262 }, { "epoch": 3.047721179624665, "grad_norm": 0.02589944750070572, "learning_rate": 3.9626397490547594e-05, "loss": 0.01, "step": 4263 }, { "epoch": 3.0484361036639855, "grad_norm": 0.017507627606391907, "learning_rate": 3.960197314071571e-05, "loss": 0.0063, "step": 4264 }, { "epoch": 3.0491510277033065, "grad_norm": 0.01705099269747734, "learning_rate": 3.957755138393629e-05, "loss": 0.0081, "step": 4265 }, { "epoch": 3.0491510277033065, "eval_loss": 0.009692066349089146, "eval_runtime": 4.6002, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 4265 }, { "epoch": 3.0498659517426274, "grad_norm": 0.03505481034517288, "learning_rate": 3.95531322262996e-05, "loss": 0.006, "step": 4266 }, { "epoch": 3.0505808757819484, "grad_norm": 0.020601877942681313, "learning_rate": 3.9528715673895246e-05, "loss": 0.0062, "step": 4267 }, { "epoch": 3.051295799821269, "grad_norm": 0.026372641324996948, "learning_rate": 3.950430173281225e-05, "loss": 0.0142, "step": 4268 }, { "epoch": 3.05201072386059, "grad_norm": 0.021053548902273178, "learning_rate": 3.947989040913893e-05, "loss": 0.0065, "step": 4269 }, { "epoch": 3.0527256478999107, "grad_norm": 0.01809418573975563, "learning_rate": 3.945548170896296e-05, "loss": 0.0062, "step": 4270 }, { "epoch": 3.0527256478999107, "eval_loss": 0.009598931297659874, "eval_runtime": 4.611, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.819, "step": 4270 }, { "epoch": 3.0534405719392312, "grad_norm": 0.014344977214932442, "learning_rate": 3.943107563837135e-05, "loss": 0.0062, "step": 4271 }, { "epoch": 3.054155495978552, "grad_norm": 0.02864030748605728, "learning_rate": 3.9406672203450504e-05, "loss": 0.0143, "step": 4272 }, { "epoch": 3.054870420017873, "grad_norm": 0.01845909282565117, "learning_rate": 3.9382271410286095e-05, "loss": 0.0055, "step": 4273 }, { "epoch": 3.055585344057194, "grad_norm": 0.025065066292881966, "learning_rate": 3.9357873264963206e-05, "loss": 0.0098, "step": 4274 }, { "epoch": 3.0563002680965146, "grad_norm": 0.02039424329996109, "learning_rate": 3.9333477773566204e-05, "loss": 0.0068, "step": 4275 }, { "epoch": 3.0563002680965146, "eval_loss": 0.009558380581438541, "eval_runtime": 4.5827, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4275 }, { "epoch": 3.0570151921358355, "grad_norm": 0.024535540491342545, "learning_rate": 3.930908494217884e-05, "loss": 0.0078, "step": 4276 }, { "epoch": 3.0577301161751564, "grad_norm": 0.02315269224345684, "learning_rate": 3.928469477688415e-05, "loss": 0.0083, "step": 4277 }, { "epoch": 3.0584450402144774, "grad_norm": 0.02579404227435589, "learning_rate": 3.9260307283764575e-05, "loss": 0.0101, "step": 4278 }, { "epoch": 3.059159964253798, "grad_norm": 0.02378762699663639, "learning_rate": 3.923592246890183e-05, "loss": 0.0088, "step": 4279 }, { "epoch": 3.059874888293119, "grad_norm": 0.02602572739124298, "learning_rate": 3.921154033837698e-05, "loss": 0.0096, "step": 4280 }, { "epoch": 3.059874888293119, "eval_loss": 0.009445971809327602, "eval_runtime": 4.5937, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 4280 }, { "epoch": 3.0605898123324398, "grad_norm": 0.021926622837781906, "learning_rate": 3.918716089827044e-05, "loss": 0.0092, "step": 4281 }, { "epoch": 3.0613047363717607, "grad_norm": 0.025560412555933, 
"learning_rate": 3.916278415466193e-05, "loss": 0.0096, "step": 4282 }, { "epoch": 3.062019660411081, "grad_norm": 0.02383461594581604, "learning_rate": 3.9138410113630485e-05, "loss": 0.0069, "step": 4283 }, { "epoch": 3.062734584450402, "grad_norm": 0.017218029126524925, "learning_rate": 3.91140387812545e-05, "loss": 0.0062, "step": 4284 }, { "epoch": 3.063449508489723, "grad_norm": 0.024038225412368774, "learning_rate": 3.908967016361169e-05, "loss": 0.0121, "step": 4285 }, { "epoch": 3.063449508489723, "eval_loss": 0.0094751613214612, "eval_runtime": 4.6102, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 4285 }, { "epoch": 3.0641644325290436, "grad_norm": 0.01994771510362625, "learning_rate": 3.9065304266779076e-05, "loss": 0.0087, "step": 4286 }, { "epoch": 3.0648793565683645, "grad_norm": 0.02127186208963394, "learning_rate": 3.9040941096833e-05, "loss": 0.0101, "step": 4287 }, { "epoch": 3.0655942806076855, "grad_norm": 0.027766376733779907, "learning_rate": 3.9016580659849136e-05, "loss": 0.0072, "step": 4288 }, { "epoch": 3.0663092046470064, "grad_norm": 0.01979934796690941, "learning_rate": 3.899222296190248e-05, "loss": 0.0055, "step": 4289 }, { "epoch": 3.067024128686327, "grad_norm": 0.019384291023015976, "learning_rate": 3.8967868009067334e-05, "loss": 0.0075, "step": 4290 }, { "epoch": 3.067024128686327, "eval_loss": 0.009370317682623863, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4290 }, { "epoch": 3.067739052725648, "grad_norm": 0.024777906015515327, "learning_rate": 3.894351580741731e-05, "loss": 0.008, "step": 4291 }, { "epoch": 3.068453976764969, "grad_norm": 0.017844192683696747, "learning_rate": 3.891916636302535e-05, "loss": 0.0071, "step": 4292 }, { "epoch": 3.0691689008042897, "grad_norm": 0.029085030779242516, "learning_rate": 3.88948196819637e-05, "loss": 0.0063, "step": 4293 }, { "epoch": 3.06988382484361, "grad_norm": 0.02034417912364006, "learning_rate": 3.8870475770303924e-05, "loss": 0.0058, "step": 4294 }, { "epoch": 3.070598748882931, "grad_norm": 0.019516916945576668, "learning_rate": 3.884613463411687e-05, "loss": 0.0087, "step": 4295 }, { "epoch": 3.070598748882931, "eval_loss": 0.009307547472417355, "eval_runtime": 4.6078, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 4295 }, { "epoch": 3.071313672922252, "grad_norm": 0.020789984613656998, "learning_rate": 3.882179627947273e-05, "loss": 0.0068, "step": 4296 }, { "epoch": 3.072028596961573, "grad_norm": 0.020827125757932663, "learning_rate": 3.879746071244099e-05, "loss": 0.0077, "step": 4297 }, { "epoch": 3.0727435210008935, "grad_norm": 0.021666264161467552, "learning_rate": 3.877312793909042e-05, "loss": 0.0073, "step": 4298 }, { "epoch": 3.0734584450402145, "grad_norm": 0.039935749024152756, "learning_rate": 3.8748797965489105e-05, "loss": 0.0086, "step": 4299 }, { "epoch": 3.0741733690795354, "grad_norm": 0.027774544432759285, "learning_rate": 3.872447079770446e-05, "loss": 0.0083, "step": 4300 }, { "epoch": 3.0741733690795354, "eval_loss": 0.009415005333721638, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4300 }, { "epoch": 3.074888293118856, "grad_norm": 0.02237977646291256, "learning_rate": 3.8700146441803154e-05, "loss": 0.0088, "step": 4301 }, { "epoch": 3.075603217158177, "grad_norm": 0.02226019836962223, "learning_rate": 3.86758249038512e-05, "loss": 0.0075, "step": 4302 }, { "epoch": 3.076318141197498, "grad_norm": 
0.03144821897149086, "learning_rate": 3.865150618991388e-05, "loss": 0.0125, "step": 4303 }, { "epoch": 3.0770330652368187, "grad_norm": 0.015144586563110352, "learning_rate": 3.8627190306055746e-05, "loss": 0.0049, "step": 4304 }, { "epoch": 3.0777479892761392, "grad_norm": 0.028286458924412727, "learning_rate": 3.860287725834072e-05, "loss": 0.0122, "step": 4305 }, { "epoch": 3.0777479892761392, "eval_loss": 0.009436728432774544, "eval_runtime": 4.611, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.819, "step": 4305 }, { "epoch": 3.07846291331546, "grad_norm": 0.025195255875587463, "learning_rate": 3.857856705283195e-05, "loss": 0.0094, "step": 4306 }, { "epoch": 3.079177837354781, "grad_norm": 0.026246435940265656, "learning_rate": 3.8554259695591905e-05, "loss": 0.0079, "step": 4307 }, { "epoch": 3.079892761394102, "grad_norm": 0.020666578784585, "learning_rate": 3.852995519268232e-05, "loss": 0.0072, "step": 4308 }, { "epoch": 3.0806076854334226, "grad_norm": 0.02023998647928238, "learning_rate": 3.850565355016425e-05, "loss": 0.0089, "step": 4309 }, { "epoch": 3.0813226094727435, "grad_norm": 0.023900602012872696, "learning_rate": 3.8481354774098025e-05, "loss": 0.0072, "step": 4310 }, { "epoch": 3.0813226094727435, "eval_loss": 0.00918580312281847, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4310 }, { "epoch": 3.0820375335120644, "grad_norm": 0.023471901193261147, "learning_rate": 3.845705887054324e-05, "loss": 0.0082, "step": 4311 }, { "epoch": 3.0827524575513854, "grad_norm": 0.027832020074129105, "learning_rate": 3.8432765845558785e-05, "loss": 0.0129, "step": 4312 }, { "epoch": 3.083467381590706, "grad_norm": 0.023098548874258995, "learning_rate": 3.840847570520287e-05, "loss": 0.0071, "step": 4313 }, { "epoch": 3.084182305630027, "grad_norm": 0.018786150962114334, "learning_rate": 3.8384188455532935e-05, "loss": 0.0072, "step": 4314 }, { "epoch": 3.0848972296693478, "grad_norm": 0.017724404111504555, "learning_rate": 3.8359904102605705e-05, "loss": 0.0068, "step": 4315 }, { "epoch": 3.0848972296693478, "eval_loss": 0.009226296097040176, "eval_runtime": 4.5801, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 4315 }, { "epoch": 3.0856121537086683, "grad_norm": 0.026719966903328896, "learning_rate": 3.83356226524772e-05, "loss": 0.0092, "step": 4316 }, { "epoch": 3.086327077747989, "grad_norm": 0.017791079357266426, "learning_rate": 3.831134411120272e-05, "loss": 0.0068, "step": 4317 }, { "epoch": 3.08704200178731, "grad_norm": 0.019647816196084023, "learning_rate": 3.828706848483683e-05, "loss": 0.0066, "step": 4318 }, { "epoch": 3.087756925826631, "grad_norm": 0.021482357755303383, "learning_rate": 3.8262795779433344e-05, "loss": 0.0098, "step": 4319 }, { "epoch": 3.0884718498659516, "grad_norm": 0.021130772307515144, "learning_rate": 3.8238526001045416e-05, "loss": 0.0101, "step": 4320 }, { "epoch": 3.0884718498659516, "eval_loss": 0.009521499276161194, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4320 }, { "epoch": 3.0891867739052725, "grad_norm": 0.03435303270816803, "learning_rate": 3.8214259155725366e-05, "loss": 0.0102, "step": 4321 }, { "epoch": 3.0899016979445935, "grad_norm": 0.024233326315879822, "learning_rate": 3.818999524952491e-05, "loss": 0.0095, "step": 4322 }, { "epoch": 3.0906166219839144, "grad_norm": 0.026600364595651627, "learning_rate": 3.816573428849492e-05, "loss": 0.0098, "step": 4323 }, { "epoch": 
3.091331546023235, "grad_norm": 0.01968393847346306, "learning_rate": 3.81414762786856e-05, "loss": 0.0071, "step": 4324 }, { "epoch": 3.092046470062556, "grad_norm": 0.023724298924207687, "learning_rate": 3.811722122614636e-05, "loss": 0.0091, "step": 4325 }, { "epoch": 3.092046470062556, "eval_loss": 0.009406503289937973, "eval_runtime": 4.5936, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 4325 }, { "epoch": 3.092761394101877, "grad_norm": 0.01673417165875435, "learning_rate": 3.809296913692594e-05, "loss": 0.005, "step": 4326 }, { "epoch": 3.0934763181411977, "grad_norm": 0.020058438181877136, "learning_rate": 3.80687200170723e-05, "loss": 0.0085, "step": 4327 }, { "epoch": 3.094191242180518, "grad_norm": 0.018812282010912895, "learning_rate": 3.8044473872632666e-05, "loss": 0.0059, "step": 4328 }, { "epoch": 3.094906166219839, "grad_norm": 0.023540930822491646, "learning_rate": 3.802023070965353e-05, "loss": 0.0059, "step": 4329 }, { "epoch": 3.09562109025916, "grad_norm": 0.018830053508281708, "learning_rate": 3.799599053418062e-05, "loss": 0.0065, "step": 4330 }, { "epoch": 3.09562109025916, "eval_loss": 0.009372654370963573, "eval_runtime": 4.5957, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 4330 }, { "epoch": 3.0963360142984806, "grad_norm": 0.020443009212613106, "learning_rate": 3.797175335225895e-05, "loss": 0.0067, "step": 4331 }, { "epoch": 3.0970509383378015, "grad_norm": 0.020706580951809883, "learning_rate": 3.7947519169932754e-05, "loss": 0.0109, "step": 4332 }, { "epoch": 3.0977658623771225, "grad_norm": 0.026608815416693687, "learning_rate": 3.792328799324555e-05, "loss": 0.0118, "step": 4333 }, { "epoch": 3.0984807864164434, "grad_norm": 0.024171434342861176, "learning_rate": 3.78990598282401e-05, "loss": 0.006, "step": 4334 }, { "epoch": 3.099195710455764, "grad_norm": 0.025390109047293663, "learning_rate": 3.7874834680958374e-05, "loss": 0.0123, "step": 4335 }, { "epoch": 3.099195710455764, "eval_loss": 0.009359048679471016, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 4335 }, { "epoch": 3.099910634495085, "grad_norm": 0.01927867904305458, "learning_rate": 3.7850612557441645e-05, "loss": 0.0054, "step": 4336 }, { "epoch": 3.100625558534406, "grad_norm": 0.022291477769613266, "learning_rate": 3.78263934637304e-05, "loss": 0.0091, "step": 4337 }, { "epoch": 3.1013404825737267, "grad_norm": 0.01935274712741375, "learning_rate": 3.7802177405864394e-05, "loss": 0.0058, "step": 4338 }, { "epoch": 3.1020554066130472, "grad_norm": 0.01832299679517746, "learning_rate": 3.7777964389882596e-05, "loss": 0.0059, "step": 4339 }, { "epoch": 3.102770330652368, "grad_norm": 0.03264039382338524, "learning_rate": 3.7753754421823226e-05, "loss": 0.0112, "step": 4340 }, { "epoch": 3.102770330652368, "eval_loss": 0.0092331413179636, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 4340 }, { "epoch": 3.103485254691689, "grad_norm": 0.02541413903236389, "learning_rate": 3.772954750772376e-05, "loss": 0.0097, "step": 4341 }, { "epoch": 3.1042001787310096, "grad_norm": 0.029183246195316315, "learning_rate": 3.770534365362089e-05, "loss": 0.0133, "step": 4342 }, { "epoch": 3.1049151027703306, "grad_norm": 0.02172701805830002, "learning_rate": 3.7681142865550554e-05, "loss": 0.0104, "step": 4343 }, { "epoch": 3.1056300268096515, "grad_norm": 0.026894867420196533, "learning_rate": 3.7656945149547955e-05, "loss": 0.0095, "step": 4344 }, { 
"epoch": 3.1063449508489724, "grad_norm": 0.021764526143670082, "learning_rate": 3.763275051164747e-05, "loss": 0.0106, "step": 4345 }, { "epoch": 3.1063449508489724, "eval_loss": 0.009236921556293964, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 4345 }, { "epoch": 3.107059874888293, "grad_norm": 0.018756667152047157, "learning_rate": 3.760855895788277e-05, "loss": 0.0053, "step": 4346 }, { "epoch": 3.107774798927614, "grad_norm": 0.018746787682175636, "learning_rate": 3.7584370494286697e-05, "loss": 0.0081, "step": 4347 }, { "epoch": 3.108489722966935, "grad_norm": 0.022109612822532654, "learning_rate": 3.7560185126891376e-05, "loss": 0.0089, "step": 4348 }, { "epoch": 3.1092046470062558, "grad_norm": 0.020315352827310562, "learning_rate": 3.753600286172811e-05, "loss": 0.0073, "step": 4349 }, { "epoch": 3.1099195710455763, "grad_norm": 0.022820528596639633, "learning_rate": 3.7511823704827484e-05, "loss": 0.0055, "step": 4350 }, { "epoch": 3.1099195710455763, "eval_loss": 0.009490014053881168, "eval_runtime": 4.5884, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 4350 }, { "epoch": 3.110634495084897, "grad_norm": 0.03130174055695534, "learning_rate": 3.748764766221926e-05, "loss": 0.0118, "step": 4351 }, { "epoch": 3.111349419124218, "grad_norm": 0.02669333666563034, "learning_rate": 3.746347473993245e-05, "loss": 0.0065, "step": 4352 }, { "epoch": 3.112064343163539, "grad_norm": 0.03023650497198105, "learning_rate": 3.743930494399527e-05, "loss": 0.0094, "step": 4353 }, { "epoch": 3.1127792672028596, "grad_norm": 0.023654460906982422, "learning_rate": 3.741513828043519e-05, "loss": 0.0081, "step": 4354 }, { "epoch": 3.1134941912421805, "grad_norm": 0.026330560445785522, "learning_rate": 3.739097475527885e-05, "loss": 0.0154, "step": 4355 }, { "epoch": 3.1134941912421805, "eval_loss": 0.00953325442969799, "eval_runtime": 4.6093, "eval_samples_per_second": 10.848, "eval_steps_per_second": 2.82, "step": 4355 }, { "epoch": 3.1142091152815015, "grad_norm": 0.02079145424067974, "learning_rate": 3.7366814374552136e-05, "loss": 0.0061, "step": 4356 }, { "epoch": 3.114924039320822, "grad_norm": 0.02160513401031494, "learning_rate": 3.734265714428016e-05, "loss": 0.0069, "step": 4357 }, { "epoch": 3.115638963360143, "grad_norm": 0.019308220595121384, "learning_rate": 3.731850307048723e-05, "loss": 0.0072, "step": 4358 }, { "epoch": 3.116353887399464, "grad_norm": 0.02168116718530655, "learning_rate": 3.729435215919686e-05, "loss": 0.007, "step": 4359 }, { "epoch": 3.117068811438785, "grad_norm": 0.022732751443982124, "learning_rate": 3.72702044164318e-05, "loss": 0.0058, "step": 4360 }, { "epoch": 3.117068811438785, "eval_loss": 0.00938111636787653, "eval_runtime": 4.5986, "eval_samples_per_second": 10.873, "eval_steps_per_second": 2.827, "step": 4360 }, { "epoch": 3.1177837354781053, "grad_norm": 0.017305763438344002, "learning_rate": 3.724605984821399e-05, "loss": 0.0051, "step": 4361 }, { "epoch": 3.118498659517426, "grad_norm": 0.022578684613108635, "learning_rate": 3.7221918460564596e-05, "loss": 0.0095, "step": 4362 }, { "epoch": 3.119213583556747, "grad_norm": 0.024611564353108406, "learning_rate": 3.719778025950397e-05, "loss": 0.011, "step": 4363 }, { "epoch": 3.119928507596068, "grad_norm": 0.02294464409351349, "learning_rate": 3.717364525105166e-05, "loss": 0.0099, "step": 4364 }, { "epoch": 3.1206434316353886, "grad_norm": 0.04318814352154732, "learning_rate": 3.714951344122647e-05, "loss": 0.0087, 
"step": 4365 }, { "epoch": 3.1206434316353886, "eval_loss": 0.009124482981860638, "eval_runtime": 4.6175, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.815, "step": 4365 }, { "epoch": 3.1213583556747095, "grad_norm": 0.01739954948425293, "learning_rate": 3.712538483604634e-05, "loss": 0.0047, "step": 4366 }, { "epoch": 3.1220732797140305, "grad_norm": 0.018636411055922508, "learning_rate": 3.710125944152849e-05, "loss": 0.0067, "step": 4367 }, { "epoch": 3.1227882037533514, "grad_norm": 0.02380586788058281, "learning_rate": 3.707713726368926e-05, "loss": 0.0091, "step": 4368 }, { "epoch": 3.123503127792672, "grad_norm": 0.017610732465982437, "learning_rate": 3.705301830854423e-05, "loss": 0.007, "step": 4369 }, { "epoch": 3.124218051831993, "grad_norm": 0.02175714634358883, "learning_rate": 3.702890258210817e-05, "loss": 0.0083, "step": 4370 }, { "epoch": 3.124218051831993, "eval_loss": 0.009385120123624802, "eval_runtime": 4.5997, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 4370 }, { "epoch": 3.124932975871314, "grad_norm": 0.017425309866666794, "learning_rate": 3.700479009039504e-05, "loss": 0.0062, "step": 4371 }, { "epoch": 3.1256478999106343, "grad_norm": 0.02059403993189335, "learning_rate": 3.6980680839417994e-05, "loss": 0.0077, "step": 4372 }, { "epoch": 3.1263628239499552, "grad_norm": 0.025916798040270805, "learning_rate": 3.6956574835189374e-05, "loss": 0.0101, "step": 4373 }, { "epoch": 3.127077747989276, "grad_norm": 0.025598855689167976, "learning_rate": 3.693247208372074e-05, "loss": 0.0111, "step": 4374 }, { "epoch": 3.127792672028597, "grad_norm": 0.02045896090567112, "learning_rate": 3.6908372591022786e-05, "loss": 0.0066, "step": 4375 }, { "epoch": 3.127792672028597, "eval_loss": 0.009231746196746826, "eval_runtime": 4.5777, "eval_samples_per_second": 10.923, "eval_steps_per_second": 2.84, "step": 4375 }, { "epoch": 3.1285075960679176, "grad_norm": 0.022661443799734116, "learning_rate": 3.688427636310545e-05, "loss": 0.0108, "step": 4376 }, { "epoch": 3.1292225201072386, "grad_norm": 0.021680600941181183, "learning_rate": 3.686018340597783e-05, "loss": 0.0079, "step": 4377 }, { "epoch": 3.1299374441465595, "grad_norm": 0.022991230711340904, "learning_rate": 3.6836093725648205e-05, "loss": 0.0185, "step": 4378 }, { "epoch": 3.1306523681858804, "grad_norm": 0.02031269483268261, "learning_rate": 3.681200732812405e-05, "loss": 0.0091, "step": 4379 }, { "epoch": 3.131367292225201, "grad_norm": 0.021789981052279472, "learning_rate": 3.678792421941199e-05, "loss": 0.0058, "step": 4380 }, { "epoch": 3.131367292225201, "eval_loss": 0.009247686713933945, "eval_runtime": 4.5849, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4380 }, { "epoch": 3.132082216264522, "grad_norm": 0.022000368684530258, "learning_rate": 3.676384440551788e-05, "loss": 0.0097, "step": 4381 }, { "epoch": 3.132797140303843, "grad_norm": 0.026252830401062965, "learning_rate": 3.673976789244672e-05, "loss": 0.0069, "step": 4382 }, { "epoch": 3.1335120643431633, "grad_norm": 0.028936391696333885, "learning_rate": 3.671569468620268e-05, "loss": 0.0135, "step": 4383 }, { "epoch": 3.1342269883824843, "grad_norm": 0.019274426624178886, "learning_rate": 3.669162479278914e-05, "loss": 0.0054, "step": 4384 }, { "epoch": 3.134941912421805, "grad_norm": 0.018233584240078926, "learning_rate": 3.6667558218208616e-05, "loss": 0.0055, "step": 4385 }, { "epoch": 3.134941912421805, "eval_loss": 0.009103406220674515, "eval_runtime": 4.5887, 
"eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 4385 }, { "epoch": 3.135656836461126, "grad_norm": 0.024694060906767845, "learning_rate": 3.6643494968462824e-05, "loss": 0.0093, "step": 4386 }, { "epoch": 3.1363717605004466, "grad_norm": 0.023434119299054146, "learning_rate": 3.661943504955263e-05, "loss": 0.0094, "step": 4387 }, { "epoch": 3.1370866845397676, "grad_norm": 0.023040786385536194, "learning_rate": 3.659537846747806e-05, "loss": 0.0135, "step": 4388 }, { "epoch": 3.1378016085790885, "grad_norm": 0.01911139488220215, "learning_rate": 3.657132522823837e-05, "loss": 0.0075, "step": 4389 }, { "epoch": 3.1385165326184095, "grad_norm": 0.016799790784716606, "learning_rate": 3.654727533783192e-05, "loss": 0.0055, "step": 4390 }, { "epoch": 3.1385165326184095, "eval_loss": 0.009075229987502098, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 4390 }, { "epoch": 3.13923145665773, "grad_norm": 0.020267484709620476, "learning_rate": 3.6523228802256264e-05, "loss": 0.007, "step": 4391 }, { "epoch": 3.139946380697051, "grad_norm": 0.02119622379541397, "learning_rate": 3.64991856275081e-05, "loss": 0.0077, "step": 4392 }, { "epoch": 3.140661304736372, "grad_norm": 0.02416282519698143, "learning_rate": 3.647514581958328e-05, "loss": 0.0092, "step": 4393 }, { "epoch": 3.141376228775693, "grad_norm": 0.021991239860653877, "learning_rate": 3.645110938447687e-05, "loss": 0.0059, "step": 4394 }, { "epoch": 3.1420911528150133, "grad_norm": 0.019183039665222168, "learning_rate": 3.642707632818304e-05, "loss": 0.0065, "step": 4395 }, { "epoch": 3.1420911528150133, "eval_loss": 0.009084822610020638, "eval_runtime": 4.5804, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4395 }, { "epoch": 3.142806076854334, "grad_norm": 0.016596969217061996, "learning_rate": 3.640304665669514e-05, "loss": 0.0058, "step": 4396 }, { "epoch": 3.143521000893655, "grad_norm": 0.026011241599917412, "learning_rate": 3.637902037600564e-05, "loss": 0.0101, "step": 4397 }, { "epoch": 3.1442359249329757, "grad_norm": 0.018245477229356766, "learning_rate": 3.635499749210626e-05, "loss": 0.0086, "step": 4398 }, { "epoch": 3.1449508489722966, "grad_norm": 0.023696420714259148, "learning_rate": 3.633097801098777e-05, "loss": 0.0093, "step": 4399 }, { "epoch": 3.1456657730116175, "grad_norm": 0.023180760443210602, "learning_rate": 3.630696193864012e-05, "loss": 0.0097, "step": 4400 }, { "epoch": 3.1456657730116175, "eval_loss": 0.009174310602247715, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4400 }, { "epoch": 3.1463806970509385, "grad_norm": 0.02460496500134468, "learning_rate": 3.628294928105245e-05, "loss": 0.0119, "step": 4401 }, { "epoch": 3.147095621090259, "grad_norm": 0.025042230263352394, "learning_rate": 3.6258940044213e-05, "loss": 0.0073, "step": 4402 }, { "epoch": 3.14781054512958, "grad_norm": 0.02232506312429905, "learning_rate": 3.6234934234109194e-05, "loss": 0.0096, "step": 4403 }, { "epoch": 3.148525469168901, "grad_norm": 0.018683794885873795, "learning_rate": 3.6210931856727545e-05, "loss": 0.0073, "step": 4404 }, { "epoch": 3.149240393208222, "grad_norm": 0.019316058605909348, "learning_rate": 3.6186932918053805e-05, "loss": 0.0081, "step": 4405 }, { "epoch": 3.149240393208222, "eval_loss": 0.00917768944054842, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 4405 }, { "epoch": 3.1499553172475423, 
"grad_norm": 0.02187584899365902, "learning_rate": 3.616293742407277e-05, "loss": 0.0048, "step": 4406 }, { "epoch": 3.1506702412868632, "grad_norm": 0.02016156166791916, "learning_rate": 3.613894538076844e-05, "loss": 0.0075, "step": 4407 }, { "epoch": 3.151385165326184, "grad_norm": 0.01905989460647106, "learning_rate": 3.611495679412391e-05, "loss": 0.0065, "step": 4408 }, { "epoch": 3.152100089365505, "grad_norm": 0.026708899065852165, "learning_rate": 3.609097167012147e-05, "loss": 0.0084, "step": 4409 }, { "epoch": 3.1528150134048256, "grad_norm": 0.023920375853776932, "learning_rate": 3.606699001474246e-05, "loss": 0.008, "step": 4410 }, { "epoch": 3.1528150134048256, "eval_loss": 0.009244062937796116, "eval_runtime": 4.5794, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 4410 }, { "epoch": 3.1535299374441466, "grad_norm": 0.020904457196593285, "learning_rate": 3.604301183396747e-05, "loss": 0.0084, "step": 4411 }, { "epoch": 3.1542448614834675, "grad_norm": 0.021034955978393555, "learning_rate": 3.601903713377613e-05, "loss": 0.0066, "step": 4412 }, { "epoch": 3.154959785522788, "grad_norm": 0.026376191526651382, "learning_rate": 3.599506592014723e-05, "loss": 0.0077, "step": 4413 }, { "epoch": 3.155674709562109, "grad_norm": 0.02649685926735401, "learning_rate": 3.597109819905872e-05, "loss": 0.0063, "step": 4414 }, { "epoch": 3.15638963360143, "grad_norm": 0.023620450869202614, "learning_rate": 3.5947133976487626e-05, "loss": 0.0076, "step": 4415 }, { "epoch": 3.15638963360143, "eval_loss": 0.009337508119642735, "eval_runtime": 4.5857, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4415 }, { "epoch": 3.157104557640751, "grad_norm": 0.02397162653505802, "learning_rate": 3.5923173258410136e-05, "loss": 0.0071, "step": 4416 }, { "epoch": 3.1578194816800713, "grad_norm": 0.026134056970477104, "learning_rate": 3.589921605080155e-05, "loss": 0.0094, "step": 4417 }, { "epoch": 3.1585344057193923, "grad_norm": 0.026431819424033165, "learning_rate": 3.587526235963631e-05, "loss": 0.0063, "step": 4418 }, { "epoch": 3.159249329758713, "grad_norm": 0.02131429873406887, "learning_rate": 3.5851312190887975e-05, "loss": 0.0115, "step": 4419 }, { "epoch": 3.159964253798034, "grad_norm": 0.015126435086131096, "learning_rate": 3.582736555052921e-05, "loss": 0.0055, "step": 4420 }, { "epoch": 3.159964253798034, "eval_loss": 0.009355567395687103, "eval_runtime": 4.6023, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 4420 }, { "epoch": 3.1606791778373546, "grad_norm": 0.02860107086598873, "learning_rate": 3.580342244453181e-05, "loss": 0.0175, "step": 4421 }, { "epoch": 3.1613941018766756, "grad_norm": 0.01739524118602276, "learning_rate": 3.57794828788667e-05, "loss": 0.0074, "step": 4422 }, { "epoch": 3.1621090259159965, "grad_norm": 0.020116815343499184, "learning_rate": 3.5755546859503894e-05, "loss": 0.0076, "step": 4423 }, { "epoch": 3.1628239499553175, "grad_norm": 0.019724121317267418, "learning_rate": 3.573161439241256e-05, "loss": 0.0079, "step": 4424 }, { "epoch": 3.163538873994638, "grad_norm": 0.025515824556350708, "learning_rate": 3.570768548356095e-05, "loss": 0.0073, "step": 4425 }, { "epoch": 3.163538873994638, "eval_loss": 0.009199392981827259, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4425 }, { "epoch": 3.164253798033959, "grad_norm": 0.026934996247291565, "learning_rate": 3.5683760138916435e-05, "loss": 0.0052, "step": 4426 }, { "epoch": 
3.16496872207328, "grad_norm": 0.027431128546595573, "learning_rate": 3.5659838364445505e-05, "loss": 0.0145, "step": 4427 }, { "epoch": 3.1656836461126003, "grad_norm": 0.021702542901039124, "learning_rate": 3.5635920166113735e-05, "loss": 0.0066, "step": 4428 }, { "epoch": 3.1663985701519213, "grad_norm": 0.023822737857699394, "learning_rate": 3.5612005549885865e-05, "loss": 0.0073, "step": 4429 }, { "epoch": 3.167113494191242, "grad_norm": 0.02306945063173771, "learning_rate": 3.5588094521725684e-05, "loss": 0.0092, "step": 4430 }, { "epoch": 3.167113494191242, "eval_loss": 0.009339443407952785, "eval_runtime": 4.5823, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4430 }, { "epoch": 3.167828418230563, "grad_norm": 0.026211561635136604, "learning_rate": 3.5564187087596114e-05, "loss": 0.0073, "step": 4431 }, { "epoch": 3.1685433422698837, "grad_norm": 0.024678794667124748, "learning_rate": 3.554028325345914e-05, "loss": 0.0117, "step": 4432 }, { "epoch": 3.1692582663092046, "grad_norm": 0.02596781775355339, "learning_rate": 3.551638302527592e-05, "loss": 0.0072, "step": 4433 }, { "epoch": 3.1699731903485255, "grad_norm": 0.028395388275384903, "learning_rate": 3.549248640900668e-05, "loss": 0.0134, "step": 4434 }, { "epoch": 3.1706881143878465, "grad_norm": 0.03117554448544979, "learning_rate": 3.546859341061073e-05, "loss": 0.0072, "step": 4435 }, { "epoch": 3.1706881143878465, "eval_loss": 0.009348390623927116, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 4435 }, { "epoch": 3.171403038427167, "grad_norm": 0.023791303858160973, "learning_rate": 3.5444704036046484e-05, "loss": 0.007, "step": 4436 }, { "epoch": 3.172117962466488, "grad_norm": 0.022144997492432594, "learning_rate": 3.542081829127145e-05, "loss": 0.0108, "step": 4437 }, { "epoch": 3.172832886505809, "grad_norm": 0.02949981577694416, "learning_rate": 3.5396936182242254e-05, "loss": 0.0126, "step": 4438 }, { "epoch": 3.17354781054513, "grad_norm": 0.0235719233751297, "learning_rate": 3.53730577149146e-05, "loss": 0.0133, "step": 4439 }, { "epoch": 3.1742627345844503, "grad_norm": 0.0291193388402462, "learning_rate": 3.534918289524327e-05, "loss": 0.0153, "step": 4440 }, { "epoch": 3.1742627345844503, "eval_loss": 0.009433384984731674, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4440 }, { "epoch": 3.1749776586237712, "grad_norm": 0.025260714814066887, "learning_rate": 3.532531172918215e-05, "loss": 0.0113, "step": 4441 }, { "epoch": 3.175692582663092, "grad_norm": 0.02254706434905529, "learning_rate": 3.5301444222684224e-05, "loss": 0.007, "step": 4442 }, { "epoch": 3.1764075067024127, "grad_norm": 0.02134043350815773, "learning_rate": 3.5277580381701555e-05, "loss": 0.0094, "step": 4443 }, { "epoch": 3.1771224307417336, "grad_norm": 0.021154534071683884, "learning_rate": 3.525372021218528e-05, "loss": 0.0086, "step": 4444 }, { "epoch": 3.1778373547810546, "grad_norm": 0.023719891905784607, "learning_rate": 3.522986372008562e-05, "loss": 0.0086, "step": 4445 }, { "epoch": 3.1778373547810546, "eval_loss": 0.00911447498947382, "eval_runtime": 4.6185, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.815, "step": 4445 }, { "epoch": 3.1785522788203755, "grad_norm": 0.02894650585949421, "learning_rate": 3.520601091135192e-05, "loss": 0.0102, "step": 4446 }, { "epoch": 3.179267202859696, "grad_norm": 0.01716662384569645, "learning_rate": 3.518216179193256e-05, "loss": 0.0051, "step": 
4447 }, { "epoch": 3.179982126899017, "grad_norm": 0.020963463932275772, "learning_rate": 3.515831636777501e-05, "loss": 0.0074, "step": 4448 }, { "epoch": 3.180697050938338, "grad_norm": 0.0240678358823061, "learning_rate": 3.513447464482584e-05, "loss": 0.0078, "step": 4449 }, { "epoch": 3.181411974977659, "grad_norm": 0.03005722165107727, "learning_rate": 3.5110636629030676e-05, "loss": 0.0114, "step": 4450 }, { "epoch": 3.181411974977659, "eval_loss": 0.00904365349560976, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 4450 }, { "epoch": 3.1821268990169793, "grad_norm": 0.028214329853653908, "learning_rate": 3.508680232633422e-05, "loss": 0.0098, "step": 4451 }, { "epoch": 3.1828418230563003, "grad_norm": 0.023723484948277473, "learning_rate": 3.5062971742680243e-05, "loss": 0.0115, "step": 4452 }, { "epoch": 3.183556747095621, "grad_norm": 0.022923335433006287, "learning_rate": 3.503914488401163e-05, "loss": 0.0074, "step": 4453 }, { "epoch": 3.184271671134942, "grad_norm": 0.023634282872080803, "learning_rate": 3.501532175627026e-05, "loss": 0.0067, "step": 4454 }, { "epoch": 3.1849865951742626, "grad_norm": 0.023212138563394547, "learning_rate": 3.4991502365397177e-05, "loss": 0.0052, "step": 4455 }, { "epoch": 3.1849865951742626, "eval_loss": 0.008835136890411377, "eval_runtime": 4.6134, "eval_samples_per_second": 10.838, "eval_steps_per_second": 2.818, "step": 4455 }, { "epoch": 3.1857015192135836, "grad_norm": 0.02600695751607418, "learning_rate": 3.4967686717332425e-05, "loss": 0.0083, "step": 4456 }, { "epoch": 3.1864164432529045, "grad_norm": 0.026506440714001656, "learning_rate": 3.4943874818015115e-05, "loss": 0.0121, "step": 4457 }, { "epoch": 3.187131367292225, "grad_norm": 0.025321969762444496, "learning_rate": 3.4920066673383466e-05, "loss": 0.0095, "step": 4458 }, { "epoch": 3.187846291331546, "grad_norm": 0.024402422830462456, "learning_rate": 3.4896262289374735e-05, "loss": 0.0064, "step": 4459 }, { "epoch": 3.188561215370867, "grad_norm": 0.03320235759019852, "learning_rate": 3.4872461671925225e-05, "loss": 0.0105, "step": 4460 }, { "epoch": 3.188561215370867, "eval_loss": 0.008900300599634647, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 4460 }, { "epoch": 3.189276139410188, "grad_norm": 0.0205813217908144, "learning_rate": 3.484866482697032e-05, "loss": 0.0075, "step": 4461 }, { "epoch": 3.1899910634495083, "grad_norm": 0.023880314081907272, "learning_rate": 3.482487176044447e-05, "loss": 0.0081, "step": 4462 }, { "epoch": 3.1907059874888293, "grad_norm": 0.019289221614599228, "learning_rate": 3.4801082478281164e-05, "loss": 0.0061, "step": 4463 }, { "epoch": 3.19142091152815, "grad_norm": 0.02103133127093315, "learning_rate": 3.477729698641297e-05, "loss": 0.0056, "step": 4464 }, { "epoch": 3.192135835567471, "grad_norm": 0.029724642634391785, "learning_rate": 3.4753515290771475e-05, "loss": 0.0108, "step": 4465 }, { "epoch": 3.192135835567471, "eval_loss": 0.008676988072693348, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4465 }, { "epoch": 3.1928507596067917, "grad_norm": 0.02052401937544346, "learning_rate": 3.472973739728737e-05, "loss": 0.0071, "step": 4466 }, { "epoch": 3.1935656836461126, "grad_norm": 0.02312977984547615, "learning_rate": 3.470596331189033e-05, "loss": 0.0078, "step": 4467 }, { "epoch": 3.1942806076854335, "grad_norm": 0.020683646202087402, "learning_rate": 3.468219304050916e-05, 
"loss": 0.011, "step": 4468 }, { "epoch": 3.1949955317247545, "grad_norm": 0.021353788673877716, "learning_rate": 3.4658426589071636e-05, "loss": 0.008, "step": 4469 }, { "epoch": 3.195710455764075, "grad_norm": 0.022913722321391106, "learning_rate": 3.4634663963504654e-05, "loss": 0.0144, "step": 4470 }, { "epoch": 3.195710455764075, "eval_loss": 0.008730134926736355, "eval_runtime": 4.5777, "eval_samples_per_second": 10.923, "eval_steps_per_second": 2.84, "step": 4470 }, { "epoch": 3.196425379803396, "grad_norm": 0.02204093523323536, "learning_rate": 3.46109051697341e-05, "loss": 0.0093, "step": 4471 }, { "epoch": 3.197140303842717, "grad_norm": 0.02163214050233364, "learning_rate": 3.4587150213684915e-05, "loss": 0.0078, "step": 4472 }, { "epoch": 3.1978552278820374, "grad_norm": 0.020294999703764915, "learning_rate": 3.456339910128111e-05, "loss": 0.0082, "step": 4473 }, { "epoch": 3.1985701519213583, "grad_norm": 0.01528211124241352, "learning_rate": 3.453965183844573e-05, "loss": 0.0047, "step": 4474 }, { "epoch": 3.1992850759606792, "grad_norm": 0.022834250703454018, "learning_rate": 3.451590843110083e-05, "loss": 0.0077, "step": 4475 }, { "epoch": 3.1992850759606792, "eval_loss": 0.00907410029321909, "eval_runtime": 4.5771, "eval_samples_per_second": 10.924, "eval_steps_per_second": 2.84, "step": 4475 }, { "epoch": 3.2, "grad_norm": 0.026575613766908646, "learning_rate": 3.449216888516751e-05, "loss": 0.0096, "step": 4476 }, { "epoch": 3.2007149240393207, "grad_norm": 0.021158332005143166, "learning_rate": 3.4468433206565966e-05, "loss": 0.0101, "step": 4477 }, { "epoch": 3.2014298480786416, "grad_norm": 0.02338942140340805, "learning_rate": 3.444470140121534e-05, "loss": 0.0091, "step": 4478 }, { "epoch": 3.2021447721179626, "grad_norm": 0.02488544024527073, "learning_rate": 3.442097347503389e-05, "loss": 0.0078, "step": 4479 }, { "epoch": 3.202859696157283, "grad_norm": 0.022615358233451843, "learning_rate": 3.439724943393885e-05, "loss": 0.0064, "step": 4480 }, { "epoch": 3.202859696157283, "eval_loss": 0.009272659197449684, "eval_runtime": 4.5993, "eval_samples_per_second": 10.871, "eval_steps_per_second": 2.827, "step": 4480 }, { "epoch": 3.203574620196604, "grad_norm": 0.02307887189090252, "learning_rate": 3.437352928384649e-05, "loss": 0.0076, "step": 4481 }, { "epoch": 3.204289544235925, "grad_norm": 0.024622689932584763, "learning_rate": 3.4349813030672164e-05, "loss": 0.0066, "step": 4482 }, { "epoch": 3.205004468275246, "grad_norm": 0.02426758036017418, "learning_rate": 3.432610068033018e-05, "loss": 0.0082, "step": 4483 }, { "epoch": 3.2057193923145664, "grad_norm": 0.024406252428889275, "learning_rate": 3.4302392238733916e-05, "loss": 0.0079, "step": 4484 }, { "epoch": 3.2064343163538873, "grad_norm": 0.021258821710944176, "learning_rate": 3.4278687711795754e-05, "loss": 0.008, "step": 4485 }, { "epoch": 3.2064343163538873, "eval_loss": 0.009194295853376389, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 4485 }, { "epoch": 3.2071492403932083, "grad_norm": 0.022345347329974174, "learning_rate": 3.4254987105427136e-05, "loss": 0.0079, "step": 4486 }, { "epoch": 3.207864164432529, "grad_norm": 0.019613543525338173, "learning_rate": 3.4231290425538495e-05, "loss": 0.0065, "step": 4487 }, { "epoch": 3.2085790884718497, "grad_norm": 0.027092531323432922, "learning_rate": 3.4207597678039296e-05, "loss": 0.0124, "step": 4488 }, { "epoch": 3.2092940125111706, "grad_norm": 0.02242838777601719, "learning_rate": 
3.4183908868837986e-05, "loss": 0.0085, "step": 4489 }, { "epoch": 3.2100089365504916, "grad_norm": 0.020276226103305817, "learning_rate": 3.41602240038421e-05, "loss": 0.0051, "step": 4490 }, { "epoch": 3.2100089365504916, "eval_loss": 0.00896516814827919, "eval_runtime": 4.6001, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 4490 }, { "epoch": 3.2107238605898125, "grad_norm": 0.013854228891432285, "learning_rate": 3.413654308895814e-05, "loss": 0.0054, "step": 4491 }, { "epoch": 3.211438784629133, "grad_norm": 0.025300463661551476, "learning_rate": 3.411286613009165e-05, "loss": 0.0085, "step": 4492 }, { "epoch": 3.212153708668454, "grad_norm": 0.0215613953769207, "learning_rate": 3.4089193133147135e-05, "loss": 0.0089, "step": 4493 }, { "epoch": 3.212868632707775, "grad_norm": 0.03153610974550247, "learning_rate": 3.406552410402818e-05, "loss": 0.0117, "step": 4494 }, { "epoch": 3.2135835567470954, "grad_norm": 0.020202090963721275, "learning_rate": 3.404185904863736e-05, "loss": 0.008, "step": 4495 }, { "epoch": 3.2135835567470954, "eval_loss": 0.008893732912838459, "eval_runtime": 4.5986, "eval_samples_per_second": 10.873, "eval_steps_per_second": 2.827, "step": 4495 }, { "epoch": 3.2142984807864163, "grad_norm": 0.024858854711055756, "learning_rate": 3.401819797287621e-05, "loss": 0.0068, "step": 4496 }, { "epoch": 3.2150134048257373, "grad_norm": 0.020960181951522827, "learning_rate": 3.3994540882645354e-05, "loss": 0.0047, "step": 4497 }, { "epoch": 3.215728328865058, "grad_norm": 0.024249687790870667, "learning_rate": 3.397088778384434e-05, "loss": 0.0111, "step": 4498 }, { "epoch": 3.2164432529043787, "grad_norm": 0.019065657630562782, "learning_rate": 3.39472386823718e-05, "loss": 0.0101, "step": 4499 }, { "epoch": 3.2171581769436997, "grad_norm": 0.02112553082406521, "learning_rate": 3.392359358412532e-05, "loss": 0.0055, "step": 4500 }, { "epoch": 3.2171581769436997, "eval_loss": 0.009010551497340202, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4500 }, { "epoch": 3.2178731009830206, "grad_norm": 0.024211114272475243, "learning_rate": 3.3899952495001487e-05, "loss": 0.0091, "step": 4501 }, { "epoch": 3.2185880250223415, "grad_norm": 0.020825350657105446, "learning_rate": 3.387631542089589e-05, "loss": 0.0065, "step": 4502 }, { "epoch": 3.219302949061662, "grad_norm": 0.020344698801636696, "learning_rate": 3.385268236770315e-05, "loss": 0.0073, "step": 4503 }, { "epoch": 3.220017873100983, "grad_norm": 0.020682992413640022, "learning_rate": 3.382905334131684e-05, "loss": 0.007, "step": 4504 }, { "epoch": 3.220732797140304, "grad_norm": 0.013722988776862621, "learning_rate": 3.3805428347629555e-05, "loss": 0.0049, "step": 4505 }, { "epoch": 3.220732797140304, "eval_loss": 0.009079582057893276, "eval_runtime": 4.604, "eval_samples_per_second": 10.86, "eval_steps_per_second": 2.824, "step": 4505 }, { "epoch": 3.221447721179625, "grad_norm": 0.024558382108807564, "learning_rate": 3.378180739253289e-05, "loss": 0.0083, "step": 4506 }, { "epoch": 3.2221626452189454, "grad_norm": 0.021093357354402542, "learning_rate": 3.375819048191742e-05, "loss": 0.006, "step": 4507 }, { "epoch": 3.2228775692582663, "grad_norm": 0.027095403522253036, "learning_rate": 3.3734577621672694e-05, "loss": 0.0092, "step": 4508 }, { "epoch": 3.2235924932975872, "grad_norm": 0.02522853948175907, "learning_rate": 3.371096881768728e-05, "loss": 0.0078, "step": 4509 }, { "epoch": 3.2243074173369077, "grad_norm": 
0.01783503219485283, "learning_rate": 3.368736407584874e-05, "loss": 0.0063, "step": 4510 }, { "epoch": 3.2243074173369077, "eval_loss": 0.00901604164391756, "eval_runtime": 4.6312, "eval_samples_per_second": 10.796, "eval_steps_per_second": 2.807, "step": 4510 }, { "epoch": 3.2250223413762287, "grad_norm": 0.030565867200493813, "learning_rate": 3.366376340204359e-05, "loss": 0.0093, "step": 4511 }, { "epoch": 3.2257372654155496, "grad_norm": 0.014877456240355968, "learning_rate": 3.3640166802157356e-05, "loss": 0.0051, "step": 4512 }, { "epoch": 3.2264521894548706, "grad_norm": 0.02261616662144661, "learning_rate": 3.361657428207453e-05, "loss": 0.0055, "step": 4513 }, { "epoch": 3.227167113494191, "grad_norm": 0.01893637701869011, "learning_rate": 3.359298584767862e-05, "loss": 0.0048, "step": 4514 }, { "epoch": 3.227882037533512, "grad_norm": 0.029317505657672882, "learning_rate": 3.3569401504852074e-05, "loss": 0.0132, "step": 4515 }, { "epoch": 3.227882037533512, "eval_loss": 0.008952837437391281, "eval_runtime": 4.5867, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 4515 }, { "epoch": 3.228596961572833, "grad_norm": 0.02378537319600582, "learning_rate": 3.354582125947634e-05, "loss": 0.0084, "step": 4516 }, { "epoch": 3.229311885612154, "grad_norm": 0.017022496089339256, "learning_rate": 3.352224511743185e-05, "loss": 0.0063, "step": 4517 }, { "epoch": 3.2300268096514744, "grad_norm": 0.02702830545604229, "learning_rate": 3.3498673084597996e-05, "loss": 0.0094, "step": 4518 }, { "epoch": 3.2307417336907953, "grad_norm": 0.024629689753055573, "learning_rate": 3.347510516685317e-05, "loss": 0.0082, "step": 4519 }, { "epoch": 3.2314566577301163, "grad_norm": 0.024344323202967644, "learning_rate": 3.34515413700747e-05, "loss": 0.0063, "step": 4520 }, { "epoch": 3.2314566577301163, "eval_loss": 0.009098341688513756, "eval_runtime": 4.5784, "eval_samples_per_second": 10.921, "eval_steps_per_second": 2.839, "step": 4520 }, { "epoch": 3.232171581769437, "grad_norm": 0.0217114444822073, "learning_rate": 3.342798170013892e-05, "loss": 0.0076, "step": 4521 }, { "epoch": 3.2328865058087577, "grad_norm": 0.024540774524211884, "learning_rate": 3.340442616292112e-05, "loss": 0.0106, "step": 4522 }, { "epoch": 3.2336014298480786, "grad_norm": 0.02397649548947811, "learning_rate": 3.338087476429559e-05, "loss": 0.007, "step": 4523 }, { "epoch": 3.2343163538873996, "grad_norm": 0.01824873499572277, "learning_rate": 3.335732751013553e-05, "loss": 0.0056, "step": 4524 }, { "epoch": 3.23503127792672, "grad_norm": 0.018094133585691452, "learning_rate": 3.333378440631315e-05, "loss": 0.0058, "step": 4525 }, { "epoch": 3.23503127792672, "eval_loss": 0.009100818075239658, "eval_runtime": 4.5794, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 4525 }, { "epoch": 3.235746201966041, "grad_norm": 0.03883569687604904, "learning_rate": 3.3310245458699595e-05, "loss": 0.0155, "step": 4526 }, { "epoch": 3.236461126005362, "grad_norm": 0.024816880002617836, "learning_rate": 3.328671067316501e-05, "loss": 0.0094, "step": 4527 }, { "epoch": 3.237176050044683, "grad_norm": 0.02983127161860466, "learning_rate": 3.326318005557849e-05, "loss": 0.0072, "step": 4528 }, { "epoch": 3.2378909740840034, "grad_norm": 0.0300898477435112, "learning_rate": 3.323965361180805e-05, "loss": 0.0098, "step": 4529 }, { "epoch": 3.2386058981233243, "grad_norm": 0.02591671794652939, "learning_rate": 3.321613134772073e-05, "loss": 0.0085, "step": 4530 }, { "epoch": 3.2386058981233243, 
"eval_loss": 0.009110643528401852, "eval_runtime": 4.5887, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 4530 }, { "epoch": 3.2393208221626453, "grad_norm": 0.022740252315998077, "learning_rate": 3.319261326918248e-05, "loss": 0.0077, "step": 4531 }, { "epoch": 3.240035746201966, "grad_norm": 0.026189977303147316, "learning_rate": 3.3169099382058234e-05, "loss": 0.008, "step": 4532 }, { "epoch": 3.2407506702412867, "grad_norm": 0.021504445001482964, "learning_rate": 3.314558969221184e-05, "loss": 0.0061, "step": 4533 }, { "epoch": 3.2414655942806077, "grad_norm": 0.017204327508807182, "learning_rate": 3.312208420550615e-05, "loss": 0.0058, "step": 4534 }, { "epoch": 3.2421805183199286, "grad_norm": 0.024715131148695946, "learning_rate": 3.309858292780296e-05, "loss": 0.0093, "step": 4535 }, { "epoch": 3.2421805183199286, "eval_loss": 0.009054790250957012, "eval_runtime": 4.6144, "eval_samples_per_second": 10.836, "eval_steps_per_second": 2.817, "step": 4535 }, { "epoch": 3.2428954423592495, "grad_norm": 0.03114071860909462, "learning_rate": 3.307508586496297e-05, "loss": 0.0094, "step": 4536 }, { "epoch": 3.24361036639857, "grad_norm": 0.020537564530968666, "learning_rate": 3.305159302284587e-05, "loss": 0.0076, "step": 4537 }, { "epoch": 3.244325290437891, "grad_norm": 0.02100704424083233, "learning_rate": 3.30281044073103e-05, "loss": 0.0071, "step": 4538 }, { "epoch": 3.245040214477212, "grad_norm": 0.029521938413381577, "learning_rate": 3.300462002421382e-05, "loss": 0.0115, "step": 4539 }, { "epoch": 3.2457551385165324, "grad_norm": 0.024176716804504395, "learning_rate": 3.2981139879412965e-05, "loss": 0.006, "step": 4540 }, { "epoch": 3.2457551385165324, "eval_loss": 0.008953814394772053, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 4540 }, { "epoch": 3.2464700625558534, "grad_norm": 0.02755354903638363, "learning_rate": 3.295766397876317e-05, "loss": 0.0072, "step": 4541 }, { "epoch": 3.2471849865951743, "grad_norm": 0.024647153913974762, "learning_rate": 3.2934192328118865e-05, "loss": 0.0078, "step": 4542 }, { "epoch": 3.2478999106344952, "grad_norm": 0.024475425481796265, "learning_rate": 3.291072493333336e-05, "loss": 0.0056, "step": 4543 }, { "epoch": 3.2486148346738157, "grad_norm": 0.026538977399468422, "learning_rate": 3.288726180025898e-05, "loss": 0.0116, "step": 4544 }, { "epoch": 3.2493297587131367, "grad_norm": 0.019558623433113098, "learning_rate": 3.286380293474694e-05, "loss": 0.0055, "step": 4545 }, { "epoch": 3.2493297587131367, "eval_loss": 0.008893478661775589, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4545 }, { "epoch": 3.2500446827524576, "grad_norm": 0.028470085933804512, "learning_rate": 3.2840348342647354e-05, "loss": 0.0061, "step": 4546 }, { "epoch": 3.2507596067917786, "grad_norm": 0.01868400163948536, "learning_rate": 3.281689802980934e-05, "loss": 0.0055, "step": 4547 }, { "epoch": 3.251474530831099, "grad_norm": 0.025865623727440834, "learning_rate": 3.279345200208093e-05, "loss": 0.0089, "step": 4548 }, { "epoch": 3.25218945487042, "grad_norm": 0.020182382315397263, "learning_rate": 3.2770010265309056e-05, "loss": 0.0091, "step": 4549 }, { "epoch": 3.252904378909741, "grad_norm": 0.019868142902851105, "learning_rate": 3.2746572825339606e-05, "loss": 0.0075, "step": 4550 }, { "epoch": 3.252904378909741, "eval_loss": 0.008913086727261543, "eval_runtime": 4.6171, "eval_samples_per_second": 10.829, 
"eval_steps_per_second": 2.816, "step": 4550 }, { "epoch": 3.253619302949062, "grad_norm": 0.020933100953698158, "learning_rate": 3.27231396880174e-05, "loss": 0.0095, "step": 4551 }, { "epoch": 3.2543342269883824, "grad_norm": 0.01792929321527481, "learning_rate": 3.269971085918617e-05, "loss": 0.0056, "step": 4552 }, { "epoch": 3.2550491510277033, "grad_norm": 0.02148902602493763, "learning_rate": 3.267628634468857e-05, "loss": 0.0102, "step": 4553 }, { "epoch": 3.2557640750670243, "grad_norm": 0.025272903963923454, "learning_rate": 3.265286615036622e-05, "loss": 0.0083, "step": 4554 }, { "epoch": 3.2564789991063448, "grad_norm": 0.01972990296781063, "learning_rate": 3.2629450282059606e-05, "loss": 0.0074, "step": 4555 }, { "epoch": 3.2564789991063448, "eval_loss": 0.008802777156233788, "eval_runtime": 4.6035, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.824, "step": 4555 }, { "epoch": 3.2571939231456657, "grad_norm": 0.022155826911330223, "learning_rate": 3.260603874560817e-05, "loss": 0.0089, "step": 4556 }, { "epoch": 3.2579088471849866, "grad_norm": 0.016729773953557014, "learning_rate": 3.258263154685025e-05, "loss": 0.0064, "step": 4557 }, { "epoch": 3.2586237712243076, "grad_norm": 0.021275801584124565, "learning_rate": 3.255922869162313e-05, "loss": 0.0073, "step": 4558 }, { "epoch": 3.259338695263628, "grad_norm": 0.030011717230081558, "learning_rate": 3.2535830185763e-05, "loss": 0.0108, "step": 4559 }, { "epoch": 3.260053619302949, "grad_norm": 0.0276069026440382, "learning_rate": 3.2512436035104964e-05, "loss": 0.0061, "step": 4560 }, { "epoch": 3.260053619302949, "eval_loss": 0.008780141361057758, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4560 }, { "epoch": 3.26076854334227, "grad_norm": 0.02718374878168106, "learning_rate": 3.2489046245483016e-05, "loss": 0.0083, "step": 4561 }, { "epoch": 3.261483467381591, "grad_norm": 0.022329220548272133, "learning_rate": 3.246566082273011e-05, "loss": 0.0068, "step": 4562 }, { "epoch": 3.2621983914209114, "grad_norm": 0.016787545755505562, "learning_rate": 3.244227977267807e-05, "loss": 0.0067, "step": 4563 }, { "epoch": 3.2629133154602323, "grad_norm": 0.03314310312271118, "learning_rate": 3.241890310115766e-05, "loss": 0.0192, "step": 4564 }, { "epoch": 3.2636282394995533, "grad_norm": 0.019374795258045197, "learning_rate": 3.239553081399851e-05, "loss": 0.0054, "step": 4565 }, { "epoch": 3.2636282394995533, "eval_loss": 0.008734794333577156, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4565 }, { "epoch": 3.264343163538874, "grad_norm": 0.02085433341562748, "learning_rate": 3.23721629170292e-05, "loss": 0.0092, "step": 4566 }, { "epoch": 3.2650580875781947, "grad_norm": 0.019691025838255882, "learning_rate": 3.234879941607722e-05, "loss": 0.0049, "step": 4567 }, { "epoch": 3.2657730116175157, "grad_norm": 0.02145727351307869, "learning_rate": 3.232544031696893e-05, "loss": 0.01, "step": 4568 }, { "epoch": 3.2664879356568366, "grad_norm": 0.030028555542230606, "learning_rate": 3.230208562552959e-05, "loss": 0.0107, "step": 4569 }, { "epoch": 3.267202859696157, "grad_norm": 0.014383678324520588, "learning_rate": 3.227873534758339e-05, "loss": 0.0046, "step": 4570 }, { "epoch": 3.267202859696157, "eval_loss": 0.008658569306135178, "eval_runtime": 4.5819, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4570 }, { "epoch": 3.267917783735478, "grad_norm": 0.036469925194978714, 
"learning_rate": 3.22553894889534e-05, "loss": 0.0113, "step": 4571 }, { "epoch": 3.268632707774799, "grad_norm": 0.024342576041817665, "learning_rate": 3.223204805546161e-05, "loss": 0.0089, "step": 4572 }, { "epoch": 3.26934763181412, "grad_norm": 0.024368342012166977, "learning_rate": 3.2208711052928865e-05, "loss": 0.0057, "step": 4573 }, { "epoch": 3.2700625558534404, "grad_norm": 0.023289553821086884, "learning_rate": 3.218537848717493e-05, "loss": 0.0085, "step": 4574 }, { "epoch": 3.2707774798927614, "grad_norm": 0.02724497951567173, "learning_rate": 3.216205036401848e-05, "loss": 0.012, "step": 4575 }, { "epoch": 3.2707774798927614, "eval_loss": 0.008741768077015877, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4575 }, { "epoch": 3.2714924039320823, "grad_norm": 0.02004668302834034, "learning_rate": 3.213872668927707e-05, "loss": 0.007, "step": 4576 }, { "epoch": 3.272207327971403, "grad_norm": 0.02028329111635685, "learning_rate": 3.211540746876711e-05, "loss": 0.0059, "step": 4577 }, { "epoch": 3.2729222520107237, "grad_norm": 0.03114287182688713, "learning_rate": 3.2092092708303976e-05, "loss": 0.0139, "step": 4578 }, { "epoch": 3.2736371760500447, "grad_norm": 0.021562185138463974, "learning_rate": 3.2068782413701846e-05, "loss": 0.007, "step": 4579 }, { "epoch": 3.2743521000893656, "grad_norm": 0.01582437753677368, "learning_rate": 3.204547659077385e-05, "loss": 0.0051, "step": 4580 }, { "epoch": 3.2743521000893656, "eval_loss": 0.008830955252051353, "eval_runtime": 4.6011, "eval_samples_per_second": 10.867, "eval_steps_per_second": 2.825, "step": 4580 }, { "epoch": 3.2750670241286866, "grad_norm": 0.02221549116075039, "learning_rate": 3.202217524533195e-05, "loss": 0.0061, "step": 4581 }, { "epoch": 3.275781948168007, "grad_norm": 0.02113187685608864, "learning_rate": 3.199887838318705e-05, "loss": 0.0081, "step": 4582 }, { "epoch": 3.276496872207328, "grad_norm": 0.016456110402941704, "learning_rate": 3.197558601014889e-05, "loss": 0.0048, "step": 4583 }, { "epoch": 3.277211796246649, "grad_norm": 0.01651705987751484, "learning_rate": 3.19522981320261e-05, "loss": 0.0045, "step": 4584 }, { "epoch": 3.2779267202859694, "grad_norm": 0.02632502280175686, "learning_rate": 3.1929014754626197e-05, "loss": 0.0098, "step": 4585 }, { "epoch": 3.2779267202859694, "eval_loss": 0.008852651342749596, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4585 }, { "epoch": 3.2786416443252904, "grad_norm": 0.01756856217980385, "learning_rate": 3.1905735883755584e-05, "loss": 0.006, "step": 4586 }, { "epoch": 3.2793565683646113, "grad_norm": 0.02040211297571659, "learning_rate": 3.18824615252195e-05, "loss": 0.0049, "step": 4587 }, { "epoch": 3.2800714924039323, "grad_norm": 0.021761229261755943, "learning_rate": 3.185919168482213e-05, "loss": 0.006, "step": 4588 }, { "epoch": 3.2807864164432528, "grad_norm": 0.028823448345065117, "learning_rate": 3.1835926368366466e-05, "loss": 0.0094, "step": 4589 }, { "epoch": 3.2815013404825737, "grad_norm": 0.021717891097068787, "learning_rate": 3.1812665581654386e-05, "loss": 0.0083, "step": 4590 }, { "epoch": 3.2815013404825737, "eval_loss": 0.00885077565908432, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4590 }, { "epoch": 3.2822162645218946, "grad_norm": 0.02543029375374317, "learning_rate": 3.178940933048668e-05, "loss": 0.0073, "step": 4591 }, { "epoch": 3.282931188561215, "grad_norm": 
0.024070462211966515, "learning_rate": 3.176615762066295e-05, "loss": 0.0083, "step": 4592 }, { "epoch": 3.283646112600536, "grad_norm": 0.028986578807234764, "learning_rate": 3.17429104579817e-05, "loss": 0.0091, "step": 4593 }, { "epoch": 3.284361036639857, "grad_norm": 0.023514127358794212, "learning_rate": 3.171966784824028e-05, "loss": 0.0068, "step": 4594 }, { "epoch": 3.285075960679178, "grad_norm": 0.02968861721456051, "learning_rate": 3.1696429797234914e-05, "loss": 0.0125, "step": 4595 }, { "epoch": 3.285075960679178, "eval_loss": 0.008752882480621338, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 4595 }, { "epoch": 3.285790884718499, "grad_norm": 0.039810121059417725, "learning_rate": 3.1673196310760724e-05, "loss": 0.0082, "step": 4596 }, { "epoch": 3.2865058087578194, "grad_norm": 0.018668722361326218, "learning_rate": 3.1649967394611617e-05, "loss": 0.0056, "step": 4597 }, { "epoch": 3.2872207327971403, "grad_norm": 0.0254515428096056, "learning_rate": 3.162674305458042e-05, "loss": 0.007, "step": 4598 }, { "epoch": 3.2879356568364613, "grad_norm": 0.025969920679926872, "learning_rate": 3.160352329645881e-05, "loss": 0.0092, "step": 4599 }, { "epoch": 3.2886505808757818, "grad_norm": 0.023221665993332863, "learning_rate": 3.1580308126037305e-05, "loss": 0.0093, "step": 4600 }, { "epoch": 3.2886505808757818, "eval_loss": 0.00846777856349945, "eval_runtime": 4.6206, "eval_samples_per_second": 10.821, "eval_steps_per_second": 2.814, "step": 4600 }, { "epoch": 3.2893655049151027, "grad_norm": 0.026593878865242004, "learning_rate": 3.1557097549105285e-05, "loss": 0.0095, "step": 4601 }, { "epoch": 3.2900804289544237, "grad_norm": 0.02706456370651722, "learning_rate": 3.1533891571451e-05, "loss": 0.0108, "step": 4602 }, { "epoch": 3.2907953529937446, "grad_norm": 0.020855529233813286, "learning_rate": 3.151069019886153e-05, "loss": 0.0088, "step": 4603 }, { "epoch": 3.291510277033065, "grad_norm": 0.024631422013044357, "learning_rate": 3.148749343712282e-05, "loss": 0.0116, "step": 4604 }, { "epoch": 3.292225201072386, "grad_norm": 0.02889758162200451, "learning_rate": 3.146430129201965e-05, "loss": 0.0097, "step": 4605 }, { "epoch": 3.292225201072386, "eval_loss": 0.008406703360378742, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 4605 }, { "epoch": 3.292940125111707, "grad_norm": 0.02209310047328472, "learning_rate": 3.144111376933568e-05, "loss": 0.0062, "step": 4606 }, { "epoch": 3.2936550491510275, "grad_norm": 0.025181826204061508, "learning_rate": 3.1417930874853383e-05, "loss": 0.007, "step": 4607 }, { "epoch": 3.2943699731903484, "grad_norm": 0.02285636216402054, "learning_rate": 3.13947526143541e-05, "loss": 0.0107, "step": 4608 }, { "epoch": 3.2950848972296694, "grad_norm": 0.01848652958869934, "learning_rate": 3.137157899361799e-05, "loss": 0.0073, "step": 4609 }, { "epoch": 3.2957998212689903, "grad_norm": 0.018319038674235344, "learning_rate": 3.134841001842411e-05, "loss": 0.0074, "step": 4610 }, { "epoch": 3.2957998212689903, "eval_loss": 0.008354597724974155, "eval_runtime": 4.5796, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 4610 }, { "epoch": 3.2965147453083112, "grad_norm": 0.019364142790436745, "learning_rate": 3.132524569455029e-05, "loss": 0.0063, "step": 4611 }, { "epoch": 3.2972296693476317, "grad_norm": 0.018782848492264748, "learning_rate": 3.1302086027773256e-05, "loss": 0.0053, "step": 4612 }, { "epoch": 
3.2979445933869527, "grad_norm": 0.021373072639107704, "learning_rate": 3.127893102386854e-05, "loss": 0.0077, "step": 4613 }, { "epoch": 3.2986595174262736, "grad_norm": 0.018818242475390434, "learning_rate": 3.125578068861051e-05, "loss": 0.0062, "step": 4614 }, { "epoch": 3.299374441465594, "grad_norm": 0.029878104105591774, "learning_rate": 3.123263502777239e-05, "loss": 0.0114, "step": 4615 }, { "epoch": 3.299374441465594, "eval_loss": 0.008418995887041092, "eval_runtime": 4.5801, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 4615 }, { "epoch": 3.300089365504915, "grad_norm": 0.03220529109239578, "learning_rate": 3.120949404712623e-05, "loss": 0.007, "step": 4616 }, { "epoch": 3.300804289544236, "grad_norm": 0.0254114530980587, "learning_rate": 3.1186357752442914e-05, "loss": 0.0146, "step": 4617 }, { "epoch": 3.301519213583557, "grad_norm": 0.019155485555529594, "learning_rate": 3.116322614949213e-05, "loss": 0.0076, "step": 4618 }, { "epoch": 3.3022341376228774, "grad_norm": 0.02712063118815422, "learning_rate": 3.114009924404245e-05, "loss": 0.0101, "step": 4619 }, { "epoch": 3.3029490616621984, "grad_norm": 0.017129842191934586, "learning_rate": 3.111697704186124e-05, "loss": 0.005, "step": 4620 }, { "epoch": 3.3029490616621984, "eval_loss": 0.008459353819489479, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4620 }, { "epoch": 3.3036639857015193, "grad_norm": 0.022587884217500687, "learning_rate": 3.109385954871469e-05, "loss": 0.0065, "step": 4621 }, { "epoch": 3.30437890974084, "grad_norm": 0.019430266693234444, "learning_rate": 3.107074677036781e-05, "loss": 0.0066, "step": 4622 }, { "epoch": 3.3050938337801608, "grad_norm": 0.03203903138637543, "learning_rate": 3.104763871258447e-05, "loss": 0.0084, "step": 4623 }, { "epoch": 3.3058087578194817, "grad_norm": 0.022593775764107704, "learning_rate": 3.1024535381127337e-05, "loss": 0.0076, "step": 4624 }, { "epoch": 3.3065236818588026, "grad_norm": 0.020433668047189713, "learning_rate": 3.10014367817579e-05, "loss": 0.0056, "step": 4625 }, { "epoch": 3.3065236818588026, "eval_loss": 0.008480082266032696, "eval_runtime": 4.5872, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 4625 }, { "epoch": 3.3072386058981236, "grad_norm": 0.02340192347764969, "learning_rate": 3.097834292023647e-05, "loss": 0.0072, "step": 4626 }, { "epoch": 3.307953529937444, "grad_norm": 0.025508413091301918, "learning_rate": 3.095525380232218e-05, "loss": 0.0107, "step": 4627 }, { "epoch": 3.308668453976765, "grad_norm": 0.026661720126867294, "learning_rate": 3.093216943377298e-05, "loss": 0.0096, "step": 4628 }, { "epoch": 3.309383378016086, "grad_norm": 0.02468370832502842, "learning_rate": 3.0909089820345624e-05, "loss": 0.0089, "step": 4629 }, { "epoch": 3.3100983020554064, "grad_norm": 0.023104609921574593, "learning_rate": 3.0886014967795695e-05, "loss": 0.0088, "step": 4630 }, { "epoch": 3.3100983020554064, "eval_loss": 0.008573942817747593, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 4630 }, { "epoch": 3.3108132260947274, "grad_norm": 0.021846946328878403, "learning_rate": 3.086294488187758e-05, "loss": 0.0062, "step": 4631 }, { "epoch": 3.3115281501340483, "grad_norm": 0.027096256613731384, "learning_rate": 3.083987956834449e-05, "loss": 0.0093, "step": 4632 }, { "epoch": 3.3122430741733693, "grad_norm": 0.027190765365958214, "learning_rate": 3.081681903294843e-05, "loss": 0.0127, "step": 
4633 }, { "epoch": 3.3129579982126898, "grad_norm": 0.021074049174785614, "learning_rate": 3.0793763281440224e-05, "loss": 0.0048, "step": 4634 }, { "epoch": 3.3136729222520107, "grad_norm": 0.030434053391218185, "learning_rate": 3.077071231956948e-05, "loss": 0.0062, "step": 4635 }, { "epoch": 3.3136729222520107, "eval_loss": 0.00866297259926796, "eval_runtime": 4.5763, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.841, "step": 4635 }, { "epoch": 3.3143878462913317, "grad_norm": 0.023870371282100677, "learning_rate": 3.0747666153084655e-05, "loss": 0.0059, "step": 4636 }, { "epoch": 3.315102770330652, "grad_norm": 0.02662351168692112, "learning_rate": 3.072462478773298e-05, "loss": 0.0089, "step": 4637 }, { "epoch": 3.315817694369973, "grad_norm": 0.032015323638916016, "learning_rate": 3.070158822926048e-05, "loss": 0.0113, "step": 4638 }, { "epoch": 3.316532618409294, "grad_norm": 0.02410188876092434, "learning_rate": 3.0678556483412004e-05, "loss": 0.0094, "step": 4639 }, { "epoch": 3.317247542448615, "grad_norm": 0.027874212712049484, "learning_rate": 3.06555295559312e-05, "loss": 0.0076, "step": 4640 }, { "epoch": 3.317247542448615, "eval_loss": 0.00872318260371685, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4640 }, { "epoch": 3.3179624664879355, "grad_norm": 0.026326792314648628, "learning_rate": 3.06325074525605e-05, "loss": 0.011, "step": 4641 }, { "epoch": 3.3186773905272564, "grad_norm": 0.02039935812354088, "learning_rate": 3.060949017904112e-05, "loss": 0.0059, "step": 4642 }, { "epoch": 3.3193923145665774, "grad_norm": 0.02705063857138157, "learning_rate": 3.058647774111314e-05, "loss": 0.0094, "step": 4643 }, { "epoch": 3.3201072386058983, "grad_norm": 0.023670298978686333, "learning_rate": 3.056347014451534e-05, "loss": 0.006, "step": 4644 }, { "epoch": 3.320822162645219, "grad_norm": 0.02257668226957321, "learning_rate": 3.054046739498535e-05, "loss": 0.0091, "step": 4645 }, { "epoch": 3.320822162645219, "eval_loss": 0.008672783151268959, "eval_runtime": 4.579, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 4645 }, { "epoch": 3.3215370866845397, "grad_norm": 0.019015712663531303, "learning_rate": 3.0517469498259578e-05, "loss": 0.0059, "step": 4646 }, { "epoch": 3.3222520107238607, "grad_norm": 0.02445090375840664, "learning_rate": 3.0494476460073235e-05, "loss": 0.0071, "step": 4647 }, { "epoch": 3.3229669347631816, "grad_norm": 0.023388605564832687, "learning_rate": 3.0471488286160298e-05, "loss": 0.0088, "step": 4648 }, { "epoch": 3.323681858802502, "grad_norm": 0.03042544424533844, "learning_rate": 3.044850498225354e-05, "loss": 0.0109, "step": 4649 }, { "epoch": 3.324396782841823, "grad_norm": 0.03017844259738922, "learning_rate": 3.0425526554084525e-05, "loss": 0.0123, "step": 4650 }, { "epoch": 3.324396782841823, "eval_loss": 0.008623098023235798, "eval_runtime": 4.5975, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.828, "step": 4650 }, { "epoch": 3.325111706881144, "grad_norm": 0.02225540205836296, "learning_rate": 3.0402553007383604e-05, "loss": 0.0049, "step": 4651 }, { "epoch": 3.3258266309204645, "grad_norm": 0.028713284060359, "learning_rate": 3.0379584347879897e-05, "loss": 0.0113, "step": 4652 }, { "epoch": 3.3265415549597854, "grad_norm": 0.026233794167637825, "learning_rate": 3.0356620581301298e-05, "loss": 0.0077, "step": 4653 }, { "epoch": 3.3272564789991064, "grad_norm": 0.022672725841403008, "learning_rate": 3.0333661713374506e-05, "loss": 
0.0093, "step": 4654 }, { "epoch": 3.3279714030384273, "grad_norm": 0.022532016038894653, "learning_rate": 3.0310707749824983e-05, "loss": 0.0063, "step": 4655 }, { "epoch": 3.3279714030384273, "eval_loss": 0.008567819371819496, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 4655 }, { "epoch": 3.328686327077748, "grad_norm": 0.024218138307332993, "learning_rate": 3.0287758696377e-05, "loss": 0.0073, "step": 4656 }, { "epoch": 3.3294012511170687, "grad_norm": 0.02323722466826439, "learning_rate": 3.026481455875354e-05, "loss": 0.0112, "step": 4657 }, { "epoch": 3.3301161751563897, "grad_norm": 0.021889079362154007, "learning_rate": 3.0241875342676412e-05, "loss": 0.0073, "step": 4658 }, { "epoch": 3.3308310991957106, "grad_norm": 0.022722840309143066, "learning_rate": 3.0218941053866168e-05, "loss": 0.0069, "step": 4659 }, { "epoch": 3.331546023235031, "grad_norm": 0.01836545206606388, "learning_rate": 3.019601169804216e-05, "loss": 0.0057, "step": 4660 }, { "epoch": 3.331546023235031, "eval_loss": 0.008517514914274216, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 4660 }, { "epoch": 3.332260947274352, "grad_norm": 0.02535213902592659, "learning_rate": 3.017308728092249e-05, "loss": 0.0103, "step": 4661 }, { "epoch": 3.332975871313673, "grad_norm": 0.020334182307124138, "learning_rate": 3.0150167808224016e-05, "loss": 0.0053, "step": 4662 }, { "epoch": 3.333690795352994, "grad_norm": 0.021126726642251015, "learning_rate": 3.0127253285662417e-05, "loss": 0.0084, "step": 4663 }, { "epoch": 3.3344057193923144, "grad_norm": 0.022188041359186172, "learning_rate": 3.0104343718952065e-05, "loss": 0.0065, "step": 4664 }, { "epoch": 3.3351206434316354, "grad_norm": 0.021202167496085167, "learning_rate": 3.0081439113806144e-05, "loss": 0.0067, "step": 4665 }, { "epoch": 3.3351206434316354, "eval_loss": 0.008568622171878815, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4665 }, { "epoch": 3.3358355674709563, "grad_norm": 0.02415795437991619, "learning_rate": 3.0058539475936575e-05, "loss": 0.0086, "step": 4666 }, { "epoch": 3.336550491510277, "grad_norm": 0.01978079415857792, "learning_rate": 3.0035644811054082e-05, "loss": 0.0045, "step": 4667 }, { "epoch": 3.3372654155495978, "grad_norm": 0.0330812968313694, "learning_rate": 3.0012755124868098e-05, "loss": 0.0103, "step": 4668 }, { "epoch": 3.3379803395889187, "grad_norm": 0.02199944294989109, "learning_rate": 2.9989870423086845e-05, "loss": 0.008, "step": 4669 }, { "epoch": 3.3386952636282397, "grad_norm": 0.021787531673908234, "learning_rate": 2.9966990711417283e-05, "loss": 0.0063, "step": 4670 }, { "epoch": 3.3386952636282397, "eval_loss": 0.00863227155059576, "eval_runtime": 4.5837, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 4670 }, { "epoch": 3.33941018766756, "grad_norm": 0.022085977718234062, "learning_rate": 2.9944115995565146e-05, "loss": 0.0076, "step": 4671 }, { "epoch": 3.340125111706881, "grad_norm": 0.029194308444857597, "learning_rate": 2.992124628123492e-05, "loss": 0.0049, "step": 4672 }, { "epoch": 3.340840035746202, "grad_norm": 0.021020853891968727, "learning_rate": 2.9898381574129817e-05, "loss": 0.0057, "step": 4673 }, { "epoch": 3.341554959785523, "grad_norm": 0.028885651379823685, "learning_rate": 2.9875521879951852e-05, "loss": 0.0074, "step": 4674 }, { "epoch": 3.3422698838248435, "grad_norm": 0.019708452746272087, "learning_rate": 
2.985266720440174e-05, "loss": 0.0083, "step": 4675 }, { "epoch": 3.3422698838248435, "eval_loss": 0.008617094717919827, "eval_runtime": 4.6042, "eval_samples_per_second": 10.86, "eval_steps_per_second": 2.824, "step": 4675 }, { "epoch": 3.3429848078641644, "grad_norm": 0.02396577037870884, "learning_rate": 2.9829817553178946e-05, "loss": 0.008, "step": 4676 }, { "epoch": 3.3436997319034854, "grad_norm": 0.02678138017654419, "learning_rate": 2.9806972931981737e-05, "loss": 0.0055, "step": 4677 }, { "epoch": 3.3444146559428063, "grad_norm": 0.03463911637663841, "learning_rate": 2.9784133346507076e-05, "loss": 0.0102, "step": 4678 }, { "epoch": 3.345129579982127, "grad_norm": 0.032480817288160324, "learning_rate": 2.9761298802450666e-05, "loss": 0.0068, "step": 4679 }, { "epoch": 3.3458445040214477, "grad_norm": 0.02262759394943714, "learning_rate": 2.973846930550698e-05, "loss": 0.0086, "step": 4680 }, { "epoch": 3.3458445040214477, "eval_loss": 0.008632917888462543, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4680 }, { "epoch": 3.3465594280607687, "grad_norm": 0.01850459910929203, "learning_rate": 2.9715644861369235e-05, "loss": 0.0043, "step": 4681 }, { "epoch": 3.347274352100089, "grad_norm": 0.021323299035429955, "learning_rate": 2.9692825475729356e-05, "loss": 0.0073, "step": 4682 }, { "epoch": 3.34798927613941, "grad_norm": 0.022513488307595253, "learning_rate": 2.967001115427802e-05, "loss": 0.0071, "step": 4683 }, { "epoch": 3.348704200178731, "grad_norm": 0.022735152393579483, "learning_rate": 2.9647201902704668e-05, "loss": 0.0078, "step": 4684 }, { "epoch": 3.349419124218052, "grad_norm": 0.02635149285197258, "learning_rate": 2.962439772669744e-05, "loss": 0.0092, "step": 4685 }, { "epoch": 3.349419124218052, "eval_loss": 0.008644587360322475, "eval_runtime": 4.5997, "eval_samples_per_second": 10.87, "eval_steps_per_second": 2.826, "step": 4685 }, { "epoch": 3.3501340482573725, "grad_norm": 0.024697085842490196, "learning_rate": 2.9601598631943217e-05, "loss": 0.0092, "step": 4686 }, { "epoch": 3.3508489722966934, "grad_norm": 0.021941153332591057, "learning_rate": 2.957880462412764e-05, "loss": 0.0053, "step": 4687 }, { "epoch": 3.3515638963360144, "grad_norm": 0.02407417632639408, "learning_rate": 2.9556015708935064e-05, "loss": 0.011, "step": 4688 }, { "epoch": 3.3522788203753353, "grad_norm": 0.02000405825674534, "learning_rate": 2.9533231892048556e-05, "loss": 0.0106, "step": 4689 }, { "epoch": 3.352993744414656, "grad_norm": 0.02204802632331848, "learning_rate": 2.951045317914992e-05, "loss": 0.0057, "step": 4690 }, { "epoch": 3.352993744414656, "eval_loss": 0.00851154699921608, "eval_runtime": 4.5979, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.827, "step": 4690 }, { "epoch": 3.3537086684539767, "grad_norm": 0.022799048572778702, "learning_rate": 2.948767957591973e-05, "loss": 0.0078, "step": 4691 }, { "epoch": 3.3544235924932977, "grad_norm": 0.021886490285396576, "learning_rate": 2.9464911088037223e-05, "loss": 0.0077, "step": 4692 }, { "epoch": 3.3551385165326186, "grad_norm": 0.02302434854209423, "learning_rate": 2.9442147721180398e-05, "loss": 0.0075, "step": 4693 }, { "epoch": 3.355853440571939, "grad_norm": 0.026711346581578255, "learning_rate": 2.941938948102595e-05, "loss": 0.0057, "step": 4694 }, { "epoch": 3.35656836461126, "grad_norm": 0.02242152951657772, "learning_rate": 2.9396636373249338e-05, "loss": 0.0089, "step": 4695 }, { "epoch": 3.35656836461126, "eval_loss": 0.00857499148696661, 
"eval_runtime": 4.58, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 4695 }, { "epoch": 3.357283288650581, "grad_norm": 0.020570775493979454, "learning_rate": 2.93738884035247e-05, "loss": 0.0077, "step": 4696 }, { "epoch": 3.3579982126899015, "grad_norm": 0.0294482484459877, "learning_rate": 2.935114557752491e-05, "loss": 0.0086, "step": 4697 }, { "epoch": 3.3587131367292224, "grad_norm": 0.02529384195804596, "learning_rate": 2.9328407900921557e-05, "loss": 0.0085, "step": 4698 }, { "epoch": 3.3594280607685434, "grad_norm": 0.020779836922883987, "learning_rate": 2.9305675379384944e-05, "loss": 0.0061, "step": 4699 }, { "epoch": 3.3601429848078643, "grad_norm": 0.023401891812682152, "learning_rate": 2.9282948018584118e-05, "loss": 0.0062, "step": 4700 }, { "epoch": 3.3601429848078643, "eval_loss": 0.008620682172477245, "eval_runtime": 4.5764, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.841, "step": 4700 }, { "epoch": 3.360857908847185, "grad_norm": 0.025936052203178406, "learning_rate": 2.926022582418678e-05, "loss": 0.0062, "step": 4701 }, { "epoch": 3.3615728328865058, "grad_norm": 0.023289605975151062, "learning_rate": 2.9237508801859394e-05, "loss": 0.007, "step": 4702 }, { "epoch": 3.3622877569258267, "grad_norm": 0.02665243297815323, "learning_rate": 2.9214796957267088e-05, "loss": 0.0069, "step": 4703 }, { "epoch": 3.363002680965147, "grad_norm": 0.0294373519718647, "learning_rate": 2.9192090296073754e-05, "loss": 0.0075, "step": 4704 }, { "epoch": 3.363717605004468, "grad_norm": 0.031188759952783585, "learning_rate": 2.916938882394194e-05, "loss": 0.0067, "step": 4705 }, { "epoch": 3.363717605004468, "eval_loss": 0.008629784919321537, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 4705 }, { "epoch": 3.364432529043789, "grad_norm": 0.02486369200050831, "learning_rate": 2.914669254653296e-05, "loss": 0.006, "step": 4706 }, { "epoch": 3.36514745308311, "grad_norm": 0.01838667504489422, "learning_rate": 2.9124001469506745e-05, "loss": 0.004, "step": 4707 }, { "epoch": 3.365862377122431, "grad_norm": 0.02141166292130947, "learning_rate": 2.9101315598522005e-05, "loss": 0.0057, "step": 4708 }, { "epoch": 3.3665773011617515, "grad_norm": 0.02729162573814392, "learning_rate": 2.9078634939236142e-05, "loss": 0.0081, "step": 4709 }, { "epoch": 3.3672922252010724, "grad_norm": 0.023364519700407982, "learning_rate": 2.905595949730521e-05, "loss": 0.0058, "step": 4710 }, { "epoch": 3.3672922252010724, "eval_loss": 0.008652192540466785, "eval_runtime": 4.6017, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 4710 }, { "epoch": 3.3680071492403933, "grad_norm": 0.023090776056051254, "learning_rate": 2.903328927838403e-05, "loss": 0.0073, "step": 4711 }, { "epoch": 3.368722073279714, "grad_norm": 0.028110262006521225, "learning_rate": 2.901062428812604e-05, "loss": 0.0087, "step": 4712 }, { "epoch": 3.369436997319035, "grad_norm": 0.018499568104743958, "learning_rate": 2.8987964532183454e-05, "loss": 0.0041, "step": 4713 }, { "epoch": 3.3701519213583557, "grad_norm": 0.024205273017287254, "learning_rate": 2.8965310016207148e-05, "loss": 0.0075, "step": 4714 }, { "epoch": 3.3708668453976767, "grad_norm": 0.02216983027756214, "learning_rate": 2.8942660745846656e-05, "loss": 0.0055, "step": 4715 }, { "epoch": 3.3708668453976767, "eval_loss": 0.008626640774309635, "eval_runtime": 4.5971, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 4715 }, { "epoch": 
3.371581769436997, "grad_norm": 0.023190800100564957, "learning_rate": 2.892001672675026e-05, "loss": 0.0068, "step": 4716 }, { "epoch": 3.372296693476318, "grad_norm": 0.02328549139201641, "learning_rate": 2.889737796456492e-05, "loss": 0.006, "step": 4717 }, { "epoch": 3.373011617515639, "grad_norm": 0.02024497650563717, "learning_rate": 2.887474446493625e-05, "loss": 0.0054, "step": 4718 }, { "epoch": 3.3737265415549595, "grad_norm": 0.03262325003743172, "learning_rate": 2.88521162335086e-05, "loss": 0.0096, "step": 4719 }, { "epoch": 3.3744414655942805, "grad_norm": 0.024238016456365585, "learning_rate": 2.8829493275924936e-05, "loss": 0.0072, "step": 4720 }, { "epoch": 3.3744414655942805, "eval_loss": 0.00868293084204197, "eval_runtime": 4.5844, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 4720 }, { "epoch": 3.3751563896336014, "grad_norm": 0.0230542179197073, "learning_rate": 2.8806875597827e-05, "loss": 0.0067, "step": 4721 }, { "epoch": 3.3758713136729224, "grad_norm": 0.0208877082914114, "learning_rate": 2.8784263204855176e-05, "loss": 0.0058, "step": 4722 }, { "epoch": 3.3765862377122433, "grad_norm": 0.022682813927531242, "learning_rate": 2.8761656102648493e-05, "loss": 0.0063, "step": 4723 }, { "epoch": 3.377301161751564, "grad_norm": 0.02169417403638363, "learning_rate": 2.8739054296844738e-05, "loss": 0.0078, "step": 4724 }, { "epoch": 3.3780160857908847, "grad_norm": 0.019385093823075294, "learning_rate": 2.8716457793080277e-05, "loss": 0.0064, "step": 4725 }, { "epoch": 3.3780160857908847, "eval_loss": 0.008602925576269627, "eval_runtime": 4.5837, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 4725 }, { "epoch": 3.3787310098302057, "grad_norm": 0.028720390051603317, "learning_rate": 2.8693866596990238e-05, "loss": 0.0068, "step": 4726 }, { "epoch": 3.379445933869526, "grad_norm": 0.028103193268179893, "learning_rate": 2.8671280714208426e-05, "loss": 0.008, "step": 4727 }, { "epoch": 3.380160857908847, "grad_norm": 0.025026163086295128, "learning_rate": 2.8648700150367236e-05, "loss": 0.0061, "step": 4728 }, { "epoch": 3.380875781948168, "grad_norm": 0.026157932355999947, "learning_rate": 2.862612491109783e-05, "loss": 0.0077, "step": 4729 }, { "epoch": 3.381590705987489, "grad_norm": 0.02247820794582367, "learning_rate": 2.860355500203e-05, "loss": 0.0058, "step": 4730 }, { "epoch": 3.381590705987489, "eval_loss": 0.008511506021022797, "eval_runtime": 4.5808, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 4730 }, { "epoch": 3.3823056300268095, "grad_norm": 0.019572576507925987, "learning_rate": 2.8580990428792204e-05, "loss": 0.0051, "step": 4731 }, { "epoch": 3.3830205540661304, "grad_norm": 0.022979481145739555, "learning_rate": 2.8558431197011575e-05, "loss": 0.0053, "step": 4732 }, { "epoch": 3.3837354781054514, "grad_norm": 0.018278906121850014, "learning_rate": 2.8535877312313952e-05, "loss": 0.0052, "step": 4733 }, { "epoch": 3.384450402144772, "grad_norm": 0.023840194568037987, "learning_rate": 2.851332878032376e-05, "loss": 0.0114, "step": 4734 }, { "epoch": 3.385165326184093, "grad_norm": 0.027249181643128395, "learning_rate": 2.8490785606664178e-05, "loss": 0.0095, "step": 4735 }, { "epoch": 3.385165326184093, "eval_loss": 0.008429769426584244, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 4735 }, { "epoch": 3.3858802502234138, "grad_norm": 0.019333476200699806, "learning_rate": 2.8468247796956964e-05, "loss": 0.0063, "step": 
4736 }, { "epoch": 3.3865951742627347, "grad_norm": 0.018177490681409836, "learning_rate": 2.8445715356822606e-05, "loss": 0.0074, "step": 4737 }, { "epoch": 3.3873100983020556, "grad_norm": 0.024489331990480423, "learning_rate": 2.8423188291880242e-05, "loss": 0.0057, "step": 4738 }, { "epoch": 3.388025022341376, "grad_norm": 0.025786567479372025, "learning_rate": 2.8400666607747618e-05, "loss": 0.0117, "step": 4739 }, { "epoch": 3.388739946380697, "grad_norm": 0.021858876571059227, "learning_rate": 2.8378150310041196e-05, "loss": 0.0066, "step": 4740 }, { "epoch": 3.388739946380697, "eval_loss": 0.008379410952329636, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 4740 }, { "epoch": 3.389454870420018, "grad_norm": 0.018424807116389275, "learning_rate": 2.8355639404376087e-05, "loss": 0.0053, "step": 4741 }, { "epoch": 3.3901697944593385, "grad_norm": 0.02239244244992733, "learning_rate": 2.8333133896365994e-05, "loss": 0.0077, "step": 4742 }, { "epoch": 3.3908847184986595, "grad_norm": 0.02503398433327675, "learning_rate": 2.8310633791623403e-05, "loss": 0.0084, "step": 4743 }, { "epoch": 3.3915996425379804, "grad_norm": 0.01775450073182583, "learning_rate": 2.8288139095759315e-05, "loss": 0.0074, "step": 4744 }, { "epoch": 3.3923145665773013, "grad_norm": 0.024135183542966843, "learning_rate": 2.826564981438345e-05, "loss": 0.0117, "step": 4745 }, { "epoch": 3.3923145665773013, "eval_loss": 0.00835241936147213, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 4745 }, { "epoch": 3.393029490616622, "grad_norm": 0.02356024459004402, "learning_rate": 2.82431659531042e-05, "loss": 0.0078, "step": 4746 }, { "epoch": 3.393744414655943, "grad_norm": 0.020749684423208237, "learning_rate": 2.822068751752853e-05, "loss": 0.0052, "step": 4747 }, { "epoch": 3.3944593386952637, "grad_norm": 0.03145569562911987, "learning_rate": 2.8198214513262134e-05, "loss": 0.0113, "step": 4748 }, { "epoch": 3.3951742627345842, "grad_norm": 0.019901959225535393, "learning_rate": 2.8175746945909276e-05, "loss": 0.0074, "step": 4749 }, { "epoch": 3.395889186773905, "grad_norm": 0.02321423962712288, "learning_rate": 2.8153284821072922e-05, "loss": 0.0079, "step": 4750 }, { "epoch": 3.395889186773905, "eval_loss": 0.00834956206381321, "eval_runtime": 4.5945, "eval_samples_per_second": 10.883, "eval_steps_per_second": 2.829, "step": 4750 }, { "epoch": 3.396604110813226, "grad_norm": 0.024659737944602966, "learning_rate": 2.8130828144354687e-05, "loss": 0.0085, "step": 4751 }, { "epoch": 3.397319034852547, "grad_norm": 0.021489065140485764, "learning_rate": 2.810837692135475e-05, "loss": 0.005, "step": 4752 }, { "epoch": 3.3980339588918675, "grad_norm": 0.02327846921980381, "learning_rate": 2.8085931157672008e-05, "loss": 0.0084, "step": 4753 }, { "epoch": 3.3987488829311885, "grad_norm": 0.02188562974333763, "learning_rate": 2.8063490858903984e-05, "loss": 0.0076, "step": 4754 }, { "epoch": 3.3994638069705094, "grad_norm": 0.024425385519862175, "learning_rate": 2.8041056030646785e-05, "loss": 0.006, "step": 4755 }, { "epoch": 3.3994638069705094, "eval_loss": 0.008175905793905258, "eval_runtime": 4.59, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 4755 }, { "epoch": 3.4001787310098304, "grad_norm": 0.021826013922691345, "learning_rate": 2.8018626678495215e-05, "loss": 0.0052, "step": 4756 }, { "epoch": 3.400893655049151, "grad_norm": 0.023097798228263855, "learning_rate": 
2.7996202808042703e-05, "loss": 0.0062, "step": 4757 }, { "epoch": 3.401608579088472, "grad_norm": 0.023174649104475975, "learning_rate": 2.7973784424881273e-05, "loss": 0.0097, "step": 4758 }, { "epoch": 3.4023235031277927, "grad_norm": 0.018113980069756508, "learning_rate": 2.7951371534601628e-05, "loss": 0.0053, "step": 4759 }, { "epoch": 3.4030384271671137, "grad_norm": 0.023543836548924446, "learning_rate": 2.7928964142793046e-05, "loss": 0.0069, "step": 4760 }, { "epoch": 3.4030384271671137, "eval_loss": 0.008188351057469845, "eval_runtime": 4.5893, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 4760 }, { "epoch": 3.403753351206434, "grad_norm": 0.019001005217432976, "learning_rate": 2.7906562255043488e-05, "loss": 0.0048, "step": 4761 }, { "epoch": 3.404468275245755, "grad_norm": 0.029223185032606125, "learning_rate": 2.788416587693954e-05, "loss": 0.0115, "step": 4762 }, { "epoch": 3.405183199285076, "grad_norm": 0.025288967415690422, "learning_rate": 2.786177501406635e-05, "loss": 0.0118, "step": 4763 }, { "epoch": 3.4058981233243966, "grad_norm": 0.023422960191965103, "learning_rate": 2.7839389672007754e-05, "loss": 0.0073, "step": 4764 }, { "epoch": 3.4066130473637175, "grad_norm": 0.024529576301574707, "learning_rate": 2.78170098563462e-05, "loss": 0.0052, "step": 4765 }, { "epoch": 3.4066130473637175, "eval_loss": 0.008290449157357216, "eval_runtime": 4.5929, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 4765 }, { "epoch": 3.4073279714030384, "grad_norm": 0.024361969903111458, "learning_rate": 2.7794635572662752e-05, "loss": 0.0067, "step": 4766 }, { "epoch": 3.4080428954423594, "grad_norm": 0.018651217222213745, "learning_rate": 2.7772266826537103e-05, "loss": 0.0065, "step": 4767 }, { "epoch": 3.40875781948168, "grad_norm": 0.020280802622437477, "learning_rate": 2.7749903623547517e-05, "loss": 0.0056, "step": 4768 }, { "epoch": 3.409472743521001, "grad_norm": 0.028040431439876556, "learning_rate": 2.772754596927094e-05, "loss": 0.0085, "step": 4769 }, { "epoch": 3.4101876675603218, "grad_norm": 0.02366771176457405, "learning_rate": 2.7705193869282925e-05, "loss": 0.0092, "step": 4770 }, { "epoch": 3.4101876675603218, "eval_loss": 0.0084155835211277, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4770 }, { "epoch": 3.4109025915996427, "grad_norm": 0.022762056440114975, "learning_rate": 2.768284732915758e-05, "loss": 0.0068, "step": 4771 }, { "epoch": 3.411617515638963, "grad_norm": 0.022916169837117195, "learning_rate": 2.7660506354467707e-05, "loss": 0.0048, "step": 4772 }, { "epoch": 3.412332439678284, "grad_norm": 0.01944137178361416, "learning_rate": 2.7638170950784648e-05, "loss": 0.0068, "step": 4773 }, { "epoch": 3.413047363717605, "grad_norm": 0.024200428277254105, "learning_rate": 2.761584112367841e-05, "loss": 0.0045, "step": 4774 }, { "epoch": 3.413762287756926, "grad_norm": 0.019160205498337746, "learning_rate": 2.75935168787176e-05, "loss": 0.005, "step": 4775 }, { "epoch": 3.413762287756926, "eval_loss": 0.008393624797463417, "eval_runtime": 4.6111, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 4775 }, { "epoch": 3.4144772117962465, "grad_norm": 0.029912803322076797, "learning_rate": 2.7571198221469397e-05, "loss": 0.0091, "step": 4776 }, { "epoch": 3.4151921358355675, "grad_norm": 0.023768264800310135, "learning_rate": 2.7548885157499616e-05, "loss": 0.0086, "step": 4777 }, { "epoch": 3.4159070598748884, "grad_norm": 
0.019932296127080917, "learning_rate": 2.7526577692372702e-05, "loss": 0.0052, "step": 4778 }, { "epoch": 3.416621983914209, "grad_norm": 0.029846377670764923, "learning_rate": 2.7504275831651637e-05, "loss": 0.0068, "step": 4779 }, { "epoch": 3.41733690795353, "grad_norm": 0.033321768045425415, "learning_rate": 2.7481979580898053e-05, "loss": 0.016, "step": 4780 }, { "epoch": 3.41733690795353, "eval_loss": 0.008409717120230198, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 4780 }, { "epoch": 3.418051831992851, "grad_norm": 0.01884184032678604, "learning_rate": 2.7459688945672202e-05, "loss": 0.0054, "step": 4781 }, { "epoch": 3.4187667560321717, "grad_norm": 0.017190925776958466, "learning_rate": 2.7437403931532867e-05, "loss": 0.0051, "step": 4782 }, { "epoch": 3.4194816800714922, "grad_norm": 0.020370829850435257, "learning_rate": 2.7415124544037495e-05, "loss": 0.0044, "step": 4783 }, { "epoch": 3.420196604110813, "grad_norm": 0.021770253777503967, "learning_rate": 2.7392850788742084e-05, "loss": 0.0056, "step": 4784 }, { "epoch": 3.420911528150134, "grad_norm": 0.02376275509595871, "learning_rate": 2.737058267120125e-05, "loss": 0.0086, "step": 4785 }, { "epoch": 3.420911528150134, "eval_loss": 0.00830100104212761, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 4785 }, { "epoch": 3.421626452189455, "grad_norm": 0.023877372965216637, "learning_rate": 2.7348320196968206e-05, "loss": 0.0069, "step": 4786 }, { "epoch": 3.4223413762287755, "grad_norm": 0.029294854030013084, "learning_rate": 2.7326063371594758e-05, "loss": 0.0069, "step": 4787 }, { "epoch": 3.4230563002680965, "grad_norm": 0.019843535497784615, "learning_rate": 2.7303812200631308e-05, "loss": 0.0057, "step": 4788 }, { "epoch": 3.4237712243074174, "grad_norm": 0.022315699607133865, "learning_rate": 2.7281566689626802e-05, "loss": 0.0094, "step": 4789 }, { "epoch": 3.4244861483467384, "grad_norm": 0.017635522410273552, "learning_rate": 2.725932684412883e-05, "loss": 0.0063, "step": 4790 }, { "epoch": 3.4244861483467384, "eval_loss": 0.008241728879511356, "eval_runtime": 4.5781, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 4790 }, { "epoch": 3.425201072386059, "grad_norm": 0.022792890667915344, "learning_rate": 2.7237092669683562e-05, "loss": 0.01, "step": 4791 }, { "epoch": 3.42591599642538, "grad_norm": 0.02952914498746395, "learning_rate": 2.7214864171835708e-05, "loss": 0.0153, "step": 4792 }, { "epoch": 3.4266309204647007, "grad_norm": 0.018456758931279182, "learning_rate": 2.7192641356128613e-05, "loss": 0.0049, "step": 4793 }, { "epoch": 3.4273458445040212, "grad_norm": 0.016667183488607407, "learning_rate": 2.7170424228104206e-05, "loss": 0.0064, "step": 4794 }, { "epoch": 3.428060768543342, "grad_norm": 0.028663981705904007, "learning_rate": 2.7148212793302935e-05, "loss": 0.0117, "step": 4795 }, { "epoch": 3.428060768543342, "eval_loss": 0.008267547935247421, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4795 }, { "epoch": 3.428775692582663, "grad_norm": 0.02526610530912876, "learning_rate": 2.712600705726392e-05, "loss": 0.009, "step": 4796 }, { "epoch": 3.429490616621984, "grad_norm": 0.025832077488303185, "learning_rate": 2.7103807025524764e-05, "loss": 0.0083, "step": 4797 }, { "epoch": 3.4302055406613046, "grad_norm": 0.020242100581526756, "learning_rate": 2.7081612703621712e-05, "loss": 0.0066, "step": 4798 }, { "epoch": 
3.4309204647006255, "grad_norm": 0.020181579515337944, "learning_rate": 2.7059424097089596e-05, "loss": 0.0071, "step": 4799 }, { "epoch": 3.4316353887399464, "grad_norm": 0.021621806547045708, "learning_rate": 2.703724121146175e-05, "loss": 0.0084, "step": 4800 }, { "epoch": 3.4316353887399464, "eval_loss": 0.008318882435560226, "eval_runtime": 4.6201, "eval_samples_per_second": 10.822, "eval_steps_per_second": 2.814, "step": 4800 }, { "epoch": 3.4323503127792674, "grad_norm": 0.022393088787794113, "learning_rate": 2.7015064052270156e-05, "loss": 0.008, "step": 4801 }, { "epoch": 3.433065236818588, "grad_norm": 0.020902112126350403, "learning_rate": 2.699289262504534e-05, "loss": 0.0073, "step": 4802 }, { "epoch": 3.433780160857909, "grad_norm": 0.026839256286621094, "learning_rate": 2.697072693531637e-05, "loss": 0.0113, "step": 4803 }, { "epoch": 3.4344950848972298, "grad_norm": 0.025196824222803116, "learning_rate": 2.6948566988610935e-05, "loss": 0.0085, "step": 4804 }, { "epoch": 3.4352100089365507, "grad_norm": 0.025637567043304443, "learning_rate": 2.692641279045527e-05, "loss": 0.0118, "step": 4805 }, { "epoch": 3.4352100089365507, "eval_loss": 0.00830208882689476, "eval_runtime": 4.6017, "eval_samples_per_second": 10.866, "eval_steps_per_second": 2.825, "step": 4805 }, { "epoch": 3.435924932975871, "grad_norm": 0.02197367511689663, "learning_rate": 2.6904264346374135e-05, "loss": 0.0048, "step": 4806 }, { "epoch": 3.436639857015192, "grad_norm": 0.020880814641714096, "learning_rate": 2.6882121661890948e-05, "loss": 0.0093, "step": 4807 }, { "epoch": 3.437354781054513, "grad_norm": 0.023645611479878426, "learning_rate": 2.6859984742527577e-05, "loss": 0.0058, "step": 4808 }, { "epoch": 3.4380697050938336, "grad_norm": 0.030822614207863808, "learning_rate": 2.683785359380455e-05, "loss": 0.0076, "step": 4809 }, { "epoch": 3.4387846291331545, "grad_norm": 0.027350332587957382, "learning_rate": 2.6815728221240897e-05, "loss": 0.0099, "step": 4810 }, { "epoch": 3.4387846291331545, "eval_loss": 0.008417513221502304, "eval_runtime": 4.5766, "eval_samples_per_second": 10.925, "eval_steps_per_second": 2.841, "step": 4810 }, { "epoch": 3.4394995531724755, "grad_norm": 0.02152911387383938, "learning_rate": 2.679360863035425e-05, "loss": 0.0068, "step": 4811 }, { "epoch": 3.4402144772117964, "grad_norm": 0.022712018340826035, "learning_rate": 2.677149482666078e-05, "loss": 0.0078, "step": 4812 }, { "epoch": 3.440929401251117, "grad_norm": 0.021539121866226196, "learning_rate": 2.674938681567517e-05, "loss": 0.0067, "step": 4813 }, { "epoch": 3.441644325290438, "grad_norm": 0.02505851723253727, "learning_rate": 2.672728460291073e-05, "loss": 0.008, "step": 4814 }, { "epoch": 3.442359249329759, "grad_norm": 0.0235043466091156, "learning_rate": 2.6705188193879316e-05, "loss": 0.0102, "step": 4815 }, { "epoch": 3.442359249329759, "eval_loss": 0.00833423063158989, "eval_runtime": 4.5895, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 4815 }, { "epoch": 3.4430741733690797, "grad_norm": 0.026538198813796043, "learning_rate": 2.6683097594091254e-05, "loss": 0.0062, "step": 4816 }, { "epoch": 3.4437890974084002, "grad_norm": 0.037957366555929184, "learning_rate": 2.666101280905553e-05, "loss": 0.007, "step": 4817 }, { "epoch": 3.444504021447721, "grad_norm": 0.02206847444176674, "learning_rate": 2.663893384427963e-05, "loss": 0.0069, "step": 4818 }, { "epoch": 3.445218945487042, "grad_norm": 0.022355493158102036, "learning_rate": 2.661686070526956e-05, "loss": 0.0066, 
"step": 4819 }, { "epoch": 3.445933869526363, "grad_norm": 0.028340017423033714, "learning_rate": 2.659479339752994e-05, "loss": 0.0075, "step": 4820 }, { "epoch": 3.445933869526363, "eval_loss": 0.008384308777749538, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4820 }, { "epoch": 3.4466487935656835, "grad_norm": 0.028901319950819016, "learning_rate": 2.6572731926563864e-05, "loss": 0.013, "step": 4821 }, { "epoch": 3.4473637176050045, "grad_norm": 0.023950446397066116, "learning_rate": 2.655067629787302e-05, "loss": 0.0059, "step": 4822 }, { "epoch": 3.4480786416443254, "grad_norm": 0.026781847700476646, "learning_rate": 2.6528626516957648e-05, "loss": 0.0103, "step": 4823 }, { "epoch": 3.448793565683646, "grad_norm": 0.02899281494319439, "learning_rate": 2.6506582589316463e-05, "loss": 0.009, "step": 4824 }, { "epoch": 3.449508489722967, "grad_norm": 0.022562192752957344, "learning_rate": 2.6484544520446798e-05, "loss": 0.0072, "step": 4825 }, { "epoch": 3.449508489722967, "eval_loss": 0.008281394839286804, "eval_runtime": 4.586, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4825 }, { "epoch": 3.450223413762288, "grad_norm": 0.021635252982378006, "learning_rate": 2.646251231584449e-05, "loss": 0.0056, "step": 4826 }, { "epoch": 3.4509383378016087, "grad_norm": 0.03186687454581261, "learning_rate": 2.644048598100388e-05, "loss": 0.0108, "step": 4827 }, { "epoch": 3.4516532618409292, "grad_norm": 0.023007245734333992, "learning_rate": 2.6418465521417913e-05, "loss": 0.0061, "step": 4828 }, { "epoch": 3.45236818588025, "grad_norm": 0.016486946493387222, "learning_rate": 2.6396450942578044e-05, "loss": 0.0051, "step": 4829 }, { "epoch": 3.453083109919571, "grad_norm": 0.022418923676013947, "learning_rate": 2.6374442249974213e-05, "loss": 0.0077, "step": 4830 }, { "epoch": 3.453083109919571, "eval_loss": 0.008096795529127121, "eval_runtime": 4.6073, "eval_samples_per_second": 10.852, "eval_steps_per_second": 2.822, "step": 4830 }, { "epoch": 3.4537980339588916, "grad_norm": 0.017956919968128204, "learning_rate": 2.635243944909495e-05, "loss": 0.0055, "step": 4831 }, { "epoch": 3.4545129579982126, "grad_norm": 0.024555616080760956, "learning_rate": 2.6330442545427303e-05, "loss": 0.0091, "step": 4832 }, { "epoch": 3.4552278820375335, "grad_norm": 0.021069001406431198, "learning_rate": 2.6308451544456864e-05, "loss": 0.0067, "step": 4833 }, { "epoch": 3.4559428060768544, "grad_norm": 0.026277832686901093, "learning_rate": 2.6286466451667692e-05, "loss": 0.0117, "step": 4834 }, { "epoch": 3.4566577301161754, "grad_norm": 0.02031269297003746, "learning_rate": 2.6264487272542426e-05, "loss": 0.0079, "step": 4835 }, { "epoch": 3.4566577301161754, "eval_loss": 0.008145750500261784, "eval_runtime": 4.6341, "eval_samples_per_second": 10.789, "eval_steps_per_second": 2.805, "step": 4835 }, { "epoch": 3.457372654155496, "grad_norm": 0.022057661786675453, "learning_rate": 2.6242514012562254e-05, "loss": 0.006, "step": 4836 }, { "epoch": 3.458087578194817, "grad_norm": 0.02272937074303627, "learning_rate": 2.6220546677206792e-05, "loss": 0.0079, "step": 4837 }, { "epoch": 3.4588025022341378, "grad_norm": 0.029701659455895424, "learning_rate": 2.619858527195428e-05, "loss": 0.0079, "step": 4838 }, { "epoch": 3.4595174262734583, "grad_norm": 0.016184702515602112, "learning_rate": 2.6176629802281437e-05, "loss": 0.0049, "step": 4839 }, { "epoch": 3.460232350312779, "grad_norm": 0.018585704267024994, "learning_rate": 
2.6154680273663468e-05, "loss": 0.0053, "step": 4840 }, { "epoch": 3.460232350312779, "eval_loss": 0.00815163180232048, "eval_runtime": 4.578, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 4840 }, { "epoch": 3.4609472743521, "grad_norm": 0.027559686452150345, "learning_rate": 2.6132736691574168e-05, "loss": 0.0113, "step": 4841 }, { "epoch": 3.461662198391421, "grad_norm": 0.020242849364876747, "learning_rate": 2.61107990614858e-05, "loss": 0.0061, "step": 4842 }, { "epoch": 3.4623771224307416, "grad_norm": 0.02354811318218708, "learning_rate": 2.6088867388869143e-05, "loss": 0.0072, "step": 4843 }, { "epoch": 3.4630920464700625, "grad_norm": 0.029736429452896118, "learning_rate": 2.606694167919353e-05, "loss": 0.0127, "step": 4844 }, { "epoch": 3.4638069705093835, "grad_norm": 0.024060862138867378, "learning_rate": 2.6045021937926738e-05, "loss": 0.0083, "step": 4845 }, { "epoch": 3.4638069705093835, "eval_loss": 0.00812817458063364, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 4845 }, { "epoch": 3.464521894548704, "grad_norm": 0.018901322036981583, "learning_rate": 2.6023108170535115e-05, "loss": 0.0054, "step": 4846 }, { "epoch": 3.465236818588025, "grad_norm": 0.027677619829773903, "learning_rate": 2.6001200382483525e-05, "loss": 0.0077, "step": 4847 }, { "epoch": 3.465951742627346, "grad_norm": 0.018249496817588806, "learning_rate": 2.5979298579235273e-05, "loss": 0.0043, "step": 4848 }, { "epoch": 3.466666666666667, "grad_norm": 0.021359844133257866, "learning_rate": 2.5957402766252237e-05, "loss": 0.0073, "step": 4849 }, { "epoch": 3.4673815907059877, "grad_norm": 0.019447600468993187, "learning_rate": 2.59355129489948e-05, "loss": 0.006, "step": 4850 }, { "epoch": 3.4673815907059877, "eval_loss": 0.008055025711655617, "eval_runtime": 4.596, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 4850 }, { "epoch": 3.4680965147453082, "grad_norm": 0.03272027522325516, "learning_rate": 2.5913629132921785e-05, "loss": 0.0136, "step": 4851 }, { "epoch": 3.468811438784629, "grad_norm": 0.020277399569749832, "learning_rate": 2.5891751323490588e-05, "loss": 0.0047, "step": 4852 }, { "epoch": 3.46952636282395, "grad_norm": 0.02798105962574482, "learning_rate": 2.586987952615707e-05, "loss": 0.0122, "step": 4853 }, { "epoch": 3.4702412868632706, "grad_norm": 0.020818714052438736, "learning_rate": 2.584801374637562e-05, "loss": 0.0061, "step": 4854 }, { "epoch": 3.4709562109025915, "grad_norm": 0.024272248148918152, "learning_rate": 2.582615398959912e-05, "loss": 0.01, "step": 4855 }, { "epoch": 3.4709562109025915, "eval_loss": 0.008082043379545212, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 4855 }, { "epoch": 3.4716711349419125, "grad_norm": 0.02475105971097946, "learning_rate": 2.5804300261278903e-05, "loss": 0.007, "step": 4856 }, { "epoch": 3.4723860589812334, "grad_norm": 0.02555907890200615, "learning_rate": 2.578245256686488e-05, "loss": 0.0076, "step": 4857 }, { "epoch": 3.473100983020554, "grad_norm": 0.03368031606078148, "learning_rate": 2.576061091180536e-05, "loss": 0.0066, "step": 4858 }, { "epoch": 3.473815907059875, "grad_norm": 0.02845730073750019, "learning_rate": 2.5738775301547236e-05, "loss": 0.0071, "step": 4859 }, { "epoch": 3.474530831099196, "grad_norm": 0.024505099281668663, "learning_rate": 2.5716945741535865e-05, "loss": 0.0062, "step": 4860 }, { "epoch": 3.474530831099196, "eval_loss": 0.008186967112123966, 
"eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 4860 }, { "epoch": 3.4752457551385163, "grad_norm": 0.018694285303354263, "learning_rate": 2.5695122237215054e-05, "loss": 0.0061, "step": 4861 }, { "epoch": 3.4759606791778372, "grad_norm": 0.022329770028591156, "learning_rate": 2.567330479402714e-05, "loss": 0.0057, "step": 4862 }, { "epoch": 3.476675603217158, "grad_norm": 0.021646864712238312, "learning_rate": 2.5651493417412985e-05, "loss": 0.0052, "step": 4863 }, { "epoch": 3.477390527256479, "grad_norm": 0.023992953822016716, "learning_rate": 2.5629688112811832e-05, "loss": 0.007, "step": 4864 }, { "epoch": 3.4781054512958, "grad_norm": 0.02137872390449047, "learning_rate": 2.56078888856615e-05, "loss": 0.0058, "step": 4865 }, { "epoch": 3.4781054512958, "eval_loss": 0.008179757744073868, "eval_runtime": 4.5931, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 4865 }, { "epoch": 3.4788203753351206, "grad_norm": 0.030872996896505356, "learning_rate": 2.558609574139829e-05, "loss": 0.0126, "step": 4866 }, { "epoch": 3.4795352993744415, "grad_norm": 0.023421868681907654, "learning_rate": 2.5564308685456917e-05, "loss": 0.0069, "step": 4867 }, { "epoch": 3.4802502234137624, "grad_norm": 0.027516189962625504, "learning_rate": 2.5542527723270655e-05, "loss": 0.0147, "step": 4868 }, { "epoch": 3.480965147453083, "grad_norm": 0.025165969505906105, "learning_rate": 2.552075286027119e-05, "loss": 0.0075, "step": 4869 }, { "epoch": 3.481680071492404, "grad_norm": 0.02207118645310402, "learning_rate": 2.5498984101888747e-05, "loss": 0.007, "step": 4870 }, { "epoch": 3.481680071492404, "eval_loss": 0.00803685374557972, "eval_runtime": 4.5845, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 4870 }, { "epoch": 3.482394995531725, "grad_norm": 0.024713819846510887, "learning_rate": 2.5477221453552013e-05, "loss": 0.0084, "step": 4871 }, { "epoch": 3.4831099195710458, "grad_norm": 0.01753501407802105, "learning_rate": 2.5455464920688105e-05, "loss": 0.0069, "step": 4872 }, { "epoch": 3.4838248436103663, "grad_norm": 0.02438146062195301, "learning_rate": 2.5433714508722674e-05, "loss": 0.0065, "step": 4873 }, { "epoch": 3.484539767649687, "grad_norm": 0.024356719106435776, "learning_rate": 2.541197022307984e-05, "loss": 0.0109, "step": 4874 }, { "epoch": 3.485254691689008, "grad_norm": 0.02912554331123829, "learning_rate": 2.5390232069182118e-05, "loss": 0.0124, "step": 4875 }, { "epoch": 3.485254691689008, "eval_loss": 0.00801042653620243, "eval_runtime": 4.5814, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 4875 }, { "epoch": 3.4859696157283286, "grad_norm": 0.02168929949402809, "learning_rate": 2.5368500052450638e-05, "loss": 0.0071, "step": 4876 }, { "epoch": 3.4866845397676496, "grad_norm": 0.02228470705449581, "learning_rate": 2.5346774178304845e-05, "loss": 0.0064, "step": 4877 }, { "epoch": 3.4873994638069705, "grad_norm": 0.0225304514169693, "learning_rate": 2.532505445216275e-05, "loss": 0.0091, "step": 4878 }, { "epoch": 3.4881143878462915, "grad_norm": 0.02690170891582966, "learning_rate": 2.530334087944082e-05, "loss": 0.0054, "step": 4879 }, { "epoch": 3.488829311885612, "grad_norm": 0.02556675672531128, "learning_rate": 2.528163346555392e-05, "loss": 0.0068, "step": 4880 }, { "epoch": 3.488829311885612, "eval_loss": 0.008240952156484127, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4880 }, { "epoch": 
3.489544235924933, "grad_norm": 0.02298605442047119, "learning_rate": 2.5259932215915487e-05, "loss": 0.0084, "step": 4881 }, { "epoch": 3.490259159964254, "grad_norm": 0.02693949081003666, "learning_rate": 2.5238237135937315e-05, "loss": 0.0056, "step": 4882 }, { "epoch": 3.490974084003575, "grad_norm": 0.01579262688755989, "learning_rate": 2.5216548231029723e-05, "loss": 0.004, "step": 4883 }, { "epoch": 3.4916890080428953, "grad_norm": 0.02332223206758499, "learning_rate": 2.5194865506601504e-05, "loss": 0.0053, "step": 4884 }, { "epoch": 3.4924039320822162, "grad_norm": 0.018989156931638718, "learning_rate": 2.5173188968059836e-05, "loss": 0.0045, "step": 4885 }, { "epoch": 3.4924039320822162, "eval_loss": 0.00819552130997181, "eval_runtime": 4.5797, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 4885 }, { "epoch": 3.493118856121537, "grad_norm": 0.023014338687062263, "learning_rate": 2.515151862081041e-05, "loss": 0.0072, "step": 4886 }, { "epoch": 3.493833780160858, "grad_norm": 0.03122827038168907, "learning_rate": 2.5129854470257397e-05, "loss": 0.0087, "step": 4887 }, { "epoch": 3.4945487042001786, "grad_norm": 0.023656141012907028, "learning_rate": 2.5108196521803335e-05, "loss": 0.0103, "step": 4888 }, { "epoch": 3.4952636282394995, "grad_norm": 0.033159296959638596, "learning_rate": 2.508654478084929e-05, "loss": 0.0089, "step": 4889 }, { "epoch": 3.4959785522788205, "grad_norm": 0.026589566841721535, "learning_rate": 2.5064899252794783e-05, "loss": 0.0088, "step": 4890 }, { "epoch": 3.4959785522788205, "eval_loss": 0.008277270942926407, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 4890 }, { "epoch": 3.496693476318141, "grad_norm": 0.02528086118400097, "learning_rate": 2.504325994303771e-05, "loss": 0.0059, "step": 4891 }, { "epoch": 3.497408400357462, "grad_norm": 0.02344701439142227, "learning_rate": 2.502162685697452e-05, "loss": 0.0066, "step": 4892 }, { "epoch": 3.498123324396783, "grad_norm": 0.023317309096455574, "learning_rate": 2.500000000000001e-05, "loss": 0.0062, "step": 4893 }, { "epoch": 3.498838248436104, "grad_norm": 0.02708684653043747, "learning_rate": 2.4978379377507488e-05, "loss": 0.0076, "step": 4894 }, { "epoch": 3.4995531724754243, "grad_norm": 0.019320989027619362, "learning_rate": 2.495676499488871e-05, "loss": 0.0042, "step": 4895 }, { "epoch": 3.4995531724754243, "eval_loss": 0.008398124948143959, "eval_runtime": 4.5804, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 4895 }, { "epoch": 3.5002680965147452, "grad_norm": 0.03042864054441452, "learning_rate": 2.4935156857533816e-05, "loss": 0.0109, "step": 4896 }, { "epoch": 3.500983020554066, "grad_norm": 0.018291238695383072, "learning_rate": 2.491355497083145e-05, "loss": 0.0042, "step": 4897 }, { "epoch": 3.5016979445933867, "grad_norm": 0.030053384602069855, "learning_rate": 2.4891959340168668e-05, "loss": 0.0086, "step": 4898 }, { "epoch": 3.5024128686327076, "grad_norm": 0.023789148777723312, "learning_rate": 2.4870369970930977e-05, "loss": 0.0051, "step": 4899 }, { "epoch": 3.5031277926720286, "grad_norm": 0.02183215692639351, "learning_rate": 2.4848786868502342e-05, "loss": 0.0046, "step": 4900 }, { "epoch": 3.5031277926720286, "eval_loss": 0.008213535882532597, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 4900 }, { "epoch": 3.5038427167113495, "grad_norm": 0.021718047559261322, "learning_rate": 2.4827210038265102e-05, "loss": 0.0071, 
"step": 4901 }, { "epoch": 3.5045576407506704, "grad_norm": 0.022741766646504402, "learning_rate": 2.4805639485600084e-05, "loss": 0.0059, "step": 4902 }, { "epoch": 3.505272564789991, "grad_norm": 0.020603349432349205, "learning_rate": 2.478407521588656e-05, "loss": 0.0044, "step": 4903 }, { "epoch": 3.505987488829312, "grad_norm": 0.022775372490286827, "learning_rate": 2.4762517234502168e-05, "loss": 0.0064, "step": 4904 }, { "epoch": 3.506702412868633, "grad_norm": 0.02532549761235714, "learning_rate": 2.4740965546823075e-05, "loss": 0.0085, "step": 4905 }, { "epoch": 3.506702412868633, "eval_loss": 0.008199973963201046, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 4905 }, { "epoch": 3.5074173369079533, "grad_norm": 0.027857914566993713, "learning_rate": 2.4719420158223773e-05, "loss": 0.0083, "step": 4906 }, { "epoch": 3.5081322609472743, "grad_norm": 0.03225867822766304, "learning_rate": 2.469788107407726e-05, "loss": 0.0082, "step": 4907 }, { "epoch": 3.508847184986595, "grad_norm": 0.02414637990295887, "learning_rate": 2.4676348299754952e-05, "loss": 0.0064, "step": 4908 }, { "epoch": 3.509562109025916, "grad_norm": 0.02392679452896118, "learning_rate": 2.465482184062665e-05, "loss": 0.0074, "step": 4909 }, { "epoch": 3.510277033065237, "grad_norm": 0.027604682371020317, "learning_rate": 2.4633301702060612e-05, "loss": 0.0109, "step": 4910 }, { "epoch": 3.510277033065237, "eval_loss": 0.00823928602039814, "eval_runtime": 4.586, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 4910 }, { "epoch": 3.5109919571045576, "grad_norm": 0.029050031676888466, "learning_rate": 2.4611787889423543e-05, "loss": 0.0124, "step": 4911 }, { "epoch": 3.5117068811438785, "grad_norm": 0.022290395572781563, "learning_rate": 2.45902804080805e-05, "loss": 0.0059, "step": 4912 }, { "epoch": 3.512421805183199, "grad_norm": 0.030168259516358376, "learning_rate": 2.4568779263395026e-05, "loss": 0.0075, "step": 4913 }, { "epoch": 3.51313672922252, "grad_norm": 0.023851625621318817, "learning_rate": 2.454728446072907e-05, "loss": 0.0079, "step": 4914 }, { "epoch": 3.513851653261841, "grad_norm": 0.02403203397989273, "learning_rate": 2.4525796005442963e-05, "loss": 0.0049, "step": 4915 }, { "epoch": 3.513851653261841, "eval_loss": 0.008366400375962257, "eval_runtime": 4.5797, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 4915 }, { "epoch": 3.514566577301162, "grad_norm": 0.02500888705253601, "learning_rate": 2.450431390289552e-05, "loss": 0.0045, "step": 4916 }, { "epoch": 3.515281501340483, "grad_norm": 0.023345792666077614, "learning_rate": 2.4482838158443882e-05, "loss": 0.0077, "step": 4917 }, { "epoch": 3.5159964253798033, "grad_norm": 0.024431174620985985, "learning_rate": 2.4461368777443683e-05, "loss": 0.006, "step": 4918 }, { "epoch": 3.516711349419124, "grad_norm": 0.029950859025120735, "learning_rate": 2.4439905765248945e-05, "loss": 0.0098, "step": 4919 }, { "epoch": 3.517426273458445, "grad_norm": 0.024214861914515495, "learning_rate": 2.441844912721209e-05, "loss": 0.0064, "step": 4920 }, { "epoch": 3.517426273458445, "eval_loss": 0.008368651382625103, "eval_runtime": 4.5819, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4920 }, { "epoch": 3.5181411974977657, "grad_norm": 0.026137931272387505, "learning_rate": 2.439699886868398e-05, "loss": 0.0072, "step": 4921 }, { "epoch": 3.5188561215370866, "grad_norm": 0.021378815174102783, "learning_rate": 
2.4375554995013826e-05, "loss": 0.0049, "step": 4922 }, { "epoch": 3.5195710455764075, "grad_norm": 0.022469570860266685, "learning_rate": 2.4354117511549314e-05, "loss": 0.006, "step": 4923 }, { "epoch": 3.5202859696157285, "grad_norm": 0.025574415922164917, "learning_rate": 2.4332686423636515e-05, "loss": 0.0077, "step": 4924 }, { "epoch": 3.5210008936550494, "grad_norm": 0.01976904459297657, "learning_rate": 2.431126173661986e-05, "loss": 0.0048, "step": 4925 }, { "epoch": 3.5210008936550494, "eval_loss": 0.008344686590135098, "eval_runtime": 4.5787, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 4925 }, { "epoch": 3.52171581769437, "grad_norm": 0.021154150366783142, "learning_rate": 2.4289843455842253e-05, "loss": 0.0068, "step": 4926 }, { "epoch": 3.522430741733691, "grad_norm": 0.0249443668872118, "learning_rate": 2.4268431586644985e-05, "loss": 0.0097, "step": 4927 }, { "epoch": 3.5231456657730114, "grad_norm": 0.02614513225853443, "learning_rate": 2.424702613436769e-05, "loss": 0.0103, "step": 4928 }, { "epoch": 3.5238605898123323, "grad_norm": 0.029453027993440628, "learning_rate": 2.422562710434848e-05, "loss": 0.012, "step": 4929 }, { "epoch": 3.5245755138516532, "grad_norm": 0.02024916000664234, "learning_rate": 2.4204234501923794e-05, "loss": 0.007, "step": 4930 }, { "epoch": 3.5245755138516532, "eval_loss": 0.008233697153627872, "eval_runtime": 4.5779, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 4930 }, { "epoch": 3.525290437890974, "grad_norm": 0.01821180246770382, "learning_rate": 2.4182848332428532e-05, "loss": 0.0063, "step": 4931 }, { "epoch": 3.526005361930295, "grad_norm": 0.021290866658091545, "learning_rate": 2.4161468601195964e-05, "loss": 0.0057, "step": 4932 }, { "epoch": 3.5267202859696156, "grad_norm": 0.02275168150663376, "learning_rate": 2.4140095313557732e-05, "loss": 0.0064, "step": 4933 }, { "epoch": 3.5274352100089366, "grad_norm": 0.023119205608963966, "learning_rate": 2.4118728474843892e-05, "loss": 0.0086, "step": 4934 }, { "epoch": 3.5281501340482575, "grad_norm": 0.02161107398569584, "learning_rate": 2.409736809038292e-05, "loss": 0.0059, "step": 4935 }, { "epoch": 3.5281501340482575, "eval_loss": 0.008300404995679855, "eval_runtime": 4.6106, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 4935 }, { "epoch": 3.528865058087578, "grad_norm": 0.028413452208042145, "learning_rate": 2.407601416550162e-05, "loss": 0.0074, "step": 4936 }, { "epoch": 3.529579982126899, "grad_norm": 0.025913024321198463, "learning_rate": 2.4054666705525227e-05, "loss": 0.0115, "step": 4937 }, { "epoch": 3.53029490616622, "grad_norm": 0.021268069744110107, "learning_rate": 2.4033325715777376e-05, "loss": 0.008, "step": 4938 }, { "epoch": 3.531009830205541, "grad_norm": 0.023303169757127762, "learning_rate": 2.4011991201580037e-05, "loss": 0.0049, "step": 4939 }, { "epoch": 3.5317247542448613, "grad_norm": 0.020776720717549324, "learning_rate": 2.3990663168253624e-05, "loss": 0.0052, "step": 4940 }, { "epoch": 3.5317247542448613, "eval_loss": 0.008536205627024174, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 4940 }, { "epoch": 3.5324396782841823, "grad_norm": 0.026149936020374298, "learning_rate": 2.396934162111686e-05, "loss": 0.0077, "step": 4941 }, { "epoch": 3.533154602323503, "grad_norm": 0.024004608392715454, "learning_rate": 2.3948026565486964e-05, "loss": 0.008, "step": 4942 }, { "epoch": 3.5338695263628237, "grad_norm": 
0.02184041403234005, "learning_rate": 2.3926718006679416e-05, "loss": 0.0039, "step": 4943 }, { "epoch": 3.5345844504021446, "grad_norm": 0.030292687937617302, "learning_rate": 2.3905415950008146e-05, "loss": 0.0084, "step": 4944 }, { "epoch": 3.5352993744414656, "grad_norm": 0.029898490756750107, "learning_rate": 2.388412040078547e-05, "loss": 0.0105, "step": 4945 }, { "epoch": 3.5352993744414656, "eval_loss": 0.00852225162088871, "eval_runtime": 4.597, "eval_samples_per_second": 10.877, "eval_steps_per_second": 2.828, "step": 4945 }, { "epoch": 3.5360142984807865, "grad_norm": 0.022993572056293488, "learning_rate": 2.3862831364322013e-05, "loss": 0.0079, "step": 4946 }, { "epoch": 3.5367292225201075, "grad_norm": 0.030914457514882088, "learning_rate": 2.384154884592684e-05, "loss": 0.01, "step": 4947 }, { "epoch": 3.537444146559428, "grad_norm": 0.025593506172299385, "learning_rate": 2.3820272850907394e-05, "loss": 0.0067, "step": 4948 }, { "epoch": 3.538159070598749, "grad_norm": 0.026204992085695267, "learning_rate": 2.379900338456942e-05, "loss": 0.007, "step": 4949 }, { "epoch": 3.53887399463807, "grad_norm": 0.022692400962114334, "learning_rate": 2.3777740452217108e-05, "loss": 0.0079, "step": 4950 }, { "epoch": 3.53887399463807, "eval_loss": 0.008340204134583473, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4950 }, { "epoch": 3.5395889186773903, "grad_norm": 0.02302849292755127, "learning_rate": 2.375648405915301e-05, "loss": 0.0092, "step": 4951 }, { "epoch": 3.5403038427167113, "grad_norm": 0.02339511550962925, "learning_rate": 2.373523421067799e-05, "loss": 0.0064, "step": 4952 }, { "epoch": 3.541018766756032, "grad_norm": 0.020449267700314522, "learning_rate": 2.3713990912091366e-05, "loss": 0.0061, "step": 4953 }, { "epoch": 3.541733690795353, "grad_norm": 0.019431089982390404, "learning_rate": 2.3692754168690727e-05, "loss": 0.0049, "step": 4954 }, { "epoch": 3.5424486148346737, "grad_norm": 0.02170463278889656, "learning_rate": 2.3671523985772104e-05, "loss": 0.0089, "step": 4955 }, { "epoch": 3.5424486148346737, "eval_loss": 0.008330865763127804, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 4955 }, { "epoch": 3.5431635388739946, "grad_norm": 0.01933048851788044, "learning_rate": 2.365030036862988e-05, "loss": 0.0045, "step": 4956 }, { "epoch": 3.5438784629133155, "grad_norm": 0.023595716804265976, "learning_rate": 2.3629083322556744e-05, "loss": 0.0055, "step": 4957 }, { "epoch": 3.544593386952636, "grad_norm": 0.02327708527445793, "learning_rate": 2.3607872852843815e-05, "loss": 0.0099, "step": 4958 }, { "epoch": 3.545308310991957, "grad_norm": 0.018815314397215843, "learning_rate": 2.358666896478056e-05, "loss": 0.005, "step": 4959 }, { "epoch": 3.546023235031278, "grad_norm": 0.017937609925866127, "learning_rate": 2.3565471663654753e-05, "loss": 0.0055, "step": 4960 }, { "epoch": 3.546023235031278, "eval_loss": 0.008449967950582504, "eval_runtime": 4.5991, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 4960 }, { "epoch": 3.546738159070599, "grad_norm": 0.02035214565694332, "learning_rate": 2.3544280954752573e-05, "loss": 0.0074, "step": 4961 }, { "epoch": 3.54745308310992, "grad_norm": 0.024309681728482246, "learning_rate": 2.3523096843358573e-05, "loss": 0.0104, "step": 4962 }, { "epoch": 3.5481680071492403, "grad_norm": 0.019211556762456894, "learning_rate": 2.3501919334755584e-05, "loss": 0.0066, "step": 4963 }, { "epoch": 
3.5488829311885612, "grad_norm": 0.023883512243628502, "learning_rate": 2.348074843422487e-05, "loss": 0.0095, "step": 4964 }, { "epoch": 3.549597855227882, "grad_norm": 0.02149995230138302, "learning_rate": 2.3459584147045998e-05, "loss": 0.0089, "step": 4965 }, { "epoch": 3.549597855227882, "eval_loss": 0.008358889259397984, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 4965 }, { "epoch": 3.5503127792672027, "grad_norm": 0.021258721128106117, "learning_rate": 2.3438426478496932e-05, "loss": 0.0069, "step": 4966 }, { "epoch": 3.5510277033065236, "grad_norm": 0.018782060593366623, "learning_rate": 2.341727543385392e-05, "loss": 0.0049, "step": 4967 }, { "epoch": 3.5517426273458446, "grad_norm": 0.027173584327101707, "learning_rate": 2.3396131018391605e-05, "loss": 0.0101, "step": 4968 }, { "epoch": 3.5524575513851655, "grad_norm": 0.022042736411094666, "learning_rate": 2.3374993237382992e-05, "loss": 0.0063, "step": 4969 }, { "epoch": 3.553172475424486, "grad_norm": 0.01961604319512844, "learning_rate": 2.3353862096099367e-05, "loss": 0.0052, "step": 4970 }, { "epoch": 3.553172475424486, "eval_loss": 0.00840297993272543, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 4970 }, { "epoch": 3.553887399463807, "grad_norm": 0.026447996497154236, "learning_rate": 2.3332737599810412e-05, "loss": 0.0083, "step": 4971 }, { "epoch": 3.554602323503128, "grad_norm": 0.02616027556359768, "learning_rate": 2.3311619753784165e-05, "loss": 0.0075, "step": 4972 }, { "epoch": 3.5553172475424484, "grad_norm": 0.029261864721775055, "learning_rate": 2.329050856328694e-05, "loss": 0.0124, "step": 4973 }, { "epoch": 3.5560321715817693, "grad_norm": 0.02687664143741131, "learning_rate": 2.3269404033583442e-05, "loss": 0.007, "step": 4974 }, { "epoch": 3.5567470956210903, "grad_norm": 0.02529659867286682, "learning_rate": 2.324830616993673e-05, "loss": 0.0073, "step": 4975 }, { "epoch": 3.5567470956210903, "eval_loss": 0.008415605872869492, "eval_runtime": 4.6091, "eval_samples_per_second": 10.848, "eval_steps_per_second": 2.821, "step": 4975 }, { "epoch": 3.557462019660411, "grad_norm": 0.033146630972623825, "learning_rate": 2.3227214977608136e-05, "loss": 0.0131, "step": 4976 }, { "epoch": 3.558176943699732, "grad_norm": 0.02379899099469185, "learning_rate": 2.32061304618574e-05, "loss": 0.0063, "step": 4977 }, { "epoch": 3.5588918677390526, "grad_norm": 0.026553379371762276, "learning_rate": 2.3185052627942527e-05, "loss": 0.0134, "step": 4978 }, { "epoch": 3.5596067917783736, "grad_norm": 0.03206164762377739, "learning_rate": 2.3163981481119913e-05, "loss": 0.0071, "step": 4979 }, { "epoch": 3.5603217158176945, "grad_norm": 0.026612846180796623, "learning_rate": 2.314291702664428e-05, "loss": 0.0059, "step": 4980 }, { "epoch": 3.5603217158176945, "eval_loss": 0.008240469731390476, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 4980 }, { "epoch": 3.561036639857015, "grad_norm": 0.03350212052464485, "learning_rate": 2.3121859269768638e-05, "loss": 0.0152, "step": 4981 }, { "epoch": 3.561751563896336, "grad_norm": 0.0219548549503088, "learning_rate": 2.3100808215744363e-05, "loss": 0.0088, "step": 4982 }, { "epoch": 3.562466487935657, "grad_norm": 0.02564047835767269, "learning_rate": 2.3079763869821174e-05, "loss": 0.0058, "step": 4983 }, { "epoch": 3.563181411974978, "grad_norm": 0.018351353704929352, "learning_rate": 2.3058726237247065e-05, "loss": 0.0056, 
"step": 4984 }, { "epoch": 3.5638963360142983, "grad_norm": 0.020484741777181625, "learning_rate": 2.3037695323268394e-05, "loss": 0.0053, "step": 4985 }, { "epoch": 3.5638963360142983, "eval_loss": 0.008029916323721409, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 4985 }, { "epoch": 3.5646112600536193, "grad_norm": 0.019640447571873665, "learning_rate": 2.3016671133129835e-05, "loss": 0.0066, "step": 4986 }, { "epoch": 3.56532618409294, "grad_norm": 0.022813454270362854, "learning_rate": 2.2995653672074398e-05, "loss": 0.0072, "step": 4987 }, { "epoch": 3.5660411081322607, "grad_norm": 0.024910999462008476, "learning_rate": 2.2974642945343406e-05, "loss": 0.0058, "step": 4988 }, { "epoch": 3.5667560321715817, "grad_norm": 0.020132726058363914, "learning_rate": 2.2953638958176464e-05, "loss": 0.0075, "step": 4989 }, { "epoch": 3.5674709562109026, "grad_norm": 0.018625637516379356, "learning_rate": 2.2932641715811582e-05, "loss": 0.0061, "step": 4990 }, { "epoch": 3.5674709562109026, "eval_loss": 0.008027344010770321, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 4990 }, { "epoch": 3.5681858802502235, "grad_norm": 0.022247133776545525, "learning_rate": 2.2911651223484987e-05, "loss": 0.0081, "step": 4991 }, { "epoch": 3.5689008042895445, "grad_norm": 0.019625281915068626, "learning_rate": 2.2890667486431293e-05, "loss": 0.0054, "step": 4992 }, { "epoch": 3.569615728328865, "grad_norm": 0.02079673856496811, "learning_rate": 2.2869690509883434e-05, "loss": 0.0048, "step": 4993 }, { "epoch": 3.570330652368186, "grad_norm": 0.03123803623020649, "learning_rate": 2.284872029907259e-05, "loss": 0.0105, "step": 4994 }, { "epoch": 3.571045576407507, "grad_norm": 0.02464817650616169, "learning_rate": 2.2827756859228316e-05, "loss": 0.0057, "step": 4995 }, { "epoch": 3.571045576407507, "eval_loss": 0.008109057322144508, "eval_runtime": 4.5878, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 4995 }, { "epoch": 3.5717605004468274, "grad_norm": 0.023095345124602318, "learning_rate": 2.280680019557848e-05, "loss": 0.0087, "step": 4996 }, { "epoch": 3.5724754244861483, "grad_norm": 0.033568382263183594, "learning_rate": 2.2785850313349205e-05, "loss": 0.013, "step": 4997 }, { "epoch": 3.5731903485254692, "grad_norm": 0.02110052853822708, "learning_rate": 2.276490721776497e-05, "loss": 0.0075, "step": 4998 }, { "epoch": 3.57390527256479, "grad_norm": 0.023484840989112854, "learning_rate": 2.2743970914048574e-05, "loss": 0.0055, "step": 4999 }, { "epoch": 3.5746201966041107, "grad_norm": 0.028160491958260536, "learning_rate": 2.2723041407421053e-05, "loss": 0.0108, "step": 5000 }, { "epoch": 3.5746201966041107, "eval_loss": 0.008175024762749672, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5000 }, { "epoch": 3.5753351206434316, "grad_norm": 0.021381784230470657, "learning_rate": 2.270211870310184e-05, "loss": 0.0048, "step": 5001 }, { "epoch": 3.5760500446827526, "grad_norm": 0.032119862735271454, "learning_rate": 2.268120280630857e-05, "loss": 0.0065, "step": 5002 }, { "epoch": 3.576764968722073, "grad_norm": 0.02312600240111351, "learning_rate": 2.266029372225727e-05, "loss": 0.0053, "step": 5003 }, { "epoch": 3.577479892761394, "grad_norm": 0.031012529507279396, "learning_rate": 2.2639391456162246e-05, "loss": 0.0082, "step": 5004 }, { "epoch": 3.578194816800715, "grad_norm": 0.024921439588069916, "learning_rate": 
2.2618496013236045e-05, "loss": 0.0128, "step": 5005 }, { "epoch": 3.578194816800715, "eval_loss": 0.00827100034803152, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 5005 }, { "epoch": 3.578909740840036, "grad_norm": 0.018323950469493866, "learning_rate": 2.2597607398689587e-05, "loss": 0.0044, "step": 5006 }, { "epoch": 3.579624664879357, "grad_norm": 0.02920425869524479, "learning_rate": 2.257672561773207e-05, "loss": 0.0062, "step": 5007 }, { "epoch": 3.5803395889186773, "grad_norm": 0.01886408030986786, "learning_rate": 2.255585067557093e-05, "loss": 0.0045, "step": 5008 }, { "epoch": 3.5810545129579983, "grad_norm": 0.029960997402668, "learning_rate": 2.253498257741201e-05, "loss": 0.0101, "step": 5009 }, { "epoch": 3.581769436997319, "grad_norm": 0.031917788088321686, "learning_rate": 2.2514121328459332e-05, "loss": 0.0059, "step": 5010 }, { "epoch": 3.581769436997319, "eval_loss": 0.008287528529763222, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5010 }, { "epoch": 3.5824843610366397, "grad_norm": 0.017780696973204613, "learning_rate": 2.2493266933915276e-05, "loss": 0.0044, "step": 5011 }, { "epoch": 3.5831992850759606, "grad_norm": 0.026671387255191803, "learning_rate": 2.247241939898051e-05, "loss": 0.0079, "step": 5012 }, { "epoch": 3.5839142091152816, "grad_norm": 0.02150764688849449, "learning_rate": 2.2451578728853945e-05, "loss": 0.0064, "step": 5013 }, { "epoch": 3.5846291331546025, "grad_norm": 0.02775406651198864, "learning_rate": 2.2430744928732855e-05, "loss": 0.008, "step": 5014 }, { "epoch": 3.585344057193923, "grad_norm": 0.026315366849303246, "learning_rate": 2.240991800381271e-05, "loss": 0.0072, "step": 5015 }, { "epoch": 3.585344057193923, "eval_loss": 0.008237438276410103, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5015 }, { "epoch": 3.586058981233244, "grad_norm": 0.02167453244328499, "learning_rate": 2.2389097959287338e-05, "loss": 0.0088, "step": 5016 }, { "epoch": 3.586773905272565, "grad_norm": 0.02223857492208481, "learning_rate": 2.2368284800348842e-05, "loss": 0.0085, "step": 5017 }, { "epoch": 3.5874888293118854, "grad_norm": 0.02158801071345806, "learning_rate": 2.234747853218756e-05, "loss": 0.0055, "step": 5018 }, { "epoch": 3.5882037533512063, "grad_norm": 0.022844897583127022, "learning_rate": 2.2326679159992157e-05, "loss": 0.0072, "step": 5019 }, { "epoch": 3.5889186773905273, "grad_norm": 0.019999660551548004, "learning_rate": 2.230588668894959e-05, "loss": 0.0058, "step": 5020 }, { "epoch": 3.5889186773905273, "eval_loss": 0.008233774453401566, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5020 }, { "epoch": 3.589633601429848, "grad_norm": 0.03444530814886093, "learning_rate": 2.228510112424503e-05, "loss": 0.0095, "step": 5021 }, { "epoch": 3.590348525469169, "grad_norm": 0.03165658935904503, "learning_rate": 2.2264322471061988e-05, "loss": 0.0119, "step": 5022 }, { "epoch": 3.5910634495084897, "grad_norm": 0.02723386324942112, "learning_rate": 2.224355073458224e-05, "loss": 0.0102, "step": 5023 }, { "epoch": 3.5917783735478106, "grad_norm": 0.019087424501776695, "learning_rate": 2.22227859199858e-05, "loss": 0.004, "step": 5024 }, { "epoch": 3.592493297587131, "grad_norm": 0.02998395636677742, "learning_rate": 2.220202803245101e-05, "loss": 0.0086, "step": 5025 }, { "epoch": 3.592493297587131, "eval_loss": 0.008131448179483414, 
"eval_runtime": 4.5789, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 5025 }, { "epoch": 3.593208221626452, "grad_norm": 0.02314995415508747, "learning_rate": 2.2181277077154426e-05, "loss": 0.0058, "step": 5026 }, { "epoch": 3.593923145665773, "grad_norm": 0.023408176377415657, "learning_rate": 2.2160533059270926e-05, "loss": 0.0094, "step": 5027 }, { "epoch": 3.594638069705094, "grad_norm": 0.028399208560585976, "learning_rate": 2.2139795983973656e-05, "loss": 0.0098, "step": 5028 }, { "epoch": 3.595352993744415, "grad_norm": 0.02401285246014595, "learning_rate": 2.2119065856433967e-05, "loss": 0.0063, "step": 5029 }, { "epoch": 3.5960679177837354, "grad_norm": 0.03950977697968483, "learning_rate": 2.2098342681821556e-05, "loss": 0.0081, "step": 5030 }, { "epoch": 3.5960679177837354, "eval_loss": 0.00812968797981739, "eval_runtime": 4.6131, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.818, "step": 5030 }, { "epoch": 3.5967828418230563, "grad_norm": 0.018738172948360443, "learning_rate": 2.207762646530434e-05, "loss": 0.0039, "step": 5031 }, { "epoch": 3.5974977658623772, "grad_norm": 0.024599405005574226, "learning_rate": 2.2056917212048524e-05, "loss": 0.0076, "step": 5032 }, { "epoch": 3.5982126899016977, "grad_norm": 0.022450441494584084, "learning_rate": 2.203621492721858e-05, "loss": 0.0074, "step": 5033 }, { "epoch": 3.5989276139410187, "grad_norm": 0.02578446827828884, "learning_rate": 2.2015519615977194e-05, "loss": 0.0082, "step": 5034 }, { "epoch": 3.5996425379803396, "grad_norm": 0.025684231892228127, "learning_rate": 2.1994831283485363e-05, "loss": 0.0123, "step": 5035 }, { "epoch": 3.5996425379803396, "eval_loss": 0.008055629208683968, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5035 }, { "epoch": 3.6003574620196606, "grad_norm": 0.023534733802080154, "learning_rate": 2.197414993490235e-05, "loss": 0.0078, "step": 5036 }, { "epoch": 3.6010723860589815, "grad_norm": 0.019114935770630836, "learning_rate": 2.195347557538562e-05, "loss": 0.0041, "step": 5037 }, { "epoch": 3.601787310098302, "grad_norm": 0.023599419742822647, "learning_rate": 2.193280821009096e-05, "loss": 0.0082, "step": 5038 }, { "epoch": 3.602502234137623, "grad_norm": 0.024303052574396133, "learning_rate": 2.1912147844172353e-05, "loss": 0.0089, "step": 5039 }, { "epoch": 3.6032171581769434, "grad_norm": 0.022866088896989822, "learning_rate": 2.1891494482782084e-05, "loss": 0.0065, "step": 5040 }, { "epoch": 3.6032171581769434, "eval_loss": 0.00808207131922245, "eval_runtime": 4.5873, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 5040 }, { "epoch": 3.6039320822162644, "grad_norm": 0.03404388204216957, "learning_rate": 2.1870848131070687e-05, "loss": 0.0094, "step": 5041 }, { "epoch": 3.6046470062555853, "grad_norm": 0.03634524717926979, "learning_rate": 2.18502087941869e-05, "loss": 0.0054, "step": 5042 }, { "epoch": 3.6053619302949063, "grad_norm": 0.027072053402662277, "learning_rate": 2.1829576477277763e-05, "loss": 0.0087, "step": 5043 }, { "epoch": 3.606076854334227, "grad_norm": 0.02598402462899685, "learning_rate": 2.1808951185488567e-05, "loss": 0.0055, "step": 5044 }, { "epoch": 3.6067917783735477, "grad_norm": 0.028509104624390602, "learning_rate": 2.17883329239628e-05, "loss": 0.0097, "step": 5045 }, { "epoch": 3.6067917783735477, "eval_loss": 0.007975279353559017, "eval_runtime": 4.58, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5045 }, { 
"epoch": 3.6075067024128686, "grad_norm": 0.02923518605530262, "learning_rate": 2.176772169784224e-05, "loss": 0.0104, "step": 5046 }, { "epoch": 3.6082216264521896, "grad_norm": 0.02402065135538578, "learning_rate": 2.174711751226693e-05, "loss": 0.0062, "step": 5047 }, { "epoch": 3.60893655049151, "grad_norm": 0.027452712878584862, "learning_rate": 2.1726520372375075e-05, "loss": 0.0084, "step": 5048 }, { "epoch": 3.609651474530831, "grad_norm": 0.024389801546931267, "learning_rate": 2.170593028330322e-05, "loss": 0.0069, "step": 5049 }, { "epoch": 3.610366398570152, "grad_norm": 0.025911787524819374, "learning_rate": 2.168534725018607e-05, "loss": 0.0086, "step": 5050 }, { "epoch": 3.610366398570152, "eval_loss": 0.007867034524679184, "eval_runtime": 4.5915, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 5050 }, { "epoch": 3.611081322609473, "grad_norm": 0.021777866408228874, "learning_rate": 2.166477127815663e-05, "loss": 0.0054, "step": 5051 }, { "epoch": 3.611796246648794, "grad_norm": 0.026926683261990547, "learning_rate": 2.1644202372346113e-05, "loss": 0.0128, "step": 5052 }, { "epoch": 3.6125111706881143, "grad_norm": 0.01957591623067856, "learning_rate": 2.1623640537883977e-05, "loss": 0.0081, "step": 5053 }, { "epoch": 3.6132260947274353, "grad_norm": 0.02134442888200283, "learning_rate": 2.1603085779897935e-05, "loss": 0.0071, "step": 5054 }, { "epoch": 3.6139410187667558, "grad_norm": 0.019170865416526794, "learning_rate": 2.1582538103513893e-05, "loss": 0.0065, "step": 5055 }, { "epoch": 3.6139410187667558, "eval_loss": 0.007884739898145199, "eval_runtime": 4.5825, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5055 }, { "epoch": 3.6146559428060767, "grad_norm": 0.019748665392398834, "learning_rate": 2.1561997513856024e-05, "loss": 0.0047, "step": 5056 }, { "epoch": 3.6153708668453977, "grad_norm": 0.025222398340702057, "learning_rate": 2.1541464016046746e-05, "loss": 0.0085, "step": 5057 }, { "epoch": 3.6160857908847186, "grad_norm": 0.01916574127972126, "learning_rate": 2.152093761520665e-05, "loss": 0.0054, "step": 5058 }, { "epoch": 3.6168007149240395, "grad_norm": 0.023502998054027557, "learning_rate": 2.1500418316454617e-05, "loss": 0.0058, "step": 5059 }, { "epoch": 3.61751563896336, "grad_norm": 0.02818041481077671, "learning_rate": 2.147990612490775e-05, "loss": 0.0067, "step": 5060 }, { "epoch": 3.61751563896336, "eval_loss": 0.007893518544733524, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5060 }, { "epoch": 3.618230563002681, "grad_norm": 0.02406724914908409, "learning_rate": 2.1459401045681326e-05, "loss": 0.0085, "step": 5061 }, { "epoch": 3.618945487042002, "grad_norm": 0.02248404175043106, "learning_rate": 2.1438903083888927e-05, "loss": 0.0071, "step": 5062 }, { "epoch": 3.6196604110813224, "grad_norm": 0.02306230738759041, "learning_rate": 2.141841224464229e-05, "loss": 0.0054, "step": 5063 }, { "epoch": 3.6203753351206434, "grad_norm": 0.02857796475291252, "learning_rate": 2.139792853305141e-05, "loss": 0.0053, "step": 5064 }, { "epoch": 3.6210902591599643, "grad_norm": 0.021453263238072395, "learning_rate": 2.1377451954224525e-05, "loss": 0.005, "step": 5065 }, { "epoch": 3.6210902591599643, "eval_loss": 0.007941114716231823, "eval_runtime": 4.6065, "eval_samples_per_second": 10.854, "eval_steps_per_second": 2.822, "step": 5065 }, { "epoch": 3.6218051831992852, "grad_norm": 0.02228475548326969, "learning_rate": 2.135698251326803e-05, "loss": 
0.004, "step": 5066 }, { "epoch": 3.6225201072386057, "grad_norm": 0.02313629910349846, "learning_rate": 2.133652021528661e-05, "loss": 0.0073, "step": 5067 }, { "epoch": 3.6232350312779267, "grad_norm": 0.024446401745080948, "learning_rate": 2.131606506538314e-05, "loss": 0.007, "step": 5068 }, { "epoch": 3.6239499553172476, "grad_norm": 0.022963495925068855, "learning_rate": 2.1295617068658685e-05, "loss": 0.0055, "step": 5069 }, { "epoch": 3.624664879356568, "grad_norm": 0.016468141227960587, "learning_rate": 2.1275176230212568e-05, "loss": 0.0047, "step": 5070 }, { "epoch": 3.624664879356568, "eval_loss": 0.007901350036263466, "eval_runtime": 4.5789, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 5070 }, { "epoch": 3.625379803395889, "grad_norm": 0.021827371791005135, "learning_rate": 2.125474255514232e-05, "loss": 0.0051, "step": 5071 }, { "epoch": 3.62609472743521, "grad_norm": 0.022189119830727577, "learning_rate": 2.123431604854365e-05, "loss": 0.0075, "step": 5072 }, { "epoch": 3.626809651474531, "grad_norm": 0.024582337588071823, "learning_rate": 2.121389671551054e-05, "loss": 0.0071, "step": 5073 }, { "epoch": 3.627524575513852, "grad_norm": 0.022601287811994553, "learning_rate": 2.1193484561135095e-05, "loss": 0.0053, "step": 5074 }, { "epoch": 3.6282394995531724, "grad_norm": 0.03423520550131798, "learning_rate": 2.1173079590507733e-05, "loss": 0.0139, "step": 5075 }, { "epoch": 3.6282394995531724, "eval_loss": 0.00792184378951788, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 5075 }, { "epoch": 3.6289544235924933, "grad_norm": 0.02627391368150711, "learning_rate": 2.1152681808717035e-05, "loss": 0.0062, "step": 5076 }, { "epoch": 3.6296693476318143, "grad_norm": 0.025708032771945, "learning_rate": 2.1132291220849743e-05, "loss": 0.0062, "step": 5077 }, { "epoch": 3.6303842716711348, "grad_norm": 0.02009611763060093, "learning_rate": 2.1111907831990892e-05, "loss": 0.0063, "step": 5078 }, { "epoch": 3.6310991957104557, "grad_norm": 0.025348937138915062, "learning_rate": 2.1091531647223632e-05, "loss": 0.006, "step": 5079 }, { "epoch": 3.6318141197497766, "grad_norm": 0.01819893904030323, "learning_rate": 2.1071162671629387e-05, "loss": 0.0065, "step": 5080 }, { "epoch": 3.6318141197497766, "eval_loss": 0.007892213761806488, "eval_runtime": 4.5794, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 5080 }, { "epoch": 3.6325290437890976, "grad_norm": 0.022117160260677338, "learning_rate": 2.105080091028777e-05, "loss": 0.0095, "step": 5081 }, { "epoch": 3.633243967828418, "grad_norm": 0.027222631499171257, "learning_rate": 2.1030446368276546e-05, "loss": 0.0068, "step": 5082 }, { "epoch": 3.633958891867739, "grad_norm": 0.016917865723371506, "learning_rate": 2.1010099050671745e-05, "loss": 0.0047, "step": 5083 }, { "epoch": 3.63467381590706, "grad_norm": 0.015480107627809048, "learning_rate": 2.098975896254757e-05, "loss": 0.004, "step": 5084 }, { "epoch": 3.6353887399463805, "grad_norm": 0.025099512189626694, "learning_rate": 2.0969426108976387e-05, "loss": 0.0062, "step": 5085 }, { "epoch": 3.6353887399463805, "eval_loss": 0.007870346307754517, "eval_runtime": 4.6088, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 5085 }, { "epoch": 3.6361036639857014, "grad_norm": 0.017278289422392845, "learning_rate": 2.094910049502882e-05, "loss": 0.0043, "step": 5086 }, { "epoch": 3.6368185880250223, "grad_norm": 0.02623290941119194, "learning_rate": 
2.092878212577363e-05, "loss": 0.009, "step": 5087 }, { "epoch": 3.6375335120643433, "grad_norm": 0.03126358240842819, "learning_rate": 2.0908471006277814e-05, "loss": 0.0135, "step": 5088 }, { "epoch": 3.638248436103664, "grad_norm": 0.023255426436662674, "learning_rate": 2.0888167141606556e-05, "loss": 0.0052, "step": 5089 }, { "epoch": 3.6389633601429847, "grad_norm": 0.02576283924281597, "learning_rate": 2.0867870536823185e-05, "loss": 0.0073, "step": 5090 }, { "epoch": 3.6389633601429847, "eval_loss": 0.007959332317113876, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5090 }, { "epoch": 3.6396782841823057, "grad_norm": 0.02490738034248352, "learning_rate": 2.0847581196989274e-05, "loss": 0.0064, "step": 5091 }, { "epoch": 3.6403932082216266, "grad_norm": 0.029702048748731613, "learning_rate": 2.0827299127164572e-05, "loss": 0.0111, "step": 5092 }, { "epoch": 3.641108132260947, "grad_norm": 0.024499917402863503, "learning_rate": 2.080702433240699e-05, "loss": 0.01, "step": 5093 }, { "epoch": 3.641823056300268, "grad_norm": 0.02145523577928543, "learning_rate": 2.078675681777264e-05, "loss": 0.0054, "step": 5094 }, { "epoch": 3.642537980339589, "grad_norm": 0.02153155580163002, "learning_rate": 2.0766496588315852e-05, "loss": 0.0067, "step": 5095 }, { "epoch": 3.642537980339589, "eval_loss": 0.008068427443504333, "eval_runtime": 4.6068, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 5095 }, { "epoch": 3.64325290437891, "grad_norm": 0.02646663971245289, "learning_rate": 2.0746243649089065e-05, "loss": 0.0069, "step": 5096 }, { "epoch": 3.6439678284182304, "grad_norm": 0.02155366539955139, "learning_rate": 2.072599800514296e-05, "loss": 0.0052, "step": 5097 }, { "epoch": 3.6446827524575514, "grad_norm": 0.027583716437220573, "learning_rate": 2.0705759661526385e-05, "loss": 0.0074, "step": 5098 }, { "epoch": 3.6453976764968723, "grad_norm": 0.02679644711315632, "learning_rate": 2.0685528623286365e-05, "loss": 0.0085, "step": 5099 }, { "epoch": 3.646112600536193, "grad_norm": 0.022217417135834694, "learning_rate": 2.0665304895468112e-05, "loss": 0.0053, "step": 5100 }, { "epoch": 3.646112600536193, "eval_loss": 0.008070986717939377, "eval_runtime": 4.5973, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 5100 }, { "epoch": 3.6468275245755137, "grad_norm": 0.029209226369857788, "learning_rate": 2.0645088483114973e-05, "loss": 0.0089, "step": 5101 }, { "epoch": 3.6475424486148347, "grad_norm": 0.02241871878504753, "learning_rate": 2.0624879391268536e-05, "loss": 0.0048, "step": 5102 }, { "epoch": 3.6482573726541556, "grad_norm": 0.026625856757164, "learning_rate": 2.0604677624968498e-05, "loss": 0.0088, "step": 5103 }, { "epoch": 3.6489722966934766, "grad_norm": 0.022339167073369026, "learning_rate": 2.0584483189252777e-05, "loss": 0.0079, "step": 5104 }, { "epoch": 3.649687220732797, "grad_norm": 0.01977895013988018, "learning_rate": 2.056429608915747e-05, "loss": 0.0057, "step": 5105 }, { "epoch": 3.649687220732797, "eval_loss": 0.008043388836085796, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5105 }, { "epoch": 3.650402144772118, "grad_norm": 0.03173450380563736, "learning_rate": 2.0544116329716773e-05, "loss": 0.0088, "step": 5106 }, { "epoch": 3.651117068811439, "grad_norm": 0.02054230310022831, "learning_rate": 2.052394391596313e-05, "loss": 0.0051, "step": 5107 }, { "epoch": 3.6518319928507594, "grad_norm": 
0.022182419896125793, "learning_rate": 2.0503778852927136e-05, "loss": 0.0076, "step": 5108 }, { "epoch": 3.6525469168900804, "grad_norm": 0.02111845090985298, "learning_rate": 2.04836211456375e-05, "loss": 0.0052, "step": 5109 }, { "epoch": 3.6532618409294013, "grad_norm": 0.02073364146053791, "learning_rate": 2.0463470799121175e-05, "loss": 0.0063, "step": 5110 }, { "epoch": 3.6532618409294013, "eval_loss": 0.008014487102627754, "eval_runtime": 4.5797, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 5110 }, { "epoch": 3.6539767649687223, "grad_norm": 0.024625863879919052, "learning_rate": 2.0443327818403212e-05, "loss": 0.006, "step": 5111 }, { "epoch": 3.6546916890080428, "grad_norm": 0.025025425478816032, "learning_rate": 2.042319220850686e-05, "loss": 0.0066, "step": 5112 }, { "epoch": 3.6554066130473637, "grad_norm": 0.02355380728840828, "learning_rate": 2.0403063974453547e-05, "loss": 0.0086, "step": 5113 }, { "epoch": 3.6561215370866846, "grad_norm": 0.02606126479804516, "learning_rate": 2.0382943121262797e-05, "loss": 0.0097, "step": 5114 }, { "epoch": 3.656836461126005, "grad_norm": 0.02769467607140541, "learning_rate": 2.036282965395236e-05, "loss": 0.0108, "step": 5115 }, { "epoch": 3.656836461126005, "eval_loss": 0.008077939040958881, "eval_runtime": 4.6107, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.82, "step": 5115 }, { "epoch": 3.657551385165326, "grad_norm": 0.02526829205453396, "learning_rate": 2.0342723577538126e-05, "loss": 0.0073, "step": 5116 }, { "epoch": 3.658266309204647, "grad_norm": 0.030082592740654945, "learning_rate": 2.0322624897034105e-05, "loss": 0.0068, "step": 5117 }, { "epoch": 3.658981233243968, "grad_norm": 0.029148241505026817, "learning_rate": 2.0302533617452508e-05, "loss": 0.0087, "step": 5118 }, { "epoch": 3.659696157283289, "grad_norm": 0.02527160756289959, "learning_rate": 2.028244974380368e-05, "loss": 0.0116, "step": 5119 }, { "epoch": 3.6604110813226094, "grad_norm": 0.03384966030716896, "learning_rate": 2.026237328109613e-05, "loss": 0.0116, "step": 5120 }, { "epoch": 3.6604110813226094, "eval_loss": 0.007936274632811546, "eval_runtime": 4.5761, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.841, "step": 5120 }, { "epoch": 3.6611260053619303, "grad_norm": 0.02657047100365162, "learning_rate": 2.0242304234336522e-05, "loss": 0.0078, "step": 5121 }, { "epoch": 3.6618409294012513, "grad_norm": 0.020186729729175568, "learning_rate": 2.0222242608529627e-05, "loss": 0.0047, "step": 5122 }, { "epoch": 3.6625558534405718, "grad_norm": 0.025750601664185524, "learning_rate": 2.020218840867842e-05, "loss": 0.0077, "step": 5123 }, { "epoch": 3.6632707774798927, "grad_norm": 0.022980842739343643, "learning_rate": 2.018214163978402e-05, "loss": 0.0072, "step": 5124 }, { "epoch": 3.6639857015192137, "grad_norm": 0.01839456334710121, "learning_rate": 2.0162102306845637e-05, "loss": 0.0047, "step": 5125 }, { "epoch": 3.6639857015192137, "eval_loss": 0.007932187058031559, "eval_runtime": 4.5813, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5125 }, { "epoch": 3.6647006255585346, "grad_norm": 0.02264019101858139, "learning_rate": 2.0142070414860704e-05, "loss": 0.0053, "step": 5126 }, { "epoch": 3.665415549597855, "grad_norm": 0.023247284814715385, "learning_rate": 2.0122045968824723e-05, "loss": 0.0086, "step": 5127 }, { "epoch": 3.666130473637176, "grad_norm": 0.024437887594103813, "learning_rate": 2.0102028973731392e-05, "loss": 0.0073, "step": 5128 }, { "epoch": 
3.666845397676497, "grad_norm": 0.02498762682080269, "learning_rate": 2.008201943457255e-05, "loss": 0.0088, "step": 5129 }, { "epoch": 3.6675603217158175, "grad_norm": 0.02149789221584797, "learning_rate": 2.0062017356338137e-05, "loss": 0.0051, "step": 5130 }, { "epoch": 3.6675603217158175, "eval_loss": 0.00786600075662136, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5130 }, { "epoch": 3.6682752457551384, "grad_norm": 0.02330482006072998, "learning_rate": 2.0042022744016264e-05, "loss": 0.0048, "step": 5131 }, { "epoch": 3.6689901697944594, "grad_norm": 0.022045858204364777, "learning_rate": 2.00220356025932e-05, "loss": 0.0062, "step": 5132 }, { "epoch": 3.6697050938337803, "grad_norm": 0.026230178773403168, "learning_rate": 2.0002055937053278e-05, "loss": 0.0068, "step": 5133 }, { "epoch": 3.6704200178731012, "grad_norm": 0.028924329206347466, "learning_rate": 1.9982083752379048e-05, "loss": 0.0106, "step": 5134 }, { "epoch": 3.6711349419124217, "grad_norm": 0.023955339565873146, "learning_rate": 1.9962119053551136e-05, "loss": 0.0067, "step": 5135 }, { "epoch": 3.6711349419124217, "eval_loss": 0.007804023567587137, "eval_runtime": 4.5789, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 5135 }, { "epoch": 3.6718498659517427, "grad_norm": 0.023266678676009178, "learning_rate": 1.9942161845548336e-05, "loss": 0.0065, "step": 5136 }, { "epoch": 3.672564789991063, "grad_norm": 0.02235720306634903, "learning_rate": 1.9922212133347574e-05, "loss": 0.0078, "step": 5137 }, { "epoch": 3.673279714030384, "grad_norm": 0.0320582389831543, "learning_rate": 1.9902269921923866e-05, "loss": 0.0089, "step": 5138 }, { "epoch": 3.673994638069705, "grad_norm": 0.025415511801838875, "learning_rate": 1.98823352162504e-05, "loss": 0.0079, "step": 5139 }, { "epoch": 3.674709562109026, "grad_norm": 0.02677248790860176, "learning_rate": 1.98624080212985e-05, "loss": 0.005, "step": 5140 }, { "epoch": 3.674709562109026, "eval_loss": 0.007821472361683846, "eval_runtime": 4.5794, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 5140 }, { "epoch": 3.675424486148347, "grad_norm": 0.024197416380047798, "learning_rate": 1.984248834203754e-05, "loss": 0.0049, "step": 5141 }, { "epoch": 3.6761394101876674, "grad_norm": 0.020101070404052734, "learning_rate": 1.982257618343515e-05, "loss": 0.0045, "step": 5142 }, { "epoch": 3.6768543342269884, "grad_norm": 0.024932894855737686, "learning_rate": 1.9802671550456948e-05, "loss": 0.0067, "step": 5143 }, { "epoch": 3.6775692582663093, "grad_norm": 0.025798529386520386, "learning_rate": 1.978277444806676e-05, "loss": 0.0078, "step": 5144 }, { "epoch": 3.67828418230563, "grad_norm": 0.025381896644830704, "learning_rate": 1.9762884881226535e-05, "loss": 0.0065, "step": 5145 }, { "epoch": 3.67828418230563, "eval_loss": 0.007868722081184387, "eval_runtime": 4.5847, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 5145 }, { "epoch": 3.6789991063449508, "grad_norm": 0.02904551662504673, "learning_rate": 1.974300285489627e-05, "loss": 0.0112, "step": 5146 }, { "epoch": 3.6797140303842717, "grad_norm": 0.02891986072063446, "learning_rate": 1.972312837403416e-05, "loss": 0.0102, "step": 5147 }, { "epoch": 3.6804289544235926, "grad_norm": 0.025322461500763893, "learning_rate": 1.97032614435965e-05, "loss": 0.0054, "step": 5148 }, { "epoch": 3.6811438784629136, "grad_norm": 0.02467629872262478, "learning_rate": 1.9683402068537655e-05, "loss": 0.0088, "step": 
5149 }, { "epoch": 3.681858802502234, "grad_norm": 0.030663693323731422, "learning_rate": 1.9663550253810183e-05, "loss": 0.0085, "step": 5150 }, { "epoch": 3.681858802502234, "eval_loss": 0.007848748005926609, "eval_runtime": 4.593, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 5150 }, { "epoch": 3.682573726541555, "grad_norm": 0.02538185380399227, "learning_rate": 1.9643706004364675e-05, "loss": 0.0089, "step": 5151 }, { "epoch": 3.6832886505808755, "grad_norm": 0.024363776668906212, "learning_rate": 1.962386932514989e-05, "loss": 0.0052, "step": 5152 }, { "epoch": 3.6840035746201965, "grad_norm": 0.023206720128655434, "learning_rate": 1.9604040221112713e-05, "loss": 0.0058, "step": 5153 }, { "epoch": 3.6847184986595174, "grad_norm": 0.027367260307073593, "learning_rate": 1.958421869719807e-05, "loss": 0.0093, "step": 5154 }, { "epoch": 3.6854334226988383, "grad_norm": 0.020794346928596497, "learning_rate": 1.9564404758349055e-05, "loss": 0.0044, "step": 5155 }, { "epoch": 3.6854334226988383, "eval_loss": 0.007719523273408413, "eval_runtime": 4.5815, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5155 }, { "epoch": 3.6861483467381593, "grad_norm": 0.022477753460407257, "learning_rate": 1.954459840950687e-05, "loss": 0.006, "step": 5156 }, { "epoch": 3.6868632707774798, "grad_norm": 0.028231970965862274, "learning_rate": 1.9524799655610776e-05, "loss": 0.0076, "step": 5157 }, { "epoch": 3.6875781948168007, "grad_norm": 0.031128041446208954, "learning_rate": 1.9505008501598206e-05, "loss": 0.0122, "step": 5158 }, { "epoch": 3.6882931188561217, "grad_norm": 0.02811504900455475, "learning_rate": 1.948522495240463e-05, "loss": 0.0071, "step": 5159 }, { "epoch": 3.689008042895442, "grad_norm": 0.031231572851538658, "learning_rate": 1.9465449012963672e-05, "loss": 0.0095, "step": 5160 }, { "epoch": 3.689008042895442, "eval_loss": 0.007687387056648731, "eval_runtime": 4.5804, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5160 }, { "epoch": 3.689722966934763, "grad_norm": 0.023641172796487808, "learning_rate": 1.9445680688207064e-05, "loss": 0.0089, "step": 5161 }, { "epoch": 3.690437890974084, "grad_norm": 0.02312302030622959, "learning_rate": 1.9425919983064574e-05, "loss": 0.0071, "step": 5162 }, { "epoch": 3.691152815013405, "grad_norm": 0.02438041940331459, "learning_rate": 1.940616690246413e-05, "loss": 0.0065, "step": 5163 }, { "epoch": 3.691867739052726, "grad_norm": 0.027242131531238556, "learning_rate": 1.9386421451331737e-05, "loss": 0.0069, "step": 5164 }, { "epoch": 3.6925826630920464, "grad_norm": 0.02106199413537979, "learning_rate": 1.936668363459152e-05, "loss": 0.0048, "step": 5165 }, { "epoch": 3.6925826630920464, "eval_loss": 0.007784338667988777, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5165 }, { "epoch": 3.6932975871313674, "grad_norm": 0.022565808147192, "learning_rate": 1.934695345716568e-05, "loss": 0.004, "step": 5166 }, { "epoch": 3.694012511170688, "grad_norm": 0.09167232364416122, "learning_rate": 1.932723092397449e-05, "loss": 0.0055, "step": 5167 }, { "epoch": 3.694727435210009, "grad_norm": 0.02700038067996502, "learning_rate": 1.930751603993635e-05, "loss": 0.0104, "step": 5168 }, { "epoch": 3.6954423592493297, "grad_norm": 0.03201116994023323, "learning_rate": 1.9287808809967772e-05, "loss": 0.0063, "step": 5169 }, { "epoch": 3.6961572832886507, "grad_norm": 0.02118769660592079, "learning_rate": 1.9268109238983282e-05, 
"loss": 0.0045, "step": 5170 }, { "epoch": 3.6961572832886507, "eval_loss": 0.007785717956721783, "eval_runtime": 4.5779, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 5170 }, { "epoch": 3.6968722073279716, "grad_norm": 0.021378299221396446, "learning_rate": 1.924841733189558e-05, "loss": 0.0045, "step": 5171 }, { "epoch": 3.697587131367292, "grad_norm": 0.026579178869724274, "learning_rate": 1.922873309361542e-05, "loss": 0.0066, "step": 5172 }, { "epoch": 3.698302055406613, "grad_norm": 0.024132726714015007, "learning_rate": 1.9209056529051617e-05, "loss": 0.0065, "step": 5173 }, { "epoch": 3.699016979445934, "grad_norm": 0.021876949816942215, "learning_rate": 1.9189387643111135e-05, "loss": 0.0076, "step": 5174 }, { "epoch": 3.6997319034852545, "grad_norm": 0.027622919529676437, "learning_rate": 1.9169726440698942e-05, "loss": 0.0103, "step": 5175 }, { "epoch": 3.6997319034852545, "eval_loss": 0.007793270517140627, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5175 }, { "epoch": 3.7004468275245754, "grad_norm": 0.02247031405568123, "learning_rate": 1.9150072926718166e-05, "loss": 0.0051, "step": 5176 }, { "epoch": 3.7011617515638964, "grad_norm": 0.024038033559918404, "learning_rate": 1.9130427106069992e-05, "loss": 0.0065, "step": 5177 }, { "epoch": 3.7018766756032173, "grad_norm": 0.024044005200266838, "learning_rate": 1.911078898365365e-05, "loss": 0.0054, "step": 5178 }, { "epoch": 3.7025915996425383, "grad_norm": 0.019593972712755203, "learning_rate": 1.9091158564366502e-05, "loss": 0.0045, "step": 5179 }, { "epoch": 3.7033065236818588, "grad_norm": 0.03807418793439865, "learning_rate": 1.907153585310398e-05, "loss": 0.0061, "step": 5180 }, { "epoch": 3.7033065236818588, "eval_loss": 0.007802205625921488, "eval_runtime": 4.5978, "eval_samples_per_second": 10.875, "eval_steps_per_second": 2.827, "step": 5180 }, { "epoch": 3.7040214477211797, "grad_norm": 0.02779831364750862, "learning_rate": 1.9051920854759542e-05, "loss": 0.009, "step": 5181 }, { "epoch": 3.7047363717605, "grad_norm": 0.026649249717593193, "learning_rate": 1.9032313574224807e-05, "loss": 0.0064, "step": 5182 }, { "epoch": 3.705451295799821, "grad_norm": 0.03039734996855259, "learning_rate": 1.901271401638939e-05, "loss": 0.011, "step": 5183 }, { "epoch": 3.706166219839142, "grad_norm": 0.02727808617055416, "learning_rate": 1.8993122186141023e-05, "loss": 0.0053, "step": 5184 }, { "epoch": 3.706881143878463, "grad_norm": 0.029461830854415894, "learning_rate": 1.8973538088365507e-05, "loss": 0.0071, "step": 5185 }, { "epoch": 3.706881143878463, "eval_loss": 0.007761786226183176, "eval_runtime": 4.5926, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 5185 }, { "epoch": 3.707596067917784, "grad_norm": 0.027248507365584373, "learning_rate": 1.8953961727946707e-05, "loss": 0.0064, "step": 5186 }, { "epoch": 3.7083109919571045, "grad_norm": 0.03202877938747406, "learning_rate": 1.893439310976659e-05, "loss": 0.0084, "step": 5187 }, { "epoch": 3.7090259159964254, "grad_norm": 0.029406610876321793, "learning_rate": 1.8914832238705114e-05, "loss": 0.0078, "step": 5188 }, { "epoch": 3.7097408400357463, "grad_norm": 0.027089552953839302, "learning_rate": 1.8895279119640386e-05, "loss": 0.0057, "step": 5189 }, { "epoch": 3.710455764075067, "grad_norm": 0.02513948082923889, "learning_rate": 1.887573375744856e-05, "loss": 0.0054, "step": 5190 }, { "epoch": 3.710455764075067, "eval_loss": 0.007843445055186749, 
"eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 5190 }, { "epoch": 3.7111706881143878, "grad_norm": 0.02721596695482731, "learning_rate": 1.8856196157003808e-05, "loss": 0.0076, "step": 5191 }, { "epoch": 3.7118856121537087, "grad_norm": 0.02052859403192997, "learning_rate": 1.883666632317842e-05, "loss": 0.0043, "step": 5192 }, { "epoch": 3.7126005361930297, "grad_norm": 0.023575808852910995, "learning_rate": 1.8817144260842756e-05, "loss": 0.005, "step": 5193 }, { "epoch": 3.71331546023235, "grad_norm": 0.028572428971529007, "learning_rate": 1.879762997486517e-05, "loss": 0.0078, "step": 5194 }, { "epoch": 3.714030384271671, "grad_norm": 0.026849515736103058, "learning_rate": 1.877812347011214e-05, "loss": 0.0056, "step": 5195 }, { "epoch": 3.714030384271671, "eval_loss": 0.007771648932248354, "eval_runtime": 4.5808, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5195 }, { "epoch": 3.714745308310992, "grad_norm": 0.024615740403532982, "learning_rate": 1.8758624751448213e-05, "loss": 0.0044, "step": 5196 }, { "epoch": 3.7154602323503125, "grad_norm": 0.035229649394750595, "learning_rate": 1.873913382373591e-05, "loss": 0.0121, "step": 5197 }, { "epoch": 3.7161751563896335, "grad_norm": 0.026173364371061325, "learning_rate": 1.8719650691835914e-05, "loss": 0.0088, "step": 5198 }, { "epoch": 3.7168900804289544, "grad_norm": 0.02132599800825119, "learning_rate": 1.8700175360606882e-05, "loss": 0.0066, "step": 5199 }, { "epoch": 3.7176050044682754, "grad_norm": 0.028367837890982628, "learning_rate": 1.8680707834905565e-05, "loss": 0.0087, "step": 5200 }, { "epoch": 3.7176050044682754, "eval_loss": 0.007685234770178795, "eval_runtime": 4.6038, "eval_samples_per_second": 10.861, "eval_steps_per_second": 2.824, "step": 5200 }, { "epoch": 3.7183199285075963, "grad_norm": 0.01927781105041504, "learning_rate": 1.8661248119586783e-05, "loss": 0.0042, "step": 5201 }, { "epoch": 3.719034852546917, "grad_norm": 0.021039379760622978, "learning_rate": 1.8641796219503348e-05, "loss": 0.0043, "step": 5202 }, { "epoch": 3.7197497765862377, "grad_norm": 0.02869022823870182, "learning_rate": 1.8622352139506183e-05, "loss": 0.0079, "step": 5203 }, { "epoch": 3.7204647006255587, "grad_norm": 0.045762475579977036, "learning_rate": 1.8602915884444257e-05, "loss": 0.0059, "step": 5204 }, { "epoch": 3.721179624664879, "grad_norm": 0.024738233536481857, "learning_rate": 1.8583487459164526e-05, "loss": 0.0069, "step": 5205 }, { "epoch": 3.721179624664879, "eval_loss": 0.007690598256886005, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5205 }, { "epoch": 3.7218945487042, "grad_norm": 0.02672836370766163, "learning_rate": 1.856406686851208e-05, "loss": 0.0074, "step": 5206 }, { "epoch": 3.722609472743521, "grad_norm": 0.022381914779543877, "learning_rate": 1.8544654117329957e-05, "loss": 0.0071, "step": 5207 }, { "epoch": 3.723324396782842, "grad_norm": 0.02863289974629879, "learning_rate": 1.8525249210459343e-05, "loss": 0.0089, "step": 5208 }, { "epoch": 3.7240393208221625, "grad_norm": 0.024373216554522514, "learning_rate": 1.850585215273942e-05, "loss": 0.0072, "step": 5209 }, { "epoch": 3.7247542448614834, "grad_norm": 0.026654919609427452, "learning_rate": 1.848646294900739e-05, "loss": 0.0105, "step": 5210 }, { "epoch": 3.7247542448614834, "eval_loss": 0.007624953053891659, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5210 }, { 
"epoch": 3.7254691689008044, "grad_norm": 0.02832888811826706, "learning_rate": 1.846708160409854e-05, "loss": 0.0079, "step": 5211 }, { "epoch": 3.726184092940125, "grad_norm": 0.029846368357539177, "learning_rate": 1.8447708122846145e-05, "loss": 0.0061, "step": 5212 }, { "epoch": 3.726899016979446, "grad_norm": 0.02452557161450386, "learning_rate": 1.842834251008157e-05, "loss": 0.0084, "step": 5213 }, { "epoch": 3.7276139410187668, "grad_norm": 0.019373605027794838, "learning_rate": 1.8408984770634206e-05, "loss": 0.0042, "step": 5214 }, { "epoch": 3.7283288650580877, "grad_norm": 0.02879202924668789, "learning_rate": 1.838963490933145e-05, "loss": 0.0077, "step": 5215 }, { "epoch": 3.7283288650580877, "eval_loss": 0.007610850501805544, "eval_runtime": 4.6147, "eval_samples_per_second": 10.835, "eval_steps_per_second": 2.817, "step": 5215 }, { "epoch": 3.7290437890974086, "grad_norm": 0.02819385565817356, "learning_rate": 1.837029293099877e-05, "loss": 0.009, "step": 5216 }, { "epoch": 3.729758713136729, "grad_norm": 0.025626981630921364, "learning_rate": 1.8350958840459666e-05, "loss": 0.0053, "step": 5217 }, { "epoch": 3.73047363717605, "grad_norm": 0.025912068784236908, "learning_rate": 1.8331632642535623e-05, "loss": 0.0061, "step": 5218 }, { "epoch": 3.731188561215371, "grad_norm": 0.02409983240067959, "learning_rate": 1.831231434204622e-05, "loss": 0.0059, "step": 5219 }, { "epoch": 3.7319034852546915, "grad_norm": 0.02414947748184204, "learning_rate": 1.829300394380906e-05, "loss": 0.0047, "step": 5220 }, { "epoch": 3.7319034852546915, "eval_loss": 0.007556145079433918, "eval_runtime": 4.6111, "eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 5220 }, { "epoch": 3.7326184092940125, "grad_norm": 0.020029651001095772, "learning_rate": 1.8273701452639713e-05, "loss": 0.0067, "step": 5221 }, { "epoch": 3.7333333333333334, "grad_norm": 0.02894645556807518, "learning_rate": 1.825440687335186e-05, "loss": 0.0123, "step": 5222 }, { "epoch": 3.7340482573726543, "grad_norm": 0.0262465737760067, "learning_rate": 1.8235120210757134e-05, "loss": 0.0076, "step": 5223 }, { "epoch": 3.734763181411975, "grad_norm": 0.0286286398768425, "learning_rate": 1.8215841469665246e-05, "loss": 0.007, "step": 5224 }, { "epoch": 3.7354781054512958, "grad_norm": 0.02856992930173874, "learning_rate": 1.8196570654883932e-05, "loss": 0.0058, "step": 5225 }, { "epoch": 3.7354781054512958, "eval_loss": 0.007613189518451691, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5225 }, { "epoch": 3.7361930294906167, "grad_norm": 0.02514106035232544, "learning_rate": 1.8177307771218893e-05, "loss": 0.0073, "step": 5226 }, { "epoch": 3.736907953529937, "grad_norm": 0.02125760167837143, "learning_rate": 1.8158052823473925e-05, "loss": 0.0058, "step": 5227 }, { "epoch": 3.737622877569258, "grad_norm": 0.027004871517419815, "learning_rate": 1.8138805816450816e-05, "loss": 0.0079, "step": 5228 }, { "epoch": 3.738337801608579, "grad_norm": 0.035509899258613586, "learning_rate": 1.8119566754949324e-05, "loss": 0.0074, "step": 5229 }, { "epoch": 3.7390527256479, "grad_norm": 0.026962030678987503, "learning_rate": 1.8100335643767347e-05, "loss": 0.0068, "step": 5230 }, { "epoch": 3.7390527256479, "eval_loss": 0.007621945813298225, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5230 }, { "epoch": 3.739767649687221, "grad_norm": 0.026385527104139328, "learning_rate": 1.8081112487700664e-05, "loss": 0.0053, 
"step": 5231 }, { "epoch": 3.7404825737265415, "grad_norm": 0.026231463998556137, "learning_rate": 1.8061897291543156e-05, "loss": 0.0085, "step": 5232 }, { "epoch": 3.7411974977658624, "grad_norm": 0.028148196637630463, "learning_rate": 1.804269006008671e-05, "loss": 0.0086, "step": 5233 }, { "epoch": 3.7419124218051834, "grad_norm": 0.021042097359895706, "learning_rate": 1.802349079812118e-05, "loss": 0.0046, "step": 5234 }, { "epoch": 3.742627345844504, "grad_norm": 0.022062746807932854, "learning_rate": 1.8004299510434492e-05, "loss": 0.0082, "step": 5235 }, { "epoch": 3.742627345844504, "eval_loss": 0.007697477005422115, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5235 }, { "epoch": 3.743342269883825, "grad_norm": 0.021183345466852188, "learning_rate": 1.7985116201812524e-05, "loss": 0.0057, "step": 5236 }, { "epoch": 3.7440571939231457, "grad_norm": 0.02468756213784218, "learning_rate": 1.796594087703921e-05, "loss": 0.0111, "step": 5237 }, { "epoch": 3.7447721179624667, "grad_norm": 0.023392124101519585, "learning_rate": 1.7946773540896506e-05, "loss": 0.0083, "step": 5238 }, { "epoch": 3.745487042001787, "grad_norm": 0.020532092079520226, "learning_rate": 1.7927614198164306e-05, "loss": 0.005, "step": 5239 }, { "epoch": 3.746201966041108, "grad_norm": 0.02145230583846569, "learning_rate": 1.7908462853620566e-05, "loss": 0.0046, "step": 5240 }, { "epoch": 3.746201966041108, "eval_loss": 0.0077255298383533955, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 5240 }, { "epoch": 3.746916890080429, "grad_norm": 0.0285240076482296, "learning_rate": 1.7889319512041263e-05, "loss": 0.0097, "step": 5241 }, { "epoch": 3.7476318141197495, "grad_norm": 0.0187169648706913, "learning_rate": 1.787018417820031e-05, "loss": 0.0044, "step": 5242 }, { "epoch": 3.7483467381590705, "grad_norm": 0.02526545710861683, "learning_rate": 1.785105685686968e-05, "loss": 0.0076, "step": 5243 }, { "epoch": 3.7490616621983914, "grad_norm": 0.01837155781686306, "learning_rate": 1.7831937552819345e-05, "loss": 0.0038, "step": 5244 }, { "epoch": 3.7497765862377124, "grad_norm": 0.025229187682271004, "learning_rate": 1.7812826270817227e-05, "loss": 0.0051, "step": 5245 }, { "epoch": 3.7497765862377124, "eval_loss": 0.007704653777182102, "eval_runtime": 4.5928, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 5245 }, { "epoch": 3.7504915102770333, "grad_norm": 0.028891203925013542, "learning_rate": 1.779372301562933e-05, "loss": 0.0094, "step": 5246 }, { "epoch": 3.751206434316354, "grad_norm": 0.029974140226840973, "learning_rate": 1.7774627792019565e-05, "loss": 0.0106, "step": 5247 }, { "epoch": 3.7519213583556748, "grad_norm": 0.02326284348964691, "learning_rate": 1.775554060474991e-05, "loss": 0.0055, "step": 5248 }, { "epoch": 3.7526362823949957, "grad_norm": 0.028750866651535034, "learning_rate": 1.7736461458580323e-05, "loss": 0.0058, "step": 5249 }, { "epoch": 3.753351206434316, "grad_norm": 0.03037985786795616, "learning_rate": 1.7717390358268716e-05, "loss": 0.0058, "step": 5250 }, { "epoch": 3.753351206434316, "eval_loss": 0.007712868042290211, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5250 }, { "epoch": 3.754066130473637, "grad_norm": 0.020772039890289307, "learning_rate": 1.7698327308571043e-05, "loss": 0.004, "step": 5251 }, { "epoch": 3.754781054512958, "grad_norm": 0.0257219560444355, "learning_rate": 
1.767927231424124e-05, "loss": 0.0087, "step": 5252 }, { "epoch": 3.755495978552279, "grad_norm": 0.025873303413391113, "learning_rate": 1.766022538003122e-05, "loss": 0.0074, "step": 5253 }, { "epoch": 3.7562109025915995, "grad_norm": 0.029224423691630363, "learning_rate": 1.7641186510690917e-05, "loss": 0.01, "step": 5254 }, { "epoch": 3.7569258266309205, "grad_norm": 0.028409412130713463, "learning_rate": 1.7622155710968186e-05, "loss": 0.0087, "step": 5255 }, { "epoch": 3.7569258266309205, "eval_loss": 0.007787346839904785, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5255 }, { "epoch": 3.7576407506702414, "grad_norm": 0.02364375628530979, "learning_rate": 1.7603132985608945e-05, "loss": 0.004, "step": 5256 }, { "epoch": 3.758355674709562, "grad_norm": 0.03370967134833336, "learning_rate": 1.7584118339357075e-05, "loss": 0.0061, "step": 5257 }, { "epoch": 3.759070598748883, "grad_norm": 0.02827250398695469, "learning_rate": 1.75651117769544e-05, "loss": 0.006, "step": 5258 }, { "epoch": 3.7597855227882038, "grad_norm": 0.028120527043938637, "learning_rate": 1.7546113303140803e-05, "loss": 0.0101, "step": 5259 }, { "epoch": 3.7605004468275247, "grad_norm": 0.026773259043693542, "learning_rate": 1.7527122922654077e-05, "loss": 0.0092, "step": 5260 }, { "epoch": 3.7605004468275247, "eval_loss": 0.007751609664410353, "eval_runtime": 4.5958, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.829, "step": 5260 }, { "epoch": 3.7612153708668457, "grad_norm": 0.022461531683802605, "learning_rate": 1.7508140640230035e-05, "loss": 0.0052, "step": 5261 }, { "epoch": 3.761930294906166, "grad_norm": 0.031022191047668457, "learning_rate": 1.7489166460602495e-05, "loss": 0.0118, "step": 5262 }, { "epoch": 3.762645218945487, "grad_norm": 0.03500255569815636, "learning_rate": 1.7470200388503182e-05, "loss": 0.0098, "step": 5263 }, { "epoch": 3.7633601429848076, "grad_norm": 0.025805287063121796, "learning_rate": 1.7451242428661868e-05, "loss": 0.0051, "step": 5264 }, { "epoch": 3.7640750670241285, "grad_norm": 0.022367140278220177, "learning_rate": 1.7432292585806276e-05, "loss": 0.0055, "step": 5265 }, { "epoch": 3.7640750670241285, "eval_loss": 0.007667801342904568, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 5265 }, { "epoch": 3.7647899910634495, "grad_norm": 0.03336739167571068, "learning_rate": 1.7413350864662086e-05, "loss": 0.0106, "step": 5266 }, { "epoch": 3.7655049151027704, "grad_norm": 0.035946521908044815, "learning_rate": 1.739441726995298e-05, "loss": 0.0102, "step": 5267 }, { "epoch": 3.7662198391420914, "grad_norm": 0.026191163808107376, "learning_rate": 1.7375491806400616e-05, "loss": 0.0105, "step": 5268 }, { "epoch": 3.766934763181412, "grad_norm": 0.03380187600851059, "learning_rate": 1.7356574478724592e-05, "loss": 0.0102, "step": 5269 }, { "epoch": 3.767649687220733, "grad_norm": 0.025834698230028152, "learning_rate": 1.7337665291642523e-05, "loss": 0.0042, "step": 5270 }, { "epoch": 3.767649687220733, "eval_loss": 0.007637403439730406, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5270 }, { "epoch": 3.7683646112600537, "grad_norm": 0.023991011083126068, "learning_rate": 1.7318764249869935e-05, "loss": 0.0098, "step": 5271 }, { "epoch": 3.7690795352993742, "grad_norm": 0.02365702949464321, "learning_rate": 1.7299871358120374e-05, "loss": 0.0075, "step": 5272 }, { "epoch": 3.769794459338695, "grad_norm": 
0.03144839406013489, "learning_rate": 1.7280986621105355e-05, "loss": 0.0081, "step": 5273 }, { "epoch": 3.770509383378016, "grad_norm": 0.025859514251351357, "learning_rate": 1.7262110043534284e-05, "loss": 0.0092, "step": 5274 }, { "epoch": 3.771224307417337, "grad_norm": 0.02056470327079296, "learning_rate": 1.7243241630114664e-05, "loss": 0.0048, "step": 5275 }, { "epoch": 3.771224307417337, "eval_loss": 0.007691248320043087, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5275 }, { "epoch": 3.771939231456658, "grad_norm": 0.02697356417775154, "learning_rate": 1.722438138555183e-05, "loss": 0.0087, "step": 5276 }, { "epoch": 3.7726541554959785, "grad_norm": 0.024518052116036415, "learning_rate": 1.720552931454915e-05, "loss": 0.0064, "step": 5277 }, { "epoch": 3.7733690795352994, "grad_norm": 0.0223266389220953, "learning_rate": 1.7186685421807962e-05, "loss": 0.007, "step": 5278 }, { "epoch": 3.77408400357462, "grad_norm": 0.02355293743312359, "learning_rate": 1.7167849712027506e-05, "loss": 0.0069, "step": 5279 }, { "epoch": 3.774798927613941, "grad_norm": 0.03136883303523064, "learning_rate": 1.714902218990504e-05, "loss": 0.0091, "step": 5280 }, { "epoch": 3.774798927613941, "eval_loss": 0.007428450044244528, "eval_runtime": 4.6004, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 5280 }, { "epoch": 3.775513851653262, "grad_norm": 0.021379029378294945, "learning_rate": 1.713020286013577e-05, "loss": 0.0043, "step": 5281 }, { "epoch": 3.7762287756925828, "grad_norm": 0.022498564794659615, "learning_rate": 1.7111391727412805e-05, "loss": 0.0068, "step": 5282 }, { "epoch": 3.7769436997319037, "grad_norm": 0.027095668017864227, "learning_rate": 1.7092588796427306e-05, "loss": 0.0059, "step": 5283 }, { "epoch": 3.777658623771224, "grad_norm": 0.03317838907241821, "learning_rate": 1.7073794071868282e-05, "loss": 0.0127, "step": 5284 }, { "epoch": 3.778373547810545, "grad_norm": 0.022458115592598915, "learning_rate": 1.7055007558422775e-05, "loss": 0.0039, "step": 5285 }, { "epoch": 3.778373547810545, "eval_loss": 0.007321636192500591, "eval_runtime": 4.5818, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5285 }, { "epoch": 3.779088471849866, "grad_norm": 0.024231992661952972, "learning_rate": 1.7036229260775766e-05, "loss": 0.0084, "step": 5286 }, { "epoch": 3.7798033958891866, "grad_norm": 0.02800634875893593, "learning_rate": 1.701745918361014e-05, "loss": 0.0077, "step": 5287 }, { "epoch": 3.7805183199285075, "grad_norm": 0.027282241731882095, "learning_rate": 1.6998697331606777e-05, "loss": 0.0079, "step": 5288 }, { "epoch": 3.7812332439678285, "grad_norm": 0.030423006042838097, "learning_rate": 1.6979943709444518e-05, "loss": 0.0096, "step": 5289 }, { "epoch": 3.7819481680071494, "grad_norm": 0.026072019711136818, "learning_rate": 1.6961198321800093e-05, "loss": 0.0069, "step": 5290 }, { "epoch": 3.7819481680071494, "eval_loss": 0.00733243627473712, "eval_runtime": 4.5814, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5290 }, { "epoch": 3.7826630920464703, "grad_norm": 0.023962484672665596, "learning_rate": 1.694246117334823e-05, "loss": 0.0061, "step": 5291 }, { "epoch": 3.783378016085791, "grad_norm": 0.021577155217528343, "learning_rate": 1.6923732268761595e-05, "loss": 0.0041, "step": 5292 }, { "epoch": 3.7840929401251118, "grad_norm": 0.02261415682733059, "learning_rate": 1.6905011612710764e-05, "loss": 0.0045, "step": 5293 }, { "epoch": 
3.7848078641644323, "grad_norm": 0.02046770043671131, "learning_rate": 1.6886299209864314e-05, "loss": 0.0052, "step": 5294 }, { "epoch": 3.785522788203753, "grad_norm": 0.030191965401172638, "learning_rate": 1.686759506488869e-05, "loss": 0.0075, "step": 5295 }, { "epoch": 3.785522788203753, "eval_loss": 0.007343644741922617, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5295 }, { "epoch": 3.786237712243074, "grad_norm": 0.024290811270475388, "learning_rate": 1.6848899182448347e-05, "loss": 0.0099, "step": 5296 }, { "epoch": 3.786952636282395, "grad_norm": 0.021963395178318024, "learning_rate": 1.683021156720564e-05, "loss": 0.0063, "step": 5297 }, { "epoch": 3.787667560321716, "grad_norm": 0.020886173471808434, "learning_rate": 1.6811532223820874e-05, "loss": 0.0034, "step": 5298 }, { "epoch": 3.7883824843610365, "grad_norm": 0.029053909704089165, "learning_rate": 1.679286115695231e-05, "loss": 0.0124, "step": 5299 }, { "epoch": 3.7890974084003575, "grad_norm": 0.027275625616312027, "learning_rate": 1.677419837125609e-05, "loss": 0.0065, "step": 5300 }, { "epoch": 3.7890974084003575, "eval_loss": 0.007310152053833008, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5300 }, { "epoch": 3.7898123324396784, "grad_norm": 0.026477094739675522, "learning_rate": 1.675554387138634e-05, "loss": 0.0048, "step": 5301 }, { "epoch": 3.790527256478999, "grad_norm": 0.0198544729501009, "learning_rate": 1.673689766199513e-05, "loss": 0.0039, "step": 5302 }, { "epoch": 3.79124218051832, "grad_norm": 0.022216789424419403, "learning_rate": 1.6718259747732405e-05, "loss": 0.0045, "step": 5303 }, { "epoch": 3.791957104557641, "grad_norm": 0.021965378895401955, "learning_rate": 1.6699630133246086e-05, "loss": 0.004, "step": 5304 }, { "epoch": 3.7926720285969617, "grad_norm": 0.02968132123351097, "learning_rate": 1.6681008823182024e-05, "loss": 0.0066, "step": 5305 }, { "epoch": 3.7926720285969617, "eval_loss": 0.007355362176895142, "eval_runtime": 4.5951, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 5305 }, { "epoch": 3.7933869526362827, "grad_norm": 0.025030745193362236, "learning_rate": 1.6662395822183973e-05, "loss": 0.0055, "step": 5306 }, { "epoch": 3.794101876675603, "grad_norm": 0.021535921841859818, "learning_rate": 1.6643791134893643e-05, "loss": 0.0077, "step": 5307 }, { "epoch": 3.794816800714924, "grad_norm": 0.0272658783942461, "learning_rate": 1.6625194765950635e-05, "loss": 0.0052, "step": 5308 }, { "epoch": 3.7955317247542446, "grad_norm": 0.024409588426351547, "learning_rate": 1.660660671999251e-05, "loss": 0.0054, "step": 5309 }, { "epoch": 3.7962466487935655, "grad_norm": 0.024536557495594025, "learning_rate": 1.6588027001654765e-05, "loss": 0.0059, "step": 5310 }, { "epoch": 3.7962466487935655, "eval_loss": 0.007385312579572201, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 5310 }, { "epoch": 3.7969615728328865, "grad_norm": 0.020266959443688393, "learning_rate": 1.6569455615570757e-05, "loss": 0.0065, "step": 5311 }, { "epoch": 3.7976764968722074, "grad_norm": 0.024079661816358566, "learning_rate": 1.6550892566371822e-05, "loss": 0.0053, "step": 5312 }, { "epoch": 3.7983914209115284, "grad_norm": 0.024218132719397545, "learning_rate": 1.6532337858687218e-05, "loss": 0.0045, "step": 5313 }, { "epoch": 3.799106344950849, "grad_norm": 0.025427749380469322, "learning_rate": 1.651379149714407e-05, "loss": 
0.0075, "step": 5314 }, { "epoch": 3.79982126899017, "grad_norm": 0.0268947072327137, "learning_rate": 1.649525348636748e-05, "loss": 0.0083, "step": 5315 }, { "epoch": 3.79982126899017, "eval_loss": 0.007447536569088697, "eval_runtime": 4.5912, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.832, "step": 5315 }, { "epoch": 3.8005361930294908, "grad_norm": 0.0277300663292408, "learning_rate": 1.647672383098045e-05, "loss": 0.0099, "step": 5316 }, { "epoch": 3.8012511170688112, "grad_norm": 0.024532858282327652, "learning_rate": 1.6458202535603866e-05, "loss": 0.0035, "step": 5317 }, { "epoch": 3.801966041108132, "grad_norm": 0.0229893047362566, "learning_rate": 1.6439689604856566e-05, "loss": 0.0071, "step": 5318 }, { "epoch": 3.802680965147453, "grad_norm": 0.02337624318897724, "learning_rate": 1.6421185043355304e-05, "loss": 0.0078, "step": 5319 }, { "epoch": 3.803395889186774, "grad_norm": 0.023005660623311996, "learning_rate": 1.6402688855714732e-05, "loss": 0.0066, "step": 5320 }, { "epoch": 3.803395889186774, "eval_loss": 0.007571964990347624, "eval_runtime": 4.5854, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5320 }, { "epoch": 3.8041108132260946, "grad_norm": 0.021177176386117935, "learning_rate": 1.63842010465474e-05, "loss": 0.0053, "step": 5321 }, { "epoch": 3.8048257372654155, "grad_norm": 0.03330070525407791, "learning_rate": 1.6365721620463786e-05, "loss": 0.0045, "step": 5322 }, { "epoch": 3.8055406613047364, "grad_norm": 0.03046579100191593, "learning_rate": 1.6347250582072305e-05, "loss": 0.0076, "step": 5323 }, { "epoch": 3.806255585344057, "grad_norm": 0.01997659169137478, "learning_rate": 1.6328787935979206e-05, "loss": 0.0045, "step": 5324 }, { "epoch": 3.806970509383378, "grad_norm": 0.024359285831451416, "learning_rate": 1.631033368678872e-05, "loss": 0.0061, "step": 5325 }, { "epoch": 3.806970509383378, "eval_loss": 0.0074658384546637535, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.836, "step": 5325 }, { "epoch": 3.807685433422699, "grad_norm": 0.030659392476081848, "learning_rate": 1.6291887839102964e-05, "loss": 0.0097, "step": 5326 }, { "epoch": 3.8084003574620198, "grad_norm": 0.025608979165554047, "learning_rate": 1.6273450397521923e-05, "loss": 0.0076, "step": 5327 }, { "epoch": 3.8091152815013407, "grad_norm": 0.02480766922235489, "learning_rate": 1.625502136664352e-05, "loss": 0.0063, "step": 5328 }, { "epoch": 3.809830205540661, "grad_norm": 0.023527832701802254, "learning_rate": 1.6236600751063596e-05, "loss": 0.0059, "step": 5329 }, { "epoch": 3.810545129579982, "grad_norm": 0.025283006951212883, "learning_rate": 1.6218188555375835e-05, "loss": 0.0069, "step": 5330 }, { "epoch": 3.810545129579982, "eval_loss": 0.007438949774950743, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5330 }, { "epoch": 3.811260053619303, "grad_norm": 0.025842178612947464, "learning_rate": 1.619978478417189e-05, "loss": 0.0053, "step": 5331 }, { "epoch": 3.8119749776586236, "grad_norm": 0.02732466161251068, "learning_rate": 1.618138944204125e-05, "loss": 0.0081, "step": 5332 }, { "epoch": 3.8126899016979445, "grad_norm": 0.0267244391143322, "learning_rate": 1.616300253357135e-05, "loss": 0.0063, "step": 5333 }, { "epoch": 3.8134048257372655, "grad_norm": 0.021567558869719505, "learning_rate": 1.6144624063347514e-05, "loss": 0.0064, "step": 5334 }, { "epoch": 3.8141197497765864, "grad_norm": 0.02914063259959221, "learning_rate": 
1.6126254035952926e-05, "loss": 0.0079, "step": 5335 }, { "epoch": 3.8141197497765864, "eval_loss": 0.007451266050338745, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5335 }, { "epoch": 3.814834673815907, "grad_norm": 0.022849811241030693, "learning_rate": 1.6107892455968705e-05, "loss": 0.0063, "step": 5336 }, { "epoch": 3.815549597855228, "grad_norm": 0.02524534799158573, "learning_rate": 1.6089539327973857e-05, "loss": 0.0057, "step": 5337 }, { "epoch": 3.816264521894549, "grad_norm": 0.02678363211452961, "learning_rate": 1.6071194656545246e-05, "loss": 0.0068, "step": 5338 }, { "epoch": 3.8169794459338693, "grad_norm": 0.031063009053468704, "learning_rate": 1.6052858446257674e-05, "loss": 0.0055, "step": 5339 }, { "epoch": 3.8176943699731902, "grad_norm": 0.028365688398480415, "learning_rate": 1.6034530701683804e-05, "loss": 0.0074, "step": 5340 }, { "epoch": 3.8176943699731902, "eval_loss": 0.007445878814905882, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5340 }, { "epoch": 3.818409294012511, "grad_norm": 0.033307235687971115, "learning_rate": 1.6016211427394195e-05, "loss": 0.01, "step": 5341 }, { "epoch": 3.819124218051832, "grad_norm": 0.02669103629887104, "learning_rate": 1.599790062795732e-05, "loss": 0.0075, "step": 5342 }, { "epoch": 3.819839142091153, "grad_norm": 0.029789021238684654, "learning_rate": 1.5979598307939468e-05, "loss": 0.0053, "step": 5343 }, { "epoch": 3.8205540661304735, "grad_norm": 0.022896580398082733, "learning_rate": 1.5961304471904898e-05, "loss": 0.0048, "step": 5344 }, { "epoch": 3.8212689901697945, "grad_norm": 0.02326788753271103, "learning_rate": 1.5943019124415687e-05, "loss": 0.0063, "step": 5345 }, { "epoch": 3.8212689901697945, "eval_loss": 0.0073146396316587925, "eval_runtime": 4.6119, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 5345 }, { "epoch": 3.8219839142091154, "grad_norm": 0.023492110893130302, "learning_rate": 1.5924742270031822e-05, "loss": 0.0073, "step": 5346 }, { "epoch": 3.822698838248436, "grad_norm": 0.027340393513441086, "learning_rate": 1.5906473913311206e-05, "loss": 0.0056, "step": 5347 }, { "epoch": 3.823413762287757, "grad_norm": 0.023774640634655952, "learning_rate": 1.588821405880954e-05, "loss": 0.0072, "step": 5348 }, { "epoch": 3.824128686327078, "grad_norm": 0.036709267646074295, "learning_rate": 1.586996271108048e-05, "loss": 0.0117, "step": 5349 }, { "epoch": 3.8248436103663987, "grad_norm": 0.019384071230888367, "learning_rate": 1.5851719874675554e-05, "loss": 0.0044, "step": 5350 }, { "epoch": 3.8248436103663987, "eval_loss": 0.007292693480849266, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 5350 }, { "epoch": 3.8255585344057192, "grad_norm": 0.023059381172060966, "learning_rate": 1.58334855541441e-05, "loss": 0.0063, "step": 5351 }, { "epoch": 3.82627345844504, "grad_norm": 0.029281362891197205, "learning_rate": 1.5815259754033407e-05, "loss": 0.0106, "step": 5352 }, { "epoch": 3.826988382484361, "grad_norm": 0.021762756630778313, "learning_rate": 1.579704247888863e-05, "loss": 0.0069, "step": 5353 }, { "epoch": 3.8277033065236816, "grad_norm": 0.016358807682991028, "learning_rate": 1.5778833733252735e-05, "loss": 0.0036, "step": 5354 }, { "epoch": 3.8284182305630026, "grad_norm": 0.027044260874390602, "learning_rate": 1.576063352166665e-05, "loss": 0.0051, "step": 5355 }, { "epoch": 3.8284182305630026, "eval_loss": 
0.007279955316334963, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5355 }, { "epoch": 3.8291331546023235, "grad_norm": 0.01562016922980547, "learning_rate": 1.57424418486691e-05, "loss": 0.0034, "step": 5356 }, { "epoch": 3.8298480786416444, "grad_norm": 0.03152468055486679, "learning_rate": 1.5724258718796712e-05, "loss": 0.0119, "step": 5357 }, { "epoch": 3.8305630026809654, "grad_norm": 0.028945086523890495, "learning_rate": 1.5706084136584013e-05, "loss": 0.0121, "step": 5358 }, { "epoch": 3.831277926720286, "grad_norm": 0.02050665207207203, "learning_rate": 1.5687918106563326e-05, "loss": 0.0045, "step": 5359 }, { "epoch": 3.831992850759607, "grad_norm": 0.02874249592423439, "learning_rate": 1.566976063326491e-05, "loss": 0.0076, "step": 5360 }, { "epoch": 3.831992850759607, "eval_loss": 0.0072502573020756245, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5360 }, { "epoch": 3.8327077747989278, "grad_norm": 0.029901256784796715, "learning_rate": 1.5651611721216863e-05, "loss": 0.0102, "step": 5361 }, { "epoch": 3.8334226988382483, "grad_norm": 0.0254481453448534, "learning_rate": 1.5633471374945113e-05, "loss": 0.0061, "step": 5362 }, { "epoch": 3.834137622877569, "grad_norm": 0.027340834960341454, "learning_rate": 1.5615339598973544e-05, "loss": 0.0051, "step": 5363 }, { "epoch": 3.83485254691689, "grad_norm": 0.029481226578354836, "learning_rate": 1.55972163978238e-05, "loss": 0.0089, "step": 5364 }, { "epoch": 3.835567470956211, "grad_norm": 0.018366944044828415, "learning_rate": 1.5579101776015442e-05, "loss": 0.0041, "step": 5365 }, { "epoch": 3.835567470956211, "eval_loss": 0.007250961381942034, "eval_runtime": 4.5866, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 5365 }, { "epoch": 3.8362823949955316, "grad_norm": 0.019585121423006058, "learning_rate": 1.5560995738065908e-05, "loss": 0.0064, "step": 5366 }, { "epoch": 3.8369973190348525, "grad_norm": 0.022597450762987137, "learning_rate": 1.5542898288490427e-05, "loss": 0.0039, "step": 5367 }, { "epoch": 3.8377122430741735, "grad_norm": 0.03486715257167816, "learning_rate": 1.5524809431802162e-05, "loss": 0.0074, "step": 5368 }, { "epoch": 3.838427167113494, "grad_norm": 0.020545747131109238, "learning_rate": 1.5506729172512067e-05, "loss": 0.0039, "step": 5369 }, { "epoch": 3.839142091152815, "grad_norm": 0.028882445767521858, "learning_rate": 1.5488657515129e-05, "loss": 0.0101, "step": 5370 }, { "epoch": 3.839142091152815, "eval_loss": 0.007229707669466734, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5370 }, { "epoch": 3.839857015192136, "grad_norm": 0.0233939066529274, "learning_rate": 1.5470594464159683e-05, "loss": 0.0046, "step": 5371 }, { "epoch": 3.840571939231457, "grad_norm": 0.032288867980241776, "learning_rate": 1.5452540024108625e-05, "loss": 0.0116, "step": 5372 }, { "epoch": 3.8412868632707777, "grad_norm": 0.02501652017235756, "learning_rate": 1.5434494199478244e-05, "loss": 0.0074, "step": 5373 }, { "epoch": 3.8420017873100982, "grad_norm": 0.026168981567025185, "learning_rate": 1.5416456994768812e-05, "loss": 0.0046, "step": 5374 }, { "epoch": 3.842716711349419, "grad_norm": 0.0328313447535038, "learning_rate": 1.5398428414478407e-05, "loss": 0.0059, "step": 5375 }, { "epoch": 3.842716711349419, "eval_loss": 0.0073029715567827225, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, 
"step": 5375 }, { "epoch": 3.84343163538874, "grad_norm": 0.02395367994904518, "learning_rate": 1.5380408463102997e-05, "loss": 0.0061, "step": 5376 }, { "epoch": 3.8441465594280606, "grad_norm": 0.024900374934077263, "learning_rate": 1.5362397145136397e-05, "loss": 0.0098, "step": 5377 }, { "epoch": 3.8448614834673815, "grad_norm": 0.026697389781475067, "learning_rate": 1.5344394465070235e-05, "loss": 0.01, "step": 5378 }, { "epoch": 3.8455764075067025, "grad_norm": 0.033190760761499405, "learning_rate": 1.532640042739402e-05, "loss": 0.0077, "step": 5379 }, { "epoch": 3.8462913315460234, "grad_norm": 0.02140617184340954, "learning_rate": 1.5308415036595074e-05, "loss": 0.0068, "step": 5380 }, { "epoch": 3.8462913315460234, "eval_loss": 0.007300182245671749, "eval_runtime": 4.5993, "eval_samples_per_second": 10.871, "eval_steps_per_second": 2.827, "step": 5380 }, { "epoch": 3.847006255585344, "grad_norm": 0.019147230312228203, "learning_rate": 1.5290438297158598e-05, "loss": 0.0037, "step": 5381 }, { "epoch": 3.847721179624665, "grad_norm": 0.02319330908358097, "learning_rate": 1.527247021356763e-05, "loss": 0.0047, "step": 5382 }, { "epoch": 3.848436103663986, "grad_norm": 0.028992963954806328, "learning_rate": 1.5254510790303e-05, "loss": 0.0059, "step": 5383 }, { "epoch": 3.8491510277033063, "grad_norm": 0.020816711708903313, "learning_rate": 1.5236560031843444e-05, "loss": 0.004, "step": 5384 }, { "epoch": 3.8498659517426272, "grad_norm": 0.022445494309067726, "learning_rate": 1.5218617942665498e-05, "loss": 0.0042, "step": 5385 }, { "epoch": 3.8498659517426272, "eval_loss": 0.007281775586307049, "eval_runtime": 4.6019, "eval_samples_per_second": 10.865, "eval_steps_per_second": 2.825, "step": 5385 }, { "epoch": 3.850580875781948, "grad_norm": 0.0206309761852026, "learning_rate": 1.5200684527243552e-05, "loss": 0.0035, "step": 5386 }, { "epoch": 3.851295799821269, "grad_norm": 0.031711939722299576, "learning_rate": 1.518275979004985e-05, "loss": 0.005, "step": 5387 }, { "epoch": 3.85201072386059, "grad_norm": 0.02535167522728443, "learning_rate": 1.5164843735554408e-05, "loss": 0.0049, "step": 5388 }, { "epoch": 3.8527256478999106, "grad_norm": 0.027442973107099533, "learning_rate": 1.514693636822514e-05, "loss": 0.0073, "step": 5389 }, { "epoch": 3.8534405719392315, "grad_norm": 0.03069327026605606, "learning_rate": 1.5129037692527792e-05, "loss": 0.0089, "step": 5390 }, { "epoch": 3.8534405719392315, "eval_loss": 0.007355134002864361, "eval_runtime": 4.5908, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 5390 }, { "epoch": 3.854155495978552, "grad_norm": 0.030435945838689804, "learning_rate": 1.5111147712925882e-05, "loss": 0.0078, "step": 5391 }, { "epoch": 3.854870420017873, "grad_norm": 0.028546195477247238, "learning_rate": 1.5093266433880837e-05, "loss": 0.0077, "step": 5392 }, { "epoch": 3.855585344057194, "grad_norm": 0.024265781044960022, "learning_rate": 1.5075393859851844e-05, "loss": 0.0046, "step": 5393 }, { "epoch": 3.856300268096515, "grad_norm": 0.028783291578292847, "learning_rate": 1.5057529995295972e-05, "loss": 0.0105, "step": 5394 }, { "epoch": 3.8570151921358358, "grad_norm": 0.024355193600058556, "learning_rate": 1.503967484466811e-05, "loss": 0.0066, "step": 5395 }, { "epoch": 3.8570151921358358, "eval_loss": 0.007393990643322468, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5395 }, { "epoch": 3.8577301161751563, "grad_norm": 0.03319816663861275, "learning_rate": 
1.502182841242094e-05, "loss": 0.0116, "step": 5396 }, { "epoch": 3.858445040214477, "grad_norm": 0.02007928304374218, "learning_rate": 1.5003990703004995e-05, "loss": 0.0035, "step": 5397 }, { "epoch": 3.859159964253798, "grad_norm": 0.021891146898269653, "learning_rate": 1.4986161720868658e-05, "loss": 0.0058, "step": 5398 }, { "epoch": 3.8598748882931186, "grad_norm": 0.024270353838801384, "learning_rate": 1.4968341470458065e-05, "loss": 0.0067, "step": 5399 }, { "epoch": 3.8605898123324396, "grad_norm": 0.022198088467121124, "learning_rate": 1.495052995621724e-05, "loss": 0.0049, "step": 5400 }, { "epoch": 3.8605898123324396, "eval_loss": 0.007397189736366272, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 5400 }, { "epoch": 3.8613047363717605, "grad_norm": 0.032529477030038834, "learning_rate": 1.4932727182588024e-05, "loss": 0.0071, "step": 5401 }, { "epoch": 3.8620196604110815, "grad_norm": 0.021829085424542427, "learning_rate": 1.491493315401002e-05, "loss": 0.0045, "step": 5402 }, { "epoch": 3.8627345844504024, "grad_norm": 0.02211351878941059, "learning_rate": 1.4897147874920725e-05, "loss": 0.004, "step": 5403 }, { "epoch": 3.863449508489723, "grad_norm": 0.027576085180044174, "learning_rate": 1.4879371349755394e-05, "loss": 0.0053, "step": 5404 }, { "epoch": 3.864164432529044, "grad_norm": 0.027750015258789062, "learning_rate": 1.4861603582947131e-05, "loss": 0.0069, "step": 5405 }, { "epoch": 3.864164432529044, "eval_loss": 0.007405236829072237, "eval_runtime": 4.5872, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 5405 }, { "epoch": 3.8648793565683643, "grad_norm": 0.023450331762433052, "learning_rate": 1.4843844578926864e-05, "loss": 0.0045, "step": 5406 }, { "epoch": 3.8655942806076853, "grad_norm": 0.022602612152695656, "learning_rate": 1.482609434212328e-05, "loss": 0.0064, "step": 5407 }, { "epoch": 3.8663092046470062, "grad_norm": 0.03192317485809326, "learning_rate": 1.4808352876962983e-05, "loss": 0.0131, "step": 5408 }, { "epoch": 3.867024128686327, "grad_norm": 0.026633959263563156, "learning_rate": 1.4790620187870275e-05, "loss": 0.0042, "step": 5409 }, { "epoch": 3.867739052725648, "grad_norm": 0.029223036020994186, "learning_rate": 1.477289627926734e-05, "loss": 0.0096, "step": 5410 }, { "epoch": 3.867739052725648, "eval_loss": 0.007419346831738949, "eval_runtime": 4.6094, "eval_samples_per_second": 10.847, "eval_steps_per_second": 2.82, "step": 5410 }, { "epoch": 3.8684539767649686, "grad_norm": 0.02784627676010132, "learning_rate": 1.4755181155574166e-05, "loss": 0.007, "step": 5411 }, { "epoch": 3.8691689008042895, "grad_norm": 0.026186082512140274, "learning_rate": 1.4737474821208513e-05, "loss": 0.0061, "step": 5412 }, { "epoch": 3.8698838248436105, "grad_norm": 0.0315246656537056, "learning_rate": 1.4719777280585984e-05, "loss": 0.0074, "step": 5413 }, { "epoch": 3.870598748882931, "grad_norm": 0.031181689351797104, "learning_rate": 1.4702088538119996e-05, "loss": 0.008, "step": 5414 }, { "epoch": 3.871313672922252, "grad_norm": 0.02845790423452854, "learning_rate": 1.4684408598221721e-05, "loss": 0.0076, "step": 5415 }, { "epoch": 3.871313672922252, "eval_loss": 0.0072725811041891575, "eval_runtime": 4.5849, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 5415 }, { "epoch": 3.872028596961573, "grad_norm": 0.023933952674269676, "learning_rate": 1.4666737465300202e-05, "loss": 0.0044, "step": 5416 }, { "epoch": 3.872743521000894, "grad_norm": 
0.0232107974588871, "learning_rate": 1.4649075143762225e-05, "loss": 0.0063, "step": 5417 }, { "epoch": 3.8734584450402147, "grad_norm": 0.021557016298174858, "learning_rate": 1.4631421638012422e-05, "loss": 0.0038, "step": 5418 }, { "epoch": 3.8741733690795352, "grad_norm": 0.035332392901182175, "learning_rate": 1.4613776952453228e-05, "loss": 0.0102, "step": 5419 }, { "epoch": 3.874888293118856, "grad_norm": 0.02797609008848667, "learning_rate": 1.4596141091484828e-05, "loss": 0.0077, "step": 5420 }, { "epoch": 3.874888293118856, "eval_loss": 0.007028554566204548, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 5420 }, { "epoch": 3.8756032171581767, "grad_norm": 0.02243499457836151, "learning_rate": 1.4578514059505255e-05, "loss": 0.0042, "step": 5421 }, { "epoch": 3.8763181411974976, "grad_norm": 0.019933000206947327, "learning_rate": 1.4560895860910345e-05, "loss": 0.0043, "step": 5422 }, { "epoch": 3.8770330652368186, "grad_norm": 0.029939396306872368, "learning_rate": 1.4543286500093678e-05, "loss": 0.0092, "step": 5423 }, { "epoch": 3.8777479892761395, "grad_norm": 0.023285318166017532, "learning_rate": 1.4525685981446679e-05, "loss": 0.0048, "step": 5424 }, { "epoch": 3.8784629133154604, "grad_norm": 0.035507529973983765, "learning_rate": 1.4508094309358572e-05, "loss": 0.0102, "step": 5425 }, { "epoch": 3.8784629133154604, "eval_loss": 0.006920042913407087, "eval_runtime": 4.5795, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 5425 }, { "epoch": 3.879177837354781, "grad_norm": 0.023755287751555443, "learning_rate": 1.449051148821632e-05, "loss": 0.0075, "step": 5426 }, { "epoch": 3.879892761394102, "grad_norm": 0.021565740928053856, "learning_rate": 1.4472937522404744e-05, "loss": 0.0045, "step": 5427 }, { "epoch": 3.880607685433423, "grad_norm": 0.030905025079846382, "learning_rate": 1.4455372416306406e-05, "loss": 0.0135, "step": 5428 }, { "epoch": 3.8813226094727433, "grad_norm": 0.026690956205129623, "learning_rate": 1.4437816174301682e-05, "loss": 0.0071, "step": 5429 }, { "epoch": 3.8820375335120643, "grad_norm": 0.026334432885050774, "learning_rate": 1.4420268800768744e-05, "loss": 0.0062, "step": 5430 }, { "epoch": 3.8820375335120643, "eval_loss": 0.006924851797521114, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 5430 }, { "epoch": 3.882752457551385, "grad_norm": 0.02870616316795349, "learning_rate": 1.4402730300083533e-05, "loss": 0.0108, "step": 5431 }, { "epoch": 3.883467381590706, "grad_norm": 0.02607223391532898, "learning_rate": 1.438520067661982e-05, "loss": 0.0076, "step": 5432 }, { "epoch": 3.8841823056300266, "grad_norm": 0.03505956754088402, "learning_rate": 1.4367679934749085e-05, "loss": 0.0102, "step": 5433 }, { "epoch": 3.8848972296693476, "grad_norm": 0.024677656590938568, "learning_rate": 1.4350168078840654e-05, "loss": 0.0058, "step": 5434 }, { "epoch": 3.8856121537086685, "grad_norm": 0.02319561317563057, "learning_rate": 1.4332665113261646e-05, "loss": 0.0067, "step": 5435 }, { "epoch": 3.8856121537086685, "eval_loss": 0.0069025191478431225, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 5435 }, { "epoch": 3.886327077747989, "grad_norm": 0.026311049237847328, "learning_rate": 1.4315171042376896e-05, "loss": 0.0058, "step": 5436 }, { "epoch": 3.88704200178731, "grad_norm": 0.024240337312221527, "learning_rate": 1.4297685870549088e-05, "loss": 0.0081, "step": 5437 }, { 
"epoch": 3.887756925826631, "grad_norm": 0.02726398967206478, "learning_rate": 1.4280209602138672e-05, "loss": 0.0083, "step": 5438 }, { "epoch": 3.888471849865952, "grad_norm": 0.023146824911236763, "learning_rate": 1.4262742241503835e-05, "loss": 0.0067, "step": 5439 }, { "epoch": 3.889186773905273, "grad_norm": 0.026688946411013603, "learning_rate": 1.4245283793000608e-05, "loss": 0.0062, "step": 5440 }, { "epoch": 3.889186773905273, "eval_loss": 0.006917247548699379, "eval_runtime": 4.5799, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5440 }, { "epoch": 3.8899016979445933, "grad_norm": 0.021465374156832695, "learning_rate": 1.422783426098273e-05, "loss": 0.0045, "step": 5441 }, { "epoch": 3.8906166219839142, "grad_norm": 0.023244014009833336, "learning_rate": 1.4210393649801779e-05, "loss": 0.0049, "step": 5442 }, { "epoch": 3.891331546023235, "grad_norm": 0.029030093923211098, "learning_rate": 1.4192961963807094e-05, "loss": 0.005, "step": 5443 }, { "epoch": 3.8920464700625557, "grad_norm": 0.03379908576607704, "learning_rate": 1.4175539207345739e-05, "loss": 0.0063, "step": 5444 }, { "epoch": 3.8927613941018766, "grad_norm": 0.0223431084305048, "learning_rate": 1.4158125384762605e-05, "loss": 0.0061, "step": 5445 }, { "epoch": 3.8927613941018766, "eval_loss": 0.006954575888812542, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 5445 }, { "epoch": 3.8934763181411975, "grad_norm": 0.031098879873752594, "learning_rate": 1.4140720500400361e-05, "loss": 0.0099, "step": 5446 }, { "epoch": 3.8941912421805185, "grad_norm": 0.019696548581123352, "learning_rate": 1.4123324558599387e-05, "loss": 0.0077, "step": 5447 }, { "epoch": 3.894906166219839, "grad_norm": 0.03410012647509575, "learning_rate": 1.4105937563697892e-05, "loss": 0.0083, "step": 5448 }, { "epoch": 3.89562109025916, "grad_norm": 0.020521575585007668, "learning_rate": 1.4088559520031841e-05, "loss": 0.0043, "step": 5449 }, { "epoch": 3.896336014298481, "grad_norm": 0.021693691611289978, "learning_rate": 1.4071190431934934e-05, "loss": 0.0049, "step": 5450 }, { "epoch": 3.896336014298481, "eval_loss": 0.007015639916062355, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 5450 }, { "epoch": 3.8970509383378014, "grad_norm": 0.02792513184249401, "learning_rate": 1.4053830303738669e-05, "loss": 0.0077, "step": 5451 }, { "epoch": 3.8977658623771223, "grad_norm": 0.03213990107178688, "learning_rate": 1.4036479139772308e-05, "loss": 0.01, "step": 5452 }, { "epoch": 3.8984807864164432, "grad_norm": 0.030512642115354538, "learning_rate": 1.4019136944362882e-05, "loss": 0.0111, "step": 5453 }, { "epoch": 3.899195710455764, "grad_norm": 0.028227144852280617, "learning_rate": 1.4001803721835149e-05, "loss": 0.0071, "step": 5454 }, { "epoch": 3.899910634495085, "grad_norm": 0.025612499564886093, "learning_rate": 1.3984479476511675e-05, "loss": 0.0073, "step": 5455 }, { "epoch": 3.899910634495085, "eval_loss": 0.007108114194124937, "eval_runtime": 4.5873, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 5455 }, { "epoch": 3.9006255585344056, "grad_norm": 0.02424742840230465, "learning_rate": 1.3967164212712774e-05, "loss": 0.0056, "step": 5456 }, { "epoch": 3.9013404825737266, "grad_norm": 0.022492533549666405, "learning_rate": 1.3949857934756495e-05, "loss": 0.0063, "step": 5457 }, { "epoch": 3.9020554066130475, "grad_norm": 0.026176106184720993, "learning_rate": 1.3932560646958664e-05, 
"loss": 0.0059, "step": 5458 }, { "epoch": 3.902770330652368, "grad_norm": 0.02534174732863903, "learning_rate": 1.3915272353632897e-05, "loss": 0.0046, "step": 5459 }, { "epoch": 3.903485254691689, "grad_norm": 0.019194820895791054, "learning_rate": 1.3897993059090491e-05, "loss": 0.0039, "step": 5460 }, { "epoch": 3.903485254691689, "eval_loss": 0.0072045777924358845, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 5460 }, { "epoch": 3.90420017873101, "grad_norm": 0.025667859241366386, "learning_rate": 1.3880722767640575e-05, "loss": 0.0073, "step": 5461 }, { "epoch": 3.904915102770331, "grad_norm": 0.029067980125546455, "learning_rate": 1.3863461483590007e-05, "loss": 0.0068, "step": 5462 }, { "epoch": 3.9056300268096513, "grad_norm": 0.02095995843410492, "learning_rate": 1.3846209211243365e-05, "loss": 0.0059, "step": 5463 }, { "epoch": 3.9063449508489723, "grad_norm": 0.020415276288986206, "learning_rate": 1.3828965954903039e-05, "loss": 0.0038, "step": 5464 }, { "epoch": 3.907059874888293, "grad_norm": 0.030017632991075516, "learning_rate": 1.3811731718869109e-05, "loss": 0.0075, "step": 5465 }, { "epoch": 3.907059874888293, "eval_loss": 0.0072000096552073956, "eval_runtime": 4.5956, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 5465 }, { "epoch": 3.9077747989276137, "grad_norm": 0.024990493431687355, "learning_rate": 1.3794506507439452e-05, "loss": 0.0061, "step": 5466 }, { "epoch": 3.9084897229669346, "grad_norm": 0.03322731330990791, "learning_rate": 1.3777290324909698e-05, "loss": 0.0105, "step": 5467 }, { "epoch": 3.9092046470062556, "grad_norm": 0.028778454288840294, "learning_rate": 1.3760083175573168e-05, "loss": 0.0091, "step": 5468 }, { "epoch": 3.9099195710455765, "grad_norm": 0.02639981172978878, "learning_rate": 1.374288506372099e-05, "loss": 0.0091, "step": 5469 }, { "epoch": 3.9106344950848975, "grad_norm": 0.024324780330061913, "learning_rate": 1.3725695993642029e-05, "loss": 0.0066, "step": 5470 }, { "epoch": 3.9106344950848975, "eval_loss": 0.007192546501755714, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5470 }, { "epoch": 3.911349419124218, "grad_norm": 0.02720702439546585, "learning_rate": 1.3708515969622854e-05, "loss": 0.0093, "step": 5471 }, { "epoch": 3.912064343163539, "grad_norm": 0.020217258483171463, "learning_rate": 1.3691344995947818e-05, "loss": 0.0049, "step": 5472 }, { "epoch": 3.91277926720286, "grad_norm": 0.027291065081954002, "learning_rate": 1.3674183076899017e-05, "loss": 0.007, "step": 5473 }, { "epoch": 3.9134941912421803, "grad_norm": 0.028713544830679893, "learning_rate": 1.3657030216756262e-05, "loss": 0.008, "step": 5474 }, { "epoch": 3.9142091152815013, "grad_norm": 0.021466298028826714, "learning_rate": 1.3639886419797148e-05, "loss": 0.0044, "step": 5475 }, { "epoch": 3.9142091152815013, "eval_loss": 0.007116630673408508, "eval_runtime": 4.6215, "eval_samples_per_second": 10.819, "eval_steps_per_second": 2.813, "step": 5475 }, { "epoch": 3.9149240393208222, "grad_norm": 0.02197590284049511, "learning_rate": 1.3622751690296948e-05, "loss": 0.0047, "step": 5476 }, { "epoch": 3.915638963360143, "grad_norm": 0.024456333369016647, "learning_rate": 1.3605626032528745e-05, "loss": 0.0043, "step": 5477 }, { "epoch": 3.9163538873994637, "grad_norm": 0.028932472690939903, "learning_rate": 1.3588509450763282e-05, "loss": 0.0075, "step": 5478 }, { "epoch": 3.9170688114387846, "grad_norm": 0.025686495006084442, 
"learning_rate": 1.3571401949269102e-05, "loss": 0.0052, "step": 5479 }, { "epoch": 3.9177837354781055, "grad_norm": 0.027260655537247658, "learning_rate": 1.3554303532312474e-05, "loss": 0.0064, "step": 5480 }, { "epoch": 3.9177837354781055, "eval_loss": 0.007070094812661409, "eval_runtime": 4.6214, "eval_samples_per_second": 10.819, "eval_steps_per_second": 2.813, "step": 5480 }, { "epoch": 3.918498659517426, "grad_norm": 0.028895817697048187, "learning_rate": 1.3537214204157361e-05, "loss": 0.0051, "step": 5481 }, { "epoch": 3.919213583556747, "grad_norm": 0.026170363649725914, "learning_rate": 1.3520133969065502e-05, "loss": 0.0118, "step": 5482 }, { "epoch": 3.919928507596068, "grad_norm": 0.022687269374728203, "learning_rate": 1.350306283129637e-05, "loss": 0.0049, "step": 5483 }, { "epoch": 3.920643431635389, "grad_norm": 0.020597511902451515, "learning_rate": 1.3486000795107118e-05, "loss": 0.0057, "step": 5484 }, { "epoch": 3.92135835567471, "grad_norm": 0.02753262221813202, "learning_rate": 1.346894786475268e-05, "loss": 0.0088, "step": 5485 }, { "epoch": 3.92135835567471, "eval_loss": 0.007012483198195696, "eval_runtime": 4.5813, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5485 }, { "epoch": 3.9220732797140303, "grad_norm": 0.020062973722815514, "learning_rate": 1.3451904044485725e-05, "loss": 0.006, "step": 5486 }, { "epoch": 3.9227882037533512, "grad_norm": 0.025987893342971802, "learning_rate": 1.3434869338556593e-05, "loss": 0.0083, "step": 5487 }, { "epoch": 3.923503127792672, "grad_norm": 0.020967548713088036, "learning_rate": 1.341784375121342e-05, "loss": 0.0041, "step": 5488 }, { "epoch": 3.9242180518319927, "grad_norm": 0.019205698743462563, "learning_rate": 1.3400827286702e-05, "loss": 0.0049, "step": 5489 }, { "epoch": 3.9249329758713136, "grad_norm": 0.020728757604956627, "learning_rate": 1.3383819949265908e-05, "loss": 0.0043, "step": 5490 }, { "epoch": 3.9249329758713136, "eval_loss": 0.006960311904549599, "eval_runtime": 4.5874, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 5490 }, { "epoch": 3.9256478999106346, "grad_norm": 0.026063911616802216, "learning_rate": 1.336682174314643e-05, "loss": 0.0071, "step": 5491 }, { "epoch": 3.9263628239499555, "grad_norm": 0.023541226983070374, "learning_rate": 1.334983267258254e-05, "loss": 0.0049, "step": 5492 }, { "epoch": 3.927077747989276, "grad_norm": 0.026729917153716087, "learning_rate": 1.3332852741810975e-05, "loss": 0.0046, "step": 5493 }, { "epoch": 3.927792672028597, "grad_norm": 0.028776800259947777, "learning_rate": 1.331588195506619e-05, "loss": 0.0076, "step": 5494 }, { "epoch": 3.928507596067918, "grad_norm": 0.02926851250231266, "learning_rate": 1.3298920316580304e-05, "loss": 0.0095, "step": 5495 }, { "epoch": 3.928507596067918, "eval_loss": 0.007042599376291037, "eval_runtime": 4.5871, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 5495 }, { "epoch": 3.9292225201072384, "grad_norm": 0.02655724063515663, "learning_rate": 1.3281967830583265e-05, "loss": 0.0083, "step": 5496 }, { "epoch": 3.9299374441465593, "grad_norm": 0.023185325786471367, "learning_rate": 1.326502450130262e-05, "loss": 0.0039, "step": 5497 }, { "epoch": 3.9306523681858803, "grad_norm": 0.030129792168736458, "learning_rate": 1.3248090332963697e-05, "loss": 0.0081, "step": 5498 }, { "epoch": 3.931367292225201, "grad_norm": 0.023116694763302803, "learning_rate": 1.3231165329789546e-05, "loss": 0.0063, "step": 5499 }, { "epoch": 3.932082216264522, 
"grad_norm": 0.02063196338713169, "learning_rate": 1.3214249496000885e-05, "loss": 0.0037, "step": 5500 }, { "epoch": 3.932082216264522, "eval_loss": 0.0070355660282075405, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5500 }, { "epoch": 3.9327971403038426, "grad_norm": 0.0210881270468235, "learning_rate": 1.3197342835816196e-05, "loss": 0.0045, "step": 5501 }, { "epoch": 3.9335120643431636, "grad_norm": 0.025793813169002533, "learning_rate": 1.318044535345162e-05, "loss": 0.0065, "step": 5502 }, { "epoch": 3.9342269883824845, "grad_norm": 0.030478429049253464, "learning_rate": 1.3163557053121061e-05, "loss": 0.0072, "step": 5503 }, { "epoch": 3.934941912421805, "grad_norm": 0.030258705839514732, "learning_rate": 1.3146677939036117e-05, "loss": 0.0046, "step": 5504 }, { "epoch": 3.935656836461126, "grad_norm": 0.02423689514398575, "learning_rate": 1.3129808015406065e-05, "loss": 0.0077, "step": 5505 }, { "epoch": 3.935656836461126, "eval_loss": 0.00708279712125659, "eval_runtime": 4.5791, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 5505 }, { "epoch": 3.936371760500447, "grad_norm": 0.024213913828134537, "learning_rate": 1.3112947286437926e-05, "loss": 0.0089, "step": 5506 }, { "epoch": 3.937086684539768, "grad_norm": 0.025404896587133408, "learning_rate": 1.309609575633644e-05, "loss": 0.0052, "step": 5507 }, { "epoch": 3.9378016085790883, "grad_norm": 0.032358936965465546, "learning_rate": 1.3079253429303989e-05, "loss": 0.0059, "step": 5508 }, { "epoch": 3.9385165326184093, "grad_norm": 0.02672230452299118, "learning_rate": 1.3062420309540718e-05, "loss": 0.0052, "step": 5509 }, { "epoch": 3.9392314566577302, "grad_norm": 0.026148241013288498, "learning_rate": 1.3045596401244476e-05, "loss": 0.0066, "step": 5510 }, { "epoch": 3.9392314566577302, "eval_loss": 0.007007496897131205, "eval_runtime": 4.6274, "eval_samples_per_second": 10.805, "eval_steps_per_second": 2.809, "step": 5510 }, { "epoch": 3.9399463806970507, "grad_norm": 0.022175854071974754, "learning_rate": 1.3028781708610765e-05, "loss": 0.0039, "step": 5511 }, { "epoch": 3.9406613047363717, "grad_norm": 0.023205718025565147, "learning_rate": 1.301197623583285e-05, "loss": 0.0034, "step": 5512 }, { "epoch": 3.9413762287756926, "grad_norm": 0.03172365576028824, "learning_rate": 1.2995179987101647e-05, "loss": 0.0077, "step": 5513 }, { "epoch": 3.9420911528150135, "grad_norm": 0.036712124943733215, "learning_rate": 1.297839296660579e-05, "loss": 0.0078, "step": 5514 }, { "epoch": 3.9428060768543345, "grad_norm": 0.020129097625613213, "learning_rate": 1.2961615178531644e-05, "loss": 0.0046, "step": 5515 }, { "epoch": 3.9428060768543345, "eval_loss": 0.0070679690688848495, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5515 }, { "epoch": 3.943521000893655, "grad_norm": 0.0274917334318161, "learning_rate": 1.2944846627063206e-05, "loss": 0.0071, "step": 5516 }, { "epoch": 3.944235924932976, "grad_norm": 0.026265159249305725, "learning_rate": 1.2928087316382225e-05, "loss": 0.007, "step": 5517 }, { "epoch": 3.9449508489722964, "grad_norm": 0.02811834216117859, "learning_rate": 1.2911337250668116e-05, "loss": 0.005, "step": 5518 }, { "epoch": 3.9456657730116174, "grad_norm": 0.029883844777941704, "learning_rate": 1.2894596434098005e-05, "loss": 0.0112, "step": 5519 }, { "epoch": 3.9463806970509383, "grad_norm": 0.025291508063673973, "learning_rate": 1.2877864870846724e-05, "loss": 0.0064, "step": 5520 
}, { "epoch": 3.9463806970509383, "eval_loss": 0.007060622796416283, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 5520 }, { "epoch": 3.9470956210902592, "grad_norm": 0.024802934378385544, "learning_rate": 1.2861142565086738e-05, "loss": 0.0071, "step": 5521 }, { "epoch": 3.94781054512958, "grad_norm": 0.02303851582109928, "learning_rate": 1.2844429520988261e-05, "loss": 0.0051, "step": 5522 }, { "epoch": 3.9485254691689007, "grad_norm": 0.02452917955815792, "learning_rate": 1.2827725742719204e-05, "loss": 0.0066, "step": 5523 }, { "epoch": 3.9492403932082216, "grad_norm": 0.033106304705142975, "learning_rate": 1.2811031234445103e-05, "loss": 0.0095, "step": 5524 }, { "epoch": 3.9499553172475426, "grad_norm": 0.028944838792085648, "learning_rate": 1.2794346000329255e-05, "loss": 0.0073, "step": 5525 }, { "epoch": 3.9499553172475426, "eval_loss": 0.007016900461167097, "eval_runtime": 4.5869, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 5525 }, { "epoch": 3.950670241286863, "grad_norm": 0.020401226356625557, "learning_rate": 1.2777670044532585e-05, "loss": 0.0061, "step": 5526 }, { "epoch": 3.951385165326184, "grad_norm": 0.02543577551841736, "learning_rate": 1.2761003371213743e-05, "loss": 0.0053, "step": 5527 }, { "epoch": 3.952100089365505, "grad_norm": 0.023454803973436356, "learning_rate": 1.2744345984529065e-05, "loss": 0.0064, "step": 5528 }, { "epoch": 3.952815013404826, "grad_norm": 0.025615515187382698, "learning_rate": 1.2727697888632533e-05, "loss": 0.0079, "step": 5529 }, { "epoch": 3.953529937444147, "grad_norm": 0.031228598207235336, "learning_rate": 1.2711059087675852e-05, "loss": 0.0109, "step": 5530 }, { "epoch": 3.953529937444147, "eval_loss": 0.006890392862260342, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5530 }, { "epoch": 3.9542448614834673, "grad_norm": 0.02801814116537571, "learning_rate": 1.2694429585808404e-05, "loss": 0.0074, "step": 5531 }, { "epoch": 3.9549597855227883, "grad_norm": 0.03051871247589588, "learning_rate": 1.2677809387177219e-05, "loss": 0.0072, "step": 5532 }, { "epoch": 3.9556747095621088, "grad_norm": 0.024469805881381035, "learning_rate": 1.266119849592704e-05, "loss": 0.004, "step": 5533 }, { "epoch": 3.9563896336014297, "grad_norm": 0.02124415896832943, "learning_rate": 1.264459691620029e-05, "loss": 0.0041, "step": 5534 }, { "epoch": 3.9571045576407506, "grad_norm": 0.023842260241508484, "learning_rate": 1.2628004652137044e-05, "loss": 0.0046, "step": 5535 }, { "epoch": 3.9571045576407506, "eval_loss": 0.006936993915587664, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5535 }, { "epoch": 3.9578194816800716, "grad_norm": 0.021202782168984413, "learning_rate": 1.2611421707875082e-05, "loss": 0.0036, "step": 5536 }, { "epoch": 3.9585344057193925, "grad_norm": 0.022206716239452362, "learning_rate": 1.2594848087549827e-05, "loss": 0.0043, "step": 5537 }, { "epoch": 3.959249329758713, "grad_norm": 0.020683271810412407, "learning_rate": 1.257828379529441e-05, "loss": 0.0044, "step": 5538 }, { "epoch": 3.959964253798034, "grad_norm": 0.024193530902266502, "learning_rate": 1.2561728835239633e-05, "loss": 0.0049, "step": 5539 }, { "epoch": 3.960679177837355, "grad_norm": 0.04252808168530464, "learning_rate": 1.2545183211513917e-05, "loss": 0.0141, "step": 5540 }, { "epoch": 3.960679177837355, "eval_loss": 0.006964739877730608, "eval_runtime": 4.6115, 
"eval_samples_per_second": 10.843, "eval_steps_per_second": 2.819, "step": 5540 }, { "epoch": 3.9613941018766754, "grad_norm": 0.020364221185445786, "learning_rate": 1.2528646928243459e-05, "loss": 0.0031, "step": 5541 }, { "epoch": 3.9621090259159963, "grad_norm": 0.03362711891531944, "learning_rate": 1.2512119989552023e-05, "loss": 0.0081, "step": 5542 }, { "epoch": 3.9628239499553173, "grad_norm": 0.025193743407726288, "learning_rate": 1.2495602399561096e-05, "loss": 0.007, "step": 5543 }, { "epoch": 3.9635388739946382, "grad_norm": 0.027979817241430283, "learning_rate": 1.2479094162389837e-05, "loss": 0.0071, "step": 5544 }, { "epoch": 3.964253798033959, "grad_norm": 0.02396785467863083, "learning_rate": 1.2462595282155031e-05, "loss": 0.0054, "step": 5545 }, { "epoch": 3.964253798033959, "eval_loss": 0.006970370654016733, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5545 }, { "epoch": 3.9649687220732797, "grad_norm": 0.02598544955253601, "learning_rate": 1.2446105762971166e-05, "loss": 0.0062, "step": 5546 }, { "epoch": 3.9656836461126006, "grad_norm": 0.031603872776031494, "learning_rate": 1.2429625608950413e-05, "loss": 0.0099, "step": 5547 }, { "epoch": 3.966398570151921, "grad_norm": 0.02327454835176468, "learning_rate": 1.2413154824202544e-05, "loss": 0.005, "step": 5548 }, { "epoch": 3.967113494191242, "grad_norm": 0.020813781768083572, "learning_rate": 1.2396693412835058e-05, "loss": 0.0036, "step": 5549 }, { "epoch": 3.967828418230563, "grad_norm": 0.02077315002679825, "learning_rate": 1.2380241378953067e-05, "loss": 0.0046, "step": 5550 }, { "epoch": 3.967828418230563, "eval_loss": 0.007016555406153202, "eval_runtime": 4.5894, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 5550 }, { "epoch": 3.968543342269884, "grad_norm": 0.027743080630898476, "learning_rate": 1.2363798726659375e-05, "loss": 0.0084, "step": 5551 }, { "epoch": 3.969258266309205, "grad_norm": 0.030142858624458313, "learning_rate": 1.234736546005446e-05, "loss": 0.0053, "step": 5552 }, { "epoch": 3.9699731903485254, "grad_norm": 0.03296090289950371, "learning_rate": 1.2330941583236406e-05, "loss": 0.0086, "step": 5553 }, { "epoch": 3.9706881143878463, "grad_norm": 0.03662712499499321, "learning_rate": 1.2314527100301004e-05, "loss": 0.0062, "step": 5554 }, { "epoch": 3.9714030384271672, "grad_norm": 0.020093651488423347, "learning_rate": 1.2298122015341695e-05, "loss": 0.0063, "step": 5555 }, { "epoch": 3.9714030384271672, "eval_loss": 0.006981757935136557, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 5555 }, { "epoch": 3.9721179624664877, "grad_norm": 0.03942215442657471, "learning_rate": 1.2281726332449545e-05, "loss": 0.0103, "step": 5556 }, { "epoch": 3.9728328865058087, "grad_norm": 0.03203514590859413, "learning_rate": 1.226534005571331e-05, "loss": 0.007, "step": 5557 }, { "epoch": 3.9735478105451296, "grad_norm": 0.02123391442000866, "learning_rate": 1.2248963189219398e-05, "loss": 0.0053, "step": 5558 }, { "epoch": 3.9742627345844506, "grad_norm": 0.023960420861840248, "learning_rate": 1.2232595737051838e-05, "loss": 0.0046, "step": 5559 }, { "epoch": 3.974977658623771, "grad_norm": 0.029707973822951317, "learning_rate": 1.221623770329236e-05, "loss": 0.0074, "step": 5560 }, { "epoch": 3.974977658623771, "eval_loss": 0.006999256554991007, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5560 }, { "epoch": 
3.975692582663092, "grad_norm": 0.021122414618730545, "learning_rate": 1.2199889092020289e-05, "loss": 0.0045, "step": 5561 }, { "epoch": 3.976407506702413, "grad_norm": 0.07098445296287537, "learning_rate": 1.2183549907312625e-05, "loss": 0.0043, "step": 5562 }, { "epoch": 3.9771224307417334, "grad_norm": 0.03379693999886513, "learning_rate": 1.2167220153244075e-05, "loss": 0.0104, "step": 5563 }, { "epoch": 3.9778373547810544, "grad_norm": 0.028213266283273697, "learning_rate": 1.2150899833886892e-05, "loss": 0.0082, "step": 5564 }, { "epoch": 3.9785522788203753, "grad_norm": 0.028489813208580017, "learning_rate": 1.2134588953311055e-05, "loss": 0.0108, "step": 5565 }, { "epoch": 3.9785522788203753, "eval_loss": 0.006996180396527052, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5565 }, { "epoch": 3.9792672028596963, "grad_norm": 0.022210992872714996, "learning_rate": 1.2118287515584132e-05, "loss": 0.0043, "step": 5566 }, { "epoch": 3.979982126899017, "grad_norm": 0.022594014182686806, "learning_rate": 1.2101995524771375e-05, "loss": 0.0061, "step": 5567 }, { "epoch": 3.9806970509383377, "grad_norm": 0.01915205456316471, "learning_rate": 1.2085712984935693e-05, "loss": 0.0037, "step": 5568 }, { "epoch": 3.9814119749776586, "grad_norm": 0.030680129304528236, "learning_rate": 1.2069439900137575e-05, "loss": 0.0122, "step": 5569 }, { "epoch": 3.9821268990169796, "grad_norm": 0.025805659592151642, "learning_rate": 1.2053176274435207e-05, "loss": 0.0087, "step": 5570 }, { "epoch": 3.9821268990169796, "eval_loss": 0.0069848899729549885, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5570 }, { "epoch": 3.9828418230563, "grad_norm": 0.02727646939456463, "learning_rate": 1.2036922111884413e-05, "loss": 0.0045, "step": 5571 }, { "epoch": 3.983556747095621, "grad_norm": 0.020668037235736847, "learning_rate": 1.2020677416538623e-05, "loss": 0.0047, "step": 5572 }, { "epoch": 3.984271671134942, "grad_norm": 0.020814042538404465, "learning_rate": 1.2004442192448957e-05, "loss": 0.0041, "step": 5573 }, { "epoch": 3.984986595174263, "grad_norm": 0.022249115630984306, "learning_rate": 1.1988216443664101e-05, "loss": 0.0057, "step": 5574 }, { "epoch": 3.9857015192135834, "grad_norm": 0.01766892708837986, "learning_rate": 1.1972000174230452e-05, "loss": 0.004, "step": 5575 }, { "epoch": 3.9857015192135834, "eval_loss": 0.006945286877453327, "eval_runtime": 4.5964, "eval_samples_per_second": 10.878, "eval_steps_per_second": 2.828, "step": 5575 }, { "epoch": 3.9864164432529043, "grad_norm": 0.027132442221045494, "learning_rate": 1.195579338819201e-05, "loss": 0.0083, "step": 5576 }, { "epoch": 3.9871313672922253, "grad_norm": 0.02215275540947914, "learning_rate": 1.1939596089590394e-05, "loss": 0.0056, "step": 5577 }, { "epoch": 3.987846291331546, "grad_norm": 0.025803254917263985, "learning_rate": 1.1923408282464887e-05, "loss": 0.0077, "step": 5578 }, { "epoch": 3.9885612153708667, "grad_norm": 0.03114502876996994, "learning_rate": 1.1907229970852408e-05, "loss": 0.0047, "step": 5579 }, { "epoch": 3.9892761394101877, "grad_norm": 0.01789805479347706, "learning_rate": 1.1891061158787458e-05, "loss": 0.0037, "step": 5580 }, { "epoch": 3.9892761394101877, "eval_loss": 0.006966172717511654, "eval_runtime": 4.6026, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 5580 }, { "epoch": 3.9899910634495086, "grad_norm": 0.02489348314702511, "learning_rate": 1.1874901850302223e-05, 
"loss": 0.0057, "step": 5581 }, { "epoch": 3.9907059874888295, "grad_norm": 0.02170165441930294, "learning_rate": 1.1858752049426513e-05, "loss": 0.0083, "step": 5582 }, { "epoch": 3.99142091152815, "grad_norm": 0.031205125153064728, "learning_rate": 1.1842611760187722e-05, "loss": 0.0088, "step": 5583 }, { "epoch": 3.992135835567471, "grad_norm": 0.025327617302536964, "learning_rate": 1.1826480986610927e-05, "loss": 0.0085, "step": 5584 }, { "epoch": 3.992850759606792, "grad_norm": 0.025488421320915222, "learning_rate": 1.1810359732718796e-05, "loss": 0.0061, "step": 5585 }, { "epoch": 3.992850759606792, "eval_loss": 0.00702078640460968, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5585 }, { "epoch": 3.9935656836461124, "grad_norm": 0.02285645343363285, "learning_rate": 1.1794248002531644e-05, "loss": 0.004, "step": 5586 }, { "epoch": 3.9942806076854334, "grad_norm": 0.023321839049458504, "learning_rate": 1.177814580006742e-05, "loss": 0.0044, "step": 5587 }, { "epoch": 3.9949955317247543, "grad_norm": 0.022186974063515663, "learning_rate": 1.1762053129341643e-05, "loss": 0.0053, "step": 5588 }, { "epoch": 3.9957104557640752, "grad_norm": 0.028796181082725525, "learning_rate": 1.1745969994367523e-05, "loss": 0.0046, "step": 5589 }, { "epoch": 3.9964253798033957, "grad_norm": 0.027347084134817123, "learning_rate": 1.1729896399155831e-05, "loss": 0.0071, "step": 5590 }, { "epoch": 3.9964253798033957, "eval_loss": 0.007033652625977993, "eval_runtime": 4.5823, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5590 }, { "epoch": 3.9971403038427167, "grad_norm": 0.023853430524468422, "learning_rate": 1.171383234771501e-05, "loss": 0.0077, "step": 5591 }, { "epoch": 3.9978552278820376, "grad_norm": 0.029342539608478546, "learning_rate": 1.1697777844051105e-05, "loss": 0.0113, "step": 5592 }, { "epoch": 3.998570151921358, "grad_norm": 0.03491022065281868, "learning_rate": 1.1681732892167756e-05, "loss": 0.0097, "step": 5593 }, { "epoch": 3.999285075960679, "grad_norm": 0.025287551805377007, "learning_rate": 1.1665697496066253e-05, "loss": 0.0058, "step": 5594 }, { "epoch": 4.0, "grad_norm": 0.03098585642874241, "learning_rate": 1.1649671659745503e-05, "loss": 0.0084, "step": 5595 }, { "epoch": 4.0, "eval_loss": 0.007011804264038801, "eval_runtime": 4.6077, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 5595 }, { "epoch": 4.000714924039321, "grad_norm": 0.021513596177101135, "learning_rate": 1.1633655387201996e-05, "loss": 0.004, "step": 5596 }, { "epoch": 4.001429848078642, "grad_norm": 0.020564734935760498, "learning_rate": 1.1617648682429882e-05, "loss": 0.0039, "step": 5597 }, { "epoch": 4.002144772117963, "grad_norm": 0.02258847840130329, "learning_rate": 1.1601651549420873e-05, "loss": 0.0043, "step": 5598 }, { "epoch": 4.002859696157283, "grad_norm": 0.017445910722017288, "learning_rate": 1.1585663992164337e-05, "loss": 0.0041, "step": 5599 }, { "epoch": 4.003574620196604, "grad_norm": 0.024393439292907715, "learning_rate": 1.1569686014647252e-05, "loss": 0.005, "step": 5600 }, { "epoch": 4.003574620196604, "eval_loss": 0.007089006248861551, "eval_runtime": 4.5793, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 5600 }, { "epoch": 4.004289544235925, "grad_norm": 0.025675753131508827, "learning_rate": 1.1553717620854177e-05, "loss": 0.0056, "step": 5601 }, { "epoch": 4.005004468275246, "grad_norm": 0.017011389136314392, "learning_rate": 1.1537758814767297e-05, 
"loss": 0.0028, "step": 5602 }, { "epoch": 4.005719392314567, "grad_norm": 0.0218550693243742, "learning_rate": 1.152180960036643e-05, "loss": 0.0047, "step": 5603 }, { "epoch": 4.006434316353888, "grad_norm": 0.028809161856770515, "learning_rate": 1.1505869981628952e-05, "loss": 0.0049, "step": 5604 }, { "epoch": 4.0071492403932085, "grad_norm": 0.020324230194091797, "learning_rate": 1.1489939962529882e-05, "loss": 0.0026, "step": 5605 }, { "epoch": 4.0071492403932085, "eval_loss": 0.00728617375716567, "eval_runtime": 4.6182, "eval_samples_per_second": 10.827, "eval_steps_per_second": 2.815, "step": 5605 }, { "epoch": 4.0078641644325295, "grad_norm": 0.02313477173447609, "learning_rate": 1.1474019547041847e-05, "loss": 0.004, "step": 5606 }, { "epoch": 4.0085790884718495, "grad_norm": 0.020579099655151367, "learning_rate": 1.145810873913506e-05, "loss": 0.0035, "step": 5607 }, { "epoch": 4.0092940125111705, "grad_norm": 0.01816493645310402, "learning_rate": 1.1442207542777362e-05, "loss": 0.0029, "step": 5608 }, { "epoch": 4.010008936550491, "grad_norm": 0.02274462766945362, "learning_rate": 1.1426315961934158e-05, "loss": 0.0025, "step": 5609 }, { "epoch": 4.010723860589812, "grad_norm": 0.03340642899274826, "learning_rate": 1.1410434000568488e-05, "loss": 0.0037, "step": 5610 }, { "epoch": 4.010723860589812, "eval_loss": 0.0075758169405162334, "eval_runtime": 4.5933, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 5610 }, { "epoch": 4.011438784629133, "grad_norm": 0.026913031935691833, "learning_rate": 1.1394561662641e-05, "loss": 0.0026, "step": 5611 }, { "epoch": 4.012153708668454, "grad_norm": 0.023361969739198685, "learning_rate": 1.1378698952109901e-05, "loss": 0.0026, "step": 5612 }, { "epoch": 4.012868632707775, "grad_norm": 0.032681189477443695, "learning_rate": 1.1362845872931044e-05, "loss": 0.0047, "step": 5613 }, { "epoch": 4.013583556747095, "grad_norm": 0.02471497282385826, "learning_rate": 1.1347002429057835e-05, "loss": 0.0048, "step": 5614 }, { "epoch": 4.014298480786416, "grad_norm": 0.028219902887940407, "learning_rate": 1.1331168624441318e-05, "loss": 0.0045, "step": 5615 }, { "epoch": 4.014298480786416, "eval_loss": 0.0077487933449447155, "eval_runtime": 4.6161, "eval_samples_per_second": 10.832, "eval_steps_per_second": 2.816, "step": 5615 }, { "epoch": 4.015013404825737, "grad_norm": 0.040007758885622025, "learning_rate": 1.131534446303012e-05, "loss": 0.0043, "step": 5616 }, { "epoch": 4.015728328865058, "grad_norm": 0.025626441463828087, "learning_rate": 1.1299529948770442e-05, "loss": 0.0032, "step": 5617 }, { "epoch": 4.016443252904379, "grad_norm": 0.026284346356987953, "learning_rate": 1.12837250856061e-05, "loss": 0.0031, "step": 5618 }, { "epoch": 4.0171581769437, "grad_norm": 0.03969390317797661, "learning_rate": 1.1267929877478522e-05, "loss": 0.0047, "step": 5619 }, { "epoch": 4.017873100983021, "grad_norm": 0.026464996859431267, "learning_rate": 1.1252144328326674e-05, "loss": 0.0039, "step": 5620 }, { "epoch": 4.017873100983021, "eval_loss": 0.00762525200843811, "eval_runtime": 4.5773, "eval_samples_per_second": 10.924, "eval_steps_per_second": 2.84, "step": 5620 }, { "epoch": 4.018588025022342, "grad_norm": 0.032924436032772064, "learning_rate": 1.123636844208717e-05, "loss": 0.0027, "step": 5621 }, { "epoch": 4.019302949061662, "grad_norm": 0.030993953347206116, "learning_rate": 1.1220602222694165e-05, "loss": 0.004, "step": 5622 }, { "epoch": 4.020017873100983, "grad_norm": 0.03161732107400894, "learning_rate": 
1.1204845674079439e-05, "loss": 0.0037, "step": 5623 }, { "epoch": 4.020732797140304, "grad_norm": 0.026196608319878578, "learning_rate": 1.1189098800172364e-05, "loss": 0.0054, "step": 5624 }, { "epoch": 4.021447721179625, "grad_norm": 0.02717943675816059, "learning_rate": 1.1173361604899857e-05, "loss": 0.004, "step": 5625 }, { "epoch": 4.021447721179625, "eval_loss": 0.007204261142760515, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 5625 }, { "epoch": 4.022162645218946, "grad_norm": 0.02425418049097061, "learning_rate": 1.1157634092186464e-05, "loss": 0.004, "step": 5626 }, { "epoch": 4.022877569258267, "grad_norm": 0.025042816996574402, "learning_rate": 1.1141916265954311e-05, "loss": 0.0031, "step": 5627 }, { "epoch": 4.0235924932975875, "grad_norm": 0.026418110355734825, "learning_rate": 1.1126208130123055e-05, "loss": 0.0043, "step": 5628 }, { "epoch": 4.0243074173369076, "grad_norm": 0.018652692437171936, "learning_rate": 1.1110509688610038e-05, "loss": 0.0028, "step": 5629 }, { "epoch": 4.0250223413762285, "grad_norm": 0.029465915635228157, "learning_rate": 1.1094820945330086e-05, "loss": 0.0059, "step": 5630 }, { "epoch": 4.0250223413762285, "eval_loss": 0.007124136667698622, "eval_runtime": 4.5886, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 5630 }, { "epoch": 4.025737265415549, "grad_norm": 0.01905275695025921, "learning_rate": 1.1079141904195661e-05, "loss": 0.0026, "step": 5631 }, { "epoch": 4.02645218945487, "grad_norm": 0.026319419965147972, "learning_rate": 1.10634725691168e-05, "loss": 0.0035, "step": 5632 }, { "epoch": 4.027167113494191, "grad_norm": 0.023438142612576485, "learning_rate": 1.1047812944001085e-05, "loss": 0.0058, "step": 5633 }, { "epoch": 4.027882037533512, "grad_norm": 0.016598127782344818, "learning_rate": 1.1032163032753717e-05, "loss": 0.002, "step": 5634 }, { "epoch": 4.028596961572833, "grad_norm": 0.022202854976058006, "learning_rate": 1.1016522839277472e-05, "loss": 0.0033, "step": 5635 }, { "epoch": 4.028596961572833, "eval_loss": 0.007122484501451254, "eval_runtime": 4.5865, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 5635 }, { "epoch": 4.029311885612154, "grad_norm": 0.021915268152952194, "learning_rate": 1.1000892367472659e-05, "loss": 0.0029, "step": 5636 }, { "epoch": 4.030026809651474, "grad_norm": 0.027741778641939163, "learning_rate": 1.098527162123723e-05, "loss": 0.0049, "step": 5637 }, { "epoch": 4.030741733690795, "grad_norm": 0.024271460250020027, "learning_rate": 1.0969660604466647e-05, "loss": 0.0027, "step": 5638 }, { "epoch": 4.031456657730116, "grad_norm": 0.024256069213151932, "learning_rate": 1.0954059321053978e-05, "loss": 0.0041, "step": 5639 }, { "epoch": 4.032171581769437, "grad_norm": 0.023630043491721153, "learning_rate": 1.0938467774889882e-05, "loss": 0.003, "step": 5640 }, { "epoch": 4.032171581769437, "eval_loss": 0.007192219607532024, "eval_runtime": 4.6367, "eval_samples_per_second": 10.784, "eval_steps_per_second": 2.804, "step": 5640 }, { "epoch": 4.032886505808758, "grad_norm": 0.026676736772060394, "learning_rate": 1.0922885969862539e-05, "loss": 0.0052, "step": 5641 }, { "epoch": 4.033601429848079, "grad_norm": 0.023807266727089882, "learning_rate": 1.0907313909857735e-05, "loss": 0.0029, "step": 5642 }, { "epoch": 4.0343163538874, "grad_norm": 0.018728796392679214, "learning_rate": 1.0891751598758848e-05, "loss": 0.0026, "step": 5643 }, { "epoch": 4.03503127792672, "grad_norm": 
0.027357256039977074, "learning_rate": 1.0876199040446755e-05, "loss": 0.0036, "step": 5644 }, { "epoch": 4.035746201966041, "grad_norm": 0.02353154867887497, "learning_rate": 1.0860656238799971e-05, "loss": 0.0026, "step": 5645 }, { "epoch": 4.035746201966041, "eval_loss": 0.007227973081171513, "eval_runtime": 4.5802, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5645 }, { "epoch": 4.036461126005362, "grad_norm": 0.026258369907736778, "learning_rate": 1.0845123197694529e-05, "loss": 0.0027, "step": 5646 }, { "epoch": 4.037176050044683, "grad_norm": 0.022651685401797295, "learning_rate": 1.0829599921004053e-05, "loss": 0.0023, "step": 5647 }, { "epoch": 4.037890974084004, "grad_norm": 0.03211458772420883, "learning_rate": 1.0814086412599739e-05, "loss": 0.0035, "step": 5648 }, { "epoch": 4.038605898123325, "grad_norm": 0.022944306954741478, "learning_rate": 1.0798582676350316e-05, "loss": 0.0025, "step": 5649 }, { "epoch": 4.0393208221626455, "grad_norm": 0.018029753118753433, "learning_rate": 1.0783088716122103e-05, "loss": 0.0024, "step": 5650 }, { "epoch": 4.0393208221626455, "eval_loss": 0.00722627155482769, "eval_runtime": 4.5771, "eval_samples_per_second": 10.924, "eval_steps_per_second": 2.84, "step": 5650 }, { "epoch": 4.040035746201966, "grad_norm": 0.024673722684383392, "learning_rate": 1.0767604535778974e-05, "loss": 0.0027, "step": 5651 }, { "epoch": 4.0407506702412865, "grad_norm": 0.027404729276895523, "learning_rate": 1.0752130139182365e-05, "loss": 0.004, "step": 5652 }, { "epoch": 4.0414655942806075, "grad_norm": 0.02204035595059395, "learning_rate": 1.0736665530191275e-05, "loss": 0.0028, "step": 5653 }, { "epoch": 4.042180518319928, "grad_norm": 0.02820977382361889, "learning_rate": 1.0721210712662238e-05, "loss": 0.004, "step": 5654 }, { "epoch": 4.042895442359249, "grad_norm": 0.02381950244307518, "learning_rate": 1.0705765690449377e-05, "loss": 0.004, "step": 5655 }, { "epoch": 4.042895442359249, "eval_loss": 0.007253393996506929, "eval_runtime": 4.5799, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5655 }, { "epoch": 4.04361036639857, "grad_norm": 0.027949776500463486, "learning_rate": 1.0690330467404375e-05, "loss": 0.0033, "step": 5656 }, { "epoch": 4.044325290437891, "grad_norm": 0.02724815346300602, "learning_rate": 1.0674905047376422e-05, "loss": 0.0039, "step": 5657 }, { "epoch": 4.045040214477212, "grad_norm": 0.025521552190184593, "learning_rate": 1.0659489434212321e-05, "loss": 0.0053, "step": 5658 }, { "epoch": 4.045755138516532, "grad_norm": 0.0326373465359211, "learning_rate": 1.0644083631756419e-05, "loss": 0.0057, "step": 5659 }, { "epoch": 4.046470062555853, "grad_norm": 0.02106441929936409, "learning_rate": 1.0628687643850572e-05, "loss": 0.0022, "step": 5660 }, { "epoch": 4.046470062555853, "eval_loss": 0.0072518447414040565, "eval_runtime": 4.5861, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 5660 }, { "epoch": 4.047184986595174, "grad_norm": 0.02602100931107998, "learning_rate": 1.0613301474334252e-05, "loss": 0.0038, "step": 5661 }, { "epoch": 4.047899910634495, "grad_norm": 0.026384567841887474, "learning_rate": 1.0597925127044423e-05, "loss": 0.0038, "step": 5662 }, { "epoch": 4.048614834673816, "grad_norm": 0.022959405556321144, "learning_rate": 1.0582558605815634e-05, "loss": 0.0039, "step": 5663 }, { "epoch": 4.049329758713137, "grad_norm": 0.023379968479275703, "learning_rate": 1.0567201914480001e-05, "loss": 0.0039, "step": 5664 }, { "epoch": 
4.050044682752458, "grad_norm": 0.026763716712594032, "learning_rate": 1.0551855056867132e-05, "loss": 0.0051, "step": 5665 }, { "epoch": 4.050044682752458, "eval_loss": 0.007167786825448275, "eval_runtime": 4.5815, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5665 }, { "epoch": 4.050759606791778, "grad_norm": 0.0210683885961771, "learning_rate": 1.0536518036804228e-05, "loss": 0.0032, "step": 5666 }, { "epoch": 4.051474530831099, "grad_norm": 0.024014364928007126, "learning_rate": 1.0521190858116042e-05, "loss": 0.0052, "step": 5667 }, { "epoch": 4.05218945487042, "grad_norm": 0.025081023573875427, "learning_rate": 1.0505873524624821e-05, "loss": 0.0038, "step": 5668 }, { "epoch": 4.052904378909741, "grad_norm": 0.027594417333602905, "learning_rate": 1.0490566040150428e-05, "loss": 0.006, "step": 5669 }, { "epoch": 4.053619302949062, "grad_norm": 0.018979331478476524, "learning_rate": 1.0475268408510191e-05, "loss": 0.0027, "step": 5670 }, { "epoch": 4.053619302949062, "eval_loss": 0.0071611772291362286, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 5670 }, { "epoch": 4.054334226988383, "grad_norm": 0.026477094739675522, "learning_rate": 1.045998063351905e-05, "loss": 0.0038, "step": 5671 }, { "epoch": 4.055049151027704, "grad_norm": 0.025305205956101418, "learning_rate": 1.0444702718989452e-05, "loss": 0.0045, "step": 5672 }, { "epoch": 4.0557640750670245, "grad_norm": 0.025293711572885513, "learning_rate": 1.0429434668731392e-05, "loss": 0.0026, "step": 5673 }, { "epoch": 4.056478999106345, "grad_norm": 0.031525734812021255, "learning_rate": 1.0414176486552424e-05, "loss": 0.0043, "step": 5674 }, { "epoch": 4.0571939231456655, "grad_norm": 0.025058958679437637, "learning_rate": 1.0398928176257588e-05, "loss": 0.003, "step": 5675 }, { "epoch": 4.0571939231456655, "eval_loss": 0.007209126837551594, "eval_runtime": 4.5954, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 5675 }, { "epoch": 4.0579088471849865, "grad_norm": 0.03353450819849968, "learning_rate": 1.0383689741649516e-05, "loss": 0.0057, "step": 5676 }, { "epoch": 4.058623771224307, "grad_norm": 0.04219072684645653, "learning_rate": 1.0368461186528367e-05, "loss": 0.0036, "step": 5677 }, { "epoch": 4.059338695263628, "grad_norm": 0.031035741791129112, "learning_rate": 1.0353242514691807e-05, "loss": 0.0043, "step": 5678 }, { "epoch": 4.060053619302949, "grad_norm": 0.033384937793016434, "learning_rate": 1.0338033729935066e-05, "loss": 0.0027, "step": 5679 }, { "epoch": 4.06076854334227, "grad_norm": 0.024883799254894257, "learning_rate": 1.032283483605091e-05, "loss": 0.0044, "step": 5680 }, { "epoch": 4.06076854334227, "eval_loss": 0.007115321699529886, "eval_runtime": 4.5808, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 5680 }, { "epoch": 4.06148346738159, "grad_norm": 0.0228736512362957, "learning_rate": 1.0307645836829604e-05, "loss": 0.0039, "step": 5681 }, { "epoch": 4.062198391420911, "grad_norm": 0.025121403858065605, "learning_rate": 1.0292466736058987e-05, "loss": 0.0044, "step": 5682 }, { "epoch": 4.062913315460232, "grad_norm": 0.05234624445438385, "learning_rate": 1.0277297537524422e-05, "loss": 0.0032, "step": 5683 }, { "epoch": 4.063628239499553, "grad_norm": 0.027606666088104248, "learning_rate": 1.0262138245008768e-05, "loss": 0.0034, "step": 5684 }, { "epoch": 4.064343163538874, "grad_norm": 0.026684679090976715, "learning_rate": 1.0246988862292462e-05, "loss": 0.0049, 
"step": 5685 }, { "epoch": 4.064343163538874, "eval_loss": 0.007172818761318922, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5685 }, { "epoch": 4.065058087578195, "grad_norm": 0.026772761717438698, "learning_rate": 1.023184939315342e-05, "loss": 0.0035, "step": 5686 }, { "epoch": 4.065773011617516, "grad_norm": 0.03373505175113678, "learning_rate": 1.0216719841367129e-05, "loss": 0.0041, "step": 5687 }, { "epoch": 4.066487935656837, "grad_norm": 0.02443569526076317, "learning_rate": 1.0201600210706597e-05, "loss": 0.0028, "step": 5688 }, { "epoch": 4.067202859696157, "grad_norm": 0.032465774565935135, "learning_rate": 1.0186490504942319e-05, "loss": 0.005, "step": 5689 }, { "epoch": 4.067917783735478, "grad_norm": 0.02645975537598133, "learning_rate": 1.0171390727842355e-05, "loss": 0.0046, "step": 5690 }, { "epoch": 4.067917783735478, "eval_loss": 0.007238938473165035, "eval_runtime": 4.5895, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.833, "step": 5690 }, { "epoch": 4.068632707774799, "grad_norm": 0.02953052893280983, "learning_rate": 1.0156300883172292e-05, "loss": 0.0056, "step": 5691 }, { "epoch": 4.06934763181412, "grad_norm": 0.027937786653637886, "learning_rate": 1.0141220974695199e-05, "loss": 0.0042, "step": 5692 }, { "epoch": 4.070062555853441, "grad_norm": 0.02557063102722168, "learning_rate": 1.012615100617172e-05, "loss": 0.0034, "step": 5693 }, { "epoch": 4.070777479892762, "grad_norm": 0.019616955891251564, "learning_rate": 1.011109098135996e-05, "loss": 0.0028, "step": 5694 }, { "epoch": 4.071492403932083, "grad_norm": 0.023841215297579765, "learning_rate": 1.009604090401558e-05, "loss": 0.0052, "step": 5695 }, { "epoch": 4.071492403932083, "eval_loss": 0.007191431242972612, "eval_runtime": 4.6078, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 5695 }, { "epoch": 4.072207327971403, "grad_norm": 0.02492366172373295, "learning_rate": 1.0081000777891803e-05, "loss": 0.0044, "step": 5696 }, { "epoch": 4.0729222520107236, "grad_norm": 0.025506841018795967, "learning_rate": 1.0065970606739273e-05, "loss": 0.0056, "step": 5697 }, { "epoch": 4.0736371760500445, "grad_norm": 0.02999906800687313, "learning_rate": 1.0050950394306242e-05, "loss": 0.0053, "step": 5698 }, { "epoch": 4.074352100089365, "grad_norm": 0.027629468590021133, "learning_rate": 1.0035940144338406e-05, "loss": 0.0048, "step": 5699 }, { "epoch": 4.075067024128686, "grad_norm": 0.023865316063165665, "learning_rate": 1.0020939860579031e-05, "loss": 0.0028, "step": 5700 }, { "epoch": 4.075067024128686, "eval_loss": 0.007248422130942345, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5700 }, { "epoch": 4.075781948168007, "grad_norm": 0.02714594453573227, "learning_rate": 1.0005949546768877e-05, "loss": 0.004, "step": 5701 }, { "epoch": 4.076496872207328, "grad_norm": 0.02220909483730793, "learning_rate": 9.990969206646207e-06, "loss": 0.0026, "step": 5702 }, { "epoch": 4.077211796246649, "grad_norm": 0.026381943374872208, "learning_rate": 9.97599884394681e-06, "loss": 0.0027, "step": 5703 }, { "epoch": 4.077926720285969, "grad_norm": 0.024102458730340004, "learning_rate": 9.961038462403999e-06, "loss": 0.0038, "step": 5704 }, { "epoch": 4.07864164432529, "grad_norm": 0.03562544658780098, "learning_rate": 9.94608806574856e-06, "loss": 0.0084, "step": 5705 }, { "epoch": 4.07864164432529, "eval_loss": 0.007295980118215084, "eval_runtime": 4.6053, 
"eval_samples_per_second": 10.857, "eval_steps_per_second": 2.823, "step": 5705 }, { "epoch": 4.079356568364611, "grad_norm": 0.02215360663831234, "learning_rate": 9.931147657708823e-06, "loss": 0.003, "step": 5706 }, { "epoch": 4.080071492403932, "grad_norm": 0.026431234553456306, "learning_rate": 9.916217242010633e-06, "loss": 0.004, "step": 5707 }, { "epoch": 4.080786416443253, "grad_norm": 0.02872699685394764, "learning_rate": 9.901296822377292e-06, "loss": 0.0029, "step": 5708 }, { "epoch": 4.081501340482574, "grad_norm": 0.03156553581357002, "learning_rate": 9.88638640252968e-06, "loss": 0.0045, "step": 5709 }, { "epoch": 4.082216264521895, "grad_norm": 0.02341168187558651, "learning_rate": 9.871485986186113e-06, "loss": 0.0033, "step": 5710 }, { "epoch": 4.082216264521895, "eval_loss": 0.007272055372595787, "eval_runtime": 4.5925, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 5710 }, { "epoch": 4.082931188561215, "grad_norm": 0.025683538988232613, "learning_rate": 9.856595577062455e-06, "loss": 0.0026, "step": 5711 }, { "epoch": 4.083646112600536, "grad_norm": 0.031113725155591965, "learning_rate": 9.841715178872092e-06, "loss": 0.0059, "step": 5712 }, { "epoch": 4.084361036639857, "grad_norm": 0.027512094005942345, "learning_rate": 9.82684479532585e-06, "loss": 0.0046, "step": 5713 }, { "epoch": 4.085075960679178, "grad_norm": 0.03124389983713627, "learning_rate": 9.811984430132116e-06, "loss": 0.004, "step": 5714 }, { "epoch": 4.085790884718499, "grad_norm": 0.03010413981974125, "learning_rate": 9.797134086996761e-06, "loss": 0.0035, "step": 5715 }, { "epoch": 4.085790884718499, "eval_loss": 0.007207739632576704, "eval_runtime": 4.5845, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 5715 }, { "epoch": 4.08650580875782, "grad_norm": 0.030139001086354256, "learning_rate": 9.782293769623135e-06, "loss": 0.0045, "step": 5716 }, { "epoch": 4.087220732797141, "grad_norm": 0.026432406157255173, "learning_rate": 9.767463481712114e-06, "loss": 0.0035, "step": 5717 }, { "epoch": 4.0879356568364615, "grad_norm": 0.0237946305423975, "learning_rate": 9.752643226962066e-06, "loss": 0.0024, "step": 5718 }, { "epoch": 4.088650580875782, "grad_norm": 0.025189142674207687, "learning_rate": 9.737833009068859e-06, "loss": 0.0038, "step": 5719 }, { "epoch": 4.0893655049151025, "grad_norm": 0.02324841357767582, "learning_rate": 9.723032831725858e-06, "loss": 0.0028, "step": 5720 }, { "epoch": 4.0893655049151025, "eval_loss": 0.0071876635774970055, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5720 }, { "epoch": 4.0900804289544235, "grad_norm": 0.025244703516364098, "learning_rate": 9.708242698623898e-06, "loss": 0.0033, "step": 5721 }, { "epoch": 4.090795352993744, "grad_norm": 0.03427784889936447, "learning_rate": 9.693462613451365e-06, "loss": 0.0045, "step": 5722 }, { "epoch": 4.091510277033065, "grad_norm": 0.02309427224099636, "learning_rate": 9.678692579894073e-06, "loss": 0.0045, "step": 5723 }, { "epoch": 4.092225201072386, "grad_norm": 0.025298181921243668, "learning_rate": 9.663932601635378e-06, "loss": 0.0037, "step": 5724 }, { "epoch": 4.092940125111707, "grad_norm": 0.02680467627942562, "learning_rate": 9.649182682356122e-06, "loss": 0.0031, "step": 5725 }, { "epoch": 4.092940125111707, "eval_loss": 0.007182059343904257, "eval_runtime": 4.5815, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5725 }, { "epoch": 4.093655049151027, "grad_norm": 
0.01834794692695141, "learning_rate": 9.634442825734607e-06, "loss": 0.0024, "step": 5726 }, { "epoch": 4.094369973190348, "grad_norm": 0.021565185859799385, "learning_rate": 9.619713035446665e-06, "loss": 0.0038, "step": 5727 }, { "epoch": 4.095084897229669, "grad_norm": 0.024540960788726807, "learning_rate": 9.604993315165606e-06, "loss": 0.0027, "step": 5728 }, { "epoch": 4.09579982126899, "grad_norm": 0.025328408926725388, "learning_rate": 9.590283668562194e-06, "loss": 0.0032, "step": 5729 }, { "epoch": 4.096514745308311, "grad_norm": 0.03293231502175331, "learning_rate": 9.575584099304735e-06, "loss": 0.0053, "step": 5730 }, { "epoch": 4.096514745308311, "eval_loss": 0.007189049385488033, "eval_runtime": 4.5823, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 5730 }, { "epoch": 4.097229669347632, "grad_norm": 0.022052381187677383, "learning_rate": 9.560894611059001e-06, "loss": 0.0028, "step": 5731 }, { "epoch": 4.097944593386953, "grad_norm": 0.02516588754951954, "learning_rate": 9.546215207488225e-06, "loss": 0.0039, "step": 5732 }, { "epoch": 4.098659517426274, "grad_norm": 0.02362586371600628, "learning_rate": 9.531545892253168e-06, "loss": 0.0051, "step": 5733 }, { "epoch": 4.099374441465594, "grad_norm": 0.02686757594347, "learning_rate": 9.516886669012031e-06, "loss": 0.0052, "step": 5734 }, { "epoch": 4.100089365504915, "grad_norm": 0.025101160630583763, "learning_rate": 9.502237541420534e-06, "loss": 0.0045, "step": 5735 }, { "epoch": 4.100089365504915, "eval_loss": 0.007320282515138388, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5735 }, { "epoch": 4.100804289544236, "grad_norm": 0.025677388533949852, "learning_rate": 9.487598513131868e-06, "loss": 0.0027, "step": 5736 }, { "epoch": 4.101519213583557, "grad_norm": 0.02389640174806118, "learning_rate": 9.472969587796693e-06, "loss": 0.0044, "step": 5737 }, { "epoch": 4.102234137622878, "grad_norm": 0.02761891484260559, "learning_rate": 9.458350769063162e-06, "loss": 0.0048, "step": 5738 }, { "epoch": 4.102949061662199, "grad_norm": 0.026027366518974304, "learning_rate": 9.443742060576915e-06, "loss": 0.0023, "step": 5739 }, { "epoch": 4.10366398570152, "grad_norm": 0.029619654640555382, "learning_rate": 9.42914346598105e-06, "loss": 0.0046, "step": 5740 }, { "epoch": 4.10366398570152, "eval_loss": 0.007343692239373922, "eval_runtime": 4.5969, "eval_samples_per_second": 10.877, "eval_steps_per_second": 2.828, "step": 5740 }, { "epoch": 4.10437890974084, "grad_norm": 0.03102249838411808, "learning_rate": 9.414554988916174e-06, "loss": 0.0059, "step": 5741 }, { "epoch": 4.105093833780161, "grad_norm": 0.030504174530506134, "learning_rate": 9.399976633020325e-06, "loss": 0.0049, "step": 5742 }, { "epoch": 4.1058087578194815, "grad_norm": 0.024165408685803413, "learning_rate": 9.385408401929051e-06, "loss": 0.0028, "step": 5743 }, { "epoch": 4.1065236818588025, "grad_norm": 0.026456981897354126, "learning_rate": 9.370850299275385e-06, "loss": 0.0027, "step": 5744 }, { "epoch": 4.107238605898123, "grad_norm": 0.021844318136572838, "learning_rate": 9.356302328689786e-06, "loss": 0.0021, "step": 5745 }, { "epoch": 4.107238605898123, "eval_loss": 0.007356844376772642, "eval_runtime": 4.5813, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5745 }, { "epoch": 4.107953529937444, "grad_norm": 0.029454749077558517, "learning_rate": 9.341764493800248e-06, "loss": 0.0029, "step": 5746 }, { "epoch": 4.108668453976765, "grad_norm": 
0.026344578713178635, "learning_rate": 9.327236798232175e-06, "loss": 0.0034, "step": 5747 }, { "epoch": 4.109383378016086, "grad_norm": 0.022972896695137024, "learning_rate": 9.312719245608487e-06, "loss": 0.0029, "step": 5748 }, { "epoch": 4.110098302055406, "grad_norm": 0.027782244607806206, "learning_rate": 9.298211839549576e-06, "loss": 0.0025, "step": 5749 }, { "epoch": 4.110813226094727, "grad_norm": 0.030145172029733658, "learning_rate": 9.283714583673264e-06, "loss": 0.0025, "step": 5750 }, { "epoch": 4.110813226094727, "eval_loss": 0.007367836311459541, "eval_runtime": 4.5812, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 5750 }, { "epoch": 4.111528150134048, "grad_norm": 0.021448079496622086, "learning_rate": 9.269227481594871e-06, "loss": 0.0025, "step": 5751 }, { "epoch": 4.112243074173369, "grad_norm": 0.01973397471010685, "learning_rate": 9.254750536927204e-06, "loss": 0.0022, "step": 5752 }, { "epoch": 4.11295799821269, "grad_norm": 0.01653939113020897, "learning_rate": 9.240283753280476e-06, "loss": 0.0016, "step": 5753 }, { "epoch": 4.113672922252011, "grad_norm": 0.025126492604613304, "learning_rate": 9.225827134262421e-06, "loss": 0.0038, "step": 5754 }, { "epoch": 4.114387846291332, "grad_norm": 0.02561149373650551, "learning_rate": 9.211380683478237e-06, "loss": 0.0028, "step": 5755 }, { "epoch": 4.114387846291332, "eval_loss": 0.007499501574784517, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 5755 }, { "epoch": 4.115102770330652, "grad_norm": 0.02761317789554596, "learning_rate": 9.196944404530544e-06, "loss": 0.005, "step": 5756 }, { "epoch": 4.115817694369973, "grad_norm": 0.025595227256417274, "learning_rate": 9.182518301019466e-06, "loss": 0.0038, "step": 5757 }, { "epoch": 4.116532618409294, "grad_norm": 0.02487025409936905, "learning_rate": 9.168102376542559e-06, "loss": 0.0025, "step": 5758 }, { "epoch": 4.117247542448615, "grad_norm": 0.02166397124528885, "learning_rate": 9.153696634694863e-06, "loss": 0.0025, "step": 5759 }, { "epoch": 4.117962466487936, "grad_norm": 0.0350010059773922, "learning_rate": 9.139301079068891e-06, "loss": 0.0048, "step": 5760 }, { "epoch": 4.117962466487936, "eval_loss": 0.00738687114790082, "eval_runtime": 4.5921, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 5760 }, { "epoch": 4.118677390527257, "grad_norm": 0.028145775198936462, "learning_rate": 9.12491571325455e-06, "loss": 0.0046, "step": 5761 }, { "epoch": 4.119392314566578, "grad_norm": 0.032979968935251236, "learning_rate": 9.110540540839307e-06, "loss": 0.0061, "step": 5762 }, { "epoch": 4.120107238605899, "grad_norm": 0.020730387419462204, "learning_rate": 9.096175565407993e-06, "loss": 0.0024, "step": 5763 }, { "epoch": 4.120822162645219, "grad_norm": 0.021138792857527733, "learning_rate": 9.08182079054295e-06, "loss": 0.0033, "step": 5764 }, { "epoch": 4.1215370866845396, "grad_norm": 0.02482297271490097, "learning_rate": 9.067476219823973e-06, "loss": 0.0025, "step": 5765 }, { "epoch": 4.1215370866845396, "eval_loss": 0.007374881766736507, "eval_runtime": 4.6167, "eval_samples_per_second": 10.83, "eval_steps_per_second": 2.816, "step": 5765 }, { "epoch": 4.1222520107238605, "grad_norm": 0.03490757197141647, "learning_rate": 9.053141856828274e-06, "loss": 0.0038, "step": 5766 }, { "epoch": 4.122966934763181, "grad_norm": 0.026408223435282707, "learning_rate": 9.038817705130558e-06, "loss": 0.0047, "step": 5767 }, { "epoch": 4.123681858802502, "grad_norm": 
0.023846590891480446, "learning_rate": 9.02450376830299e-06, "loss": 0.0025, "step": 5768 }, { "epoch": 4.124396782841823, "grad_norm": 0.0247403122484684, "learning_rate": 9.01020004991514e-06, "loss": 0.0021, "step": 5769 }, { "epoch": 4.125111706881144, "grad_norm": 0.02767360769212246, "learning_rate": 8.995906553534084e-06, "loss": 0.0029, "step": 5770 }, { "epoch": 4.125111706881144, "eval_loss": 0.0073272595182061195, "eval_runtime": 4.6135, "eval_samples_per_second": 10.838, "eval_steps_per_second": 2.818, "step": 5770 }, { "epoch": 4.125826630920464, "grad_norm": 0.023630741983652115, "learning_rate": 8.981623282724294e-06, "loss": 0.0027, "step": 5771 }, { "epoch": 4.126541554959785, "grad_norm": 0.030174467712640762, "learning_rate": 8.967350241047745e-06, "loss": 0.0046, "step": 5772 }, { "epoch": 4.127256478999106, "grad_norm": 0.025021709501743317, "learning_rate": 8.95308743206384e-06, "loss": 0.004, "step": 5773 }, { "epoch": 4.127971403038427, "grad_norm": 0.023294249549508095, "learning_rate": 8.938834859329414e-06, "loss": 0.0024, "step": 5774 }, { "epoch": 4.128686327077748, "grad_norm": 0.029395198449492455, "learning_rate": 8.924592526398761e-06, "loss": 0.0026, "step": 5775 }, { "epoch": 4.128686327077748, "eval_loss": 0.007295240648090839, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5775 }, { "epoch": 4.129401251117069, "grad_norm": 0.037597302347421646, "learning_rate": 8.91036043682365e-06, "loss": 0.0022, "step": 5776 }, { "epoch": 4.13011617515639, "grad_norm": 0.027942508459091187, "learning_rate": 8.896138594153241e-06, "loss": 0.0039, "step": 5777 }, { "epoch": 4.13083109919571, "grad_norm": 0.03572756424546242, "learning_rate": 8.881927001934175e-06, "loss": 0.0036, "step": 5778 }, { "epoch": 4.131546023235031, "grad_norm": 0.03775570169091225, "learning_rate": 8.867725663710547e-06, "loss": 0.0059, "step": 5779 }, { "epoch": 4.132260947274352, "grad_norm": 0.025153370574116707, "learning_rate": 8.853534583023842e-06, "loss": 0.0052, "step": 5780 }, { "epoch": 4.132260947274352, "eval_loss": 0.007243621163070202, "eval_runtime": 4.6082, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 5780 }, { "epoch": 4.132975871313673, "grad_norm": 0.029983168467879295, "learning_rate": 8.839353763413055e-06, "loss": 0.0036, "step": 5781 }, { "epoch": 4.133690795352994, "grad_norm": 0.023730982095003128, "learning_rate": 8.82518320841456e-06, "loss": 0.0023, "step": 5782 }, { "epoch": 4.134405719392315, "grad_norm": 0.02511592023074627, "learning_rate": 8.811022921562217e-06, "loss": 0.0031, "step": 5783 }, { "epoch": 4.135120643431636, "grad_norm": 0.024520166218280792, "learning_rate": 8.7968729063873e-06, "loss": 0.0027, "step": 5784 }, { "epoch": 4.135835567470957, "grad_norm": 0.02648898772895336, "learning_rate": 8.782733166418538e-06, "loss": 0.0023, "step": 5785 }, { "epoch": 4.135835567470957, "eval_loss": 0.007288254797458649, "eval_runtime": 4.599, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 5785 }, { "epoch": 4.136550491510277, "grad_norm": 0.023752566426992416, "learning_rate": 8.768603705182094e-06, "loss": 0.0028, "step": 5786 }, { "epoch": 4.137265415549598, "grad_norm": 0.025440867990255356, "learning_rate": 8.754484526201544e-06, "loss": 0.0041, "step": 5787 }, { "epoch": 4.1379803395889185, "grad_norm": 0.02495599538087845, "learning_rate": 8.740375632997927e-06, "loss": 0.0038, "step": 5788 }, { "epoch": 4.1386952636282395, "grad_norm": 
0.03202643617987633, "learning_rate": 8.726277029089724e-06, "loss": 0.0058, "step": 5789 }, { "epoch": 4.13941018766756, "grad_norm": 0.02889202907681465, "learning_rate": 8.712188717992814e-06, "loss": 0.0041, "step": 5790 }, { "epoch": 4.13941018766756, "eval_loss": 0.007311760913580656, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 5790 }, { "epoch": 4.140125111706881, "grad_norm": 0.026956316083669662, "learning_rate": 8.69811070322053e-06, "loss": 0.0038, "step": 5791 }, { "epoch": 4.140840035746202, "grad_norm": 0.036234300583601, "learning_rate": 8.684042988283658e-06, "loss": 0.0067, "step": 5792 }, { "epoch": 4.141554959785523, "grad_norm": 0.026273151859641075, "learning_rate": 8.66998557669037e-06, "loss": 0.0036, "step": 5793 }, { "epoch": 4.142269883824843, "grad_norm": 0.027509793639183044, "learning_rate": 8.655938471946313e-06, "loss": 0.0041, "step": 5794 }, { "epoch": 4.142984807864164, "grad_norm": 0.027189642190933228, "learning_rate": 8.641901677554526e-06, "loss": 0.0037, "step": 5795 }, { "epoch": 4.142984807864164, "eval_loss": 0.007323262747377157, "eval_runtime": 4.579, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 5795 }, { "epoch": 4.143699731903485, "grad_norm": 0.028987670317292213, "learning_rate": 8.6278751970155e-06, "loss": 0.003, "step": 5796 }, { "epoch": 4.144414655942806, "grad_norm": 0.02472180314362049, "learning_rate": 8.613859033827166e-06, "loss": 0.0026, "step": 5797 }, { "epoch": 4.145129579982127, "grad_norm": 0.021620361134409904, "learning_rate": 8.59985319148484e-06, "loss": 0.0032, "step": 5798 }, { "epoch": 4.145844504021448, "grad_norm": 0.033127304166555405, "learning_rate": 8.585857673481301e-06, "loss": 0.005, "step": 5799 }, { "epoch": 4.146559428060769, "grad_norm": 0.029922110959887505, "learning_rate": 8.571872483306747e-06, "loss": 0.0043, "step": 5800 }, { "epoch": 4.146559428060769, "eval_loss": 0.007359272800385952, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 5800 }, { "epoch": 4.147274352100089, "grad_norm": 0.027541013434529305, "learning_rate": 8.557897624448779e-06, "loss": 0.0044, "step": 5801 }, { "epoch": 4.14798927613941, "grad_norm": 0.03052937239408493, "learning_rate": 8.543933100392459e-06, "loss": 0.0027, "step": 5802 }, { "epoch": 4.148704200178731, "grad_norm": 0.022379377856850624, "learning_rate": 8.529978914620218e-06, "loss": 0.0036, "step": 5803 }, { "epoch": 4.149419124218052, "grad_norm": 0.023808814585208893, "learning_rate": 8.516035070611967e-06, "loss": 0.0037, "step": 5804 }, { "epoch": 4.150134048257373, "grad_norm": 0.02620459534227848, "learning_rate": 8.502101571845001e-06, "loss": 0.0045, "step": 5805 }, { "epoch": 4.150134048257373, "eval_loss": 0.0073099443688988686, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5805 }, { "epoch": 4.150848972296694, "grad_norm": 0.03282605856657028, "learning_rate": 8.488178421794047e-06, "loss": 0.0047, "step": 5806 }, { "epoch": 4.151563896336015, "grad_norm": 0.028988482430577278, "learning_rate": 8.474265623931271e-06, "loss": 0.0052, "step": 5807 }, { "epoch": 4.152278820375335, "grad_norm": 0.020436255261301994, "learning_rate": 8.460363181726199e-06, "loss": 0.0022, "step": 5808 }, { "epoch": 4.152993744414656, "grad_norm": 0.0252582598477602, "learning_rate": 8.44647109864583e-06, "loss": 0.0044, "step": 5809 }, { "epoch": 4.153708668453977, "grad_norm": 
0.024183912202715874, "learning_rate": 8.432589378154582e-06, "loss": 0.0022, "step": 5810 }, { "epoch": 4.153708668453977, "eval_loss": 0.007345011457800865, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 5810 }, { "epoch": 4.1544235924932975, "grad_norm": 0.02565700002014637, "learning_rate": 8.418718023714235e-06, "loss": 0.0035, "step": 5811 }, { "epoch": 4.1551385165326185, "grad_norm": 0.03207520768046379, "learning_rate": 8.404857038784025e-06, "loss": 0.0034, "step": 5812 }, { "epoch": 4.155853440571939, "grad_norm": 0.02647203393280506, "learning_rate": 8.391006426820618e-06, "loss": 0.004, "step": 5813 }, { "epoch": 4.15656836461126, "grad_norm": 0.026348769664764404, "learning_rate": 8.377166191278036e-06, "loss": 0.0031, "step": 5814 }, { "epoch": 4.157283288650581, "grad_norm": 0.021558113396167755, "learning_rate": 8.36333633560777e-06, "loss": 0.003, "step": 5815 }, { "epoch": 4.157283288650581, "eval_loss": 0.007279375568032265, "eval_runtime": 4.5806, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 5815 }, { "epoch": 4.157998212689901, "grad_norm": 0.01900641806423664, "learning_rate": 8.3495168632587e-06, "loss": 0.0022, "step": 5816 }, { "epoch": 4.158713136729222, "grad_norm": 0.0248624999076128, "learning_rate": 8.335707777677098e-06, "loss": 0.0027, "step": 5817 }, { "epoch": 4.159428060768543, "grad_norm": 0.03099467046558857, "learning_rate": 8.321909082306683e-06, "loss": 0.0036, "step": 5818 }, { "epoch": 4.160142984807864, "grad_norm": 0.028496239334344864, "learning_rate": 8.308120780588551e-06, "loss": 0.0029, "step": 5819 }, { "epoch": 4.160857908847185, "grad_norm": 0.022284599021077156, "learning_rate": 8.294342875961219e-06, "loss": 0.0033, "step": 5820 }, { "epoch": 4.160857908847185, "eval_loss": 0.00730930594727397, "eval_runtime": 4.6022, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 5820 }, { "epoch": 4.161572832886506, "grad_norm": 0.023829322308301926, "learning_rate": 8.280575371860627e-06, "loss": 0.0028, "step": 5821 }, { "epoch": 4.162287756925827, "grad_norm": 0.02667403407394886, "learning_rate": 8.266818271720078e-06, "loss": 0.0021, "step": 5822 }, { "epoch": 4.163002680965147, "grad_norm": 0.02715158648788929, "learning_rate": 8.253071578970328e-06, "loss": 0.0042, "step": 5823 }, { "epoch": 4.163717605004468, "grad_norm": 0.030480314046144485, "learning_rate": 8.239335297039525e-06, "loss": 0.0022, "step": 5824 }, { "epoch": 4.164432529043789, "grad_norm": 0.04567910358309746, "learning_rate": 8.225609429353187e-06, "loss": 0.0037, "step": 5825 }, { "epoch": 4.164432529043789, "eval_loss": 0.007324047852307558, "eval_runtime": 4.5831, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 5825 }, { "epoch": 4.16514745308311, "grad_norm": 0.025512820109725, "learning_rate": 8.211893979334284e-06, "loss": 0.0038, "step": 5826 }, { "epoch": 4.165862377122431, "grad_norm": 0.02123808115720749, "learning_rate": 8.198188950403147e-06, "loss": 0.0032, "step": 5827 }, { "epoch": 4.166577301161752, "grad_norm": 0.03397715091705322, "learning_rate": 8.184494345977517e-06, "loss": 0.0038, "step": 5828 }, { "epoch": 4.167292225201073, "grad_norm": 0.02098172903060913, "learning_rate": 8.170810169472592e-06, "loss": 0.0025, "step": 5829 }, { "epoch": 4.168007149240394, "grad_norm": 0.03015507012605667, "learning_rate": 8.157136424300876e-06, "loss": 0.0056, "step": 5830 }, { "epoch": 4.168007149240394, "eval_loss": 
0.007380194496363401, "eval_runtime": 4.5922, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 5830 }, { "epoch": 4.168722073279714, "grad_norm": 0.022900177165865898, "learning_rate": 8.143473113872352e-06, "loss": 0.0027, "step": 5831 }, { "epoch": 4.169436997319035, "grad_norm": 0.026138445362448692, "learning_rate": 8.129820241594333e-06, "loss": 0.0051, "step": 5832 }, { "epoch": 4.1701519213583556, "grad_norm": 0.022312350571155548, "learning_rate": 8.116177810871578e-06, "loss": 0.0024, "step": 5833 }, { "epoch": 4.1708668453976765, "grad_norm": 0.029424700886011124, "learning_rate": 8.10254582510624e-06, "loss": 0.0041, "step": 5834 }, { "epoch": 4.171581769436997, "grad_norm": 0.03698524460196495, "learning_rate": 8.088924287697824e-06, "loss": 0.0031, "step": 5835 }, { "epoch": 4.171581769436997, "eval_loss": 0.007395508233457804, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 5835 }, { "epoch": 4.172296693476318, "grad_norm": 0.024354323744773865, "learning_rate": 8.075313202043278e-06, "loss": 0.0027, "step": 5836 }, { "epoch": 4.173011617515639, "grad_norm": 0.02420593425631523, "learning_rate": 8.061712571536939e-06, "loss": 0.0016, "step": 5837 }, { "epoch": 4.173726541554959, "grad_norm": 0.028890660032629967, "learning_rate": 8.04812239957049e-06, "loss": 0.0031, "step": 5838 }, { "epoch": 4.17444146559428, "grad_norm": 0.03178088739514351, "learning_rate": 8.034542689533053e-06, "loss": 0.0037, "step": 5839 }, { "epoch": 4.175156389633601, "grad_norm": 0.027446813881397247, "learning_rate": 8.020973444811142e-06, "loss": 0.004, "step": 5840 }, { "epoch": 4.175156389633601, "eval_loss": 0.0073518306016922, "eval_runtime": 4.6026, "eval_samples_per_second": 10.863, "eval_steps_per_second": 2.824, "step": 5840 }, { "epoch": 4.175871313672922, "grad_norm": 0.02931239828467369, "learning_rate": 8.007414668788615e-06, "loss": 0.0034, "step": 5841 }, { "epoch": 4.176586237712243, "grad_norm": 0.024816635996103287, "learning_rate": 7.99386636484678e-06, "loss": 0.0031, "step": 5842 }, { "epoch": 4.177301161751564, "grad_norm": 0.025676585733890533, "learning_rate": 7.980328536364278e-06, "loss": 0.0039, "step": 5843 }, { "epoch": 4.178016085790885, "grad_norm": 0.02773095853626728, "learning_rate": 7.96680118671717e-06, "loss": 0.0022, "step": 5844 }, { "epoch": 4.178731009830206, "grad_norm": 0.03265616297721863, "learning_rate": 7.95328431927892e-06, "loss": 0.0061, "step": 5845 }, { "epoch": 4.178731009830206, "eval_loss": 0.007432785350829363, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 5845 }, { "epoch": 4.179445933869526, "grad_norm": 0.023364180698990822, "learning_rate": 7.939777937420318e-06, "loss": 0.0026, "step": 5846 }, { "epoch": 4.180160857908847, "grad_norm": 0.026552803814411163, "learning_rate": 7.926282044509592e-06, "loss": 0.0022, "step": 5847 }, { "epoch": 4.180875781948168, "grad_norm": 0.02660200372338295, "learning_rate": 7.912796643912352e-06, "loss": 0.0031, "step": 5848 }, { "epoch": 4.181590705987489, "grad_norm": 0.031485021114349365, "learning_rate": 7.89932173899155e-06, "loss": 0.0053, "step": 5849 }, { "epoch": 4.18230563002681, "grad_norm": 0.028567640110850334, "learning_rate": 7.885857333107566e-06, "loss": 0.0041, "step": 5850 }, { "epoch": 4.18230563002681, "eval_loss": 0.007431251462548971, "eval_runtime": 4.5815, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.838, "step": 5850 }, { "epoch": 
4.183020554066131, "grad_norm": 0.028362998738884926, "learning_rate": 7.87240342961814e-06, "loss": 0.004, "step": 5851 }, { "epoch": 4.183735478105452, "grad_norm": 0.027493683621287346, "learning_rate": 7.858960031878398e-06, "loss": 0.0033, "step": 5852 }, { "epoch": 4.184450402144772, "grad_norm": 0.029043888673186302, "learning_rate": 7.845527143240854e-06, "loss": 0.0052, "step": 5853 }, { "epoch": 4.185165326184093, "grad_norm": 0.02569028176367283, "learning_rate": 7.832104767055376e-06, "loss": 0.0023, "step": 5854 }, { "epoch": 4.185880250223414, "grad_norm": 0.01942068338394165, "learning_rate": 7.818692906669239e-06, "loss": 0.002, "step": 5855 }, { "epoch": 4.185880250223414, "eval_loss": 0.00732978992164135, "eval_runtime": 4.6079, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 5855 }, { "epoch": 4.1865951742627345, "grad_norm": 0.028304440900683403, "learning_rate": 7.805291565427064e-06, "loss": 0.0037, "step": 5856 }, { "epoch": 4.1873100983020555, "grad_norm": 0.01973491720855236, "learning_rate": 7.791900746670882e-06, "loss": 0.0019, "step": 5857 }, { "epoch": 4.188025022341376, "grad_norm": 0.028768785297870636, "learning_rate": 7.778520453740096e-06, "loss": 0.0044, "step": 5858 }, { "epoch": 4.188739946380697, "grad_norm": 0.026372838765382767, "learning_rate": 7.765150689971451e-06, "loss": 0.0035, "step": 5859 }, { "epoch": 4.189454870420018, "grad_norm": 0.03162795305252075, "learning_rate": 7.751791458699098e-06, "loss": 0.0055, "step": 5860 }, { "epoch": 4.189454870420018, "eval_loss": 0.007358667906373739, "eval_runtime": 4.5902, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 5860 }, { "epoch": 4.190169794459338, "grad_norm": 0.029146883636713028, "learning_rate": 7.738442763254572e-06, "loss": 0.0032, "step": 5861 }, { "epoch": 4.190884718498659, "grad_norm": 0.029480934143066406, "learning_rate": 7.725104606966726e-06, "loss": 0.0043, "step": 5862 }, { "epoch": 4.19159964253798, "grad_norm": 0.024450179189443588, "learning_rate": 7.71177699316184e-06, "loss": 0.0019, "step": 5863 }, { "epoch": 4.192314566577301, "grad_norm": 0.02496964856982231, "learning_rate": 7.698459925163553e-06, "loss": 0.0033, "step": 5864 }, { "epoch": 4.193029490616622, "grad_norm": 0.030681218951940536, "learning_rate": 7.685153406292845e-06, "loss": 0.004, "step": 5865 }, { "epoch": 4.193029490616622, "eval_loss": 0.007299089804291725, "eval_runtime": 4.6088, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 5865 }, { "epoch": 4.193744414655943, "grad_norm": 0.03969575837254524, "learning_rate": 7.671857439868107e-06, "loss": 0.0048, "step": 5866 }, { "epoch": 4.194459338695264, "grad_norm": 0.023342521861195564, "learning_rate": 7.658572029205052e-06, "loss": 0.002, "step": 5867 }, { "epoch": 4.195174262734584, "grad_norm": 0.03896242380142212, "learning_rate": 7.645297177616807e-06, "loss": 0.0034, "step": 5868 }, { "epoch": 4.195889186773905, "grad_norm": 0.03664558380842209, "learning_rate": 7.632032888413847e-06, "loss": 0.0049, "step": 5869 }, { "epoch": 4.196604110813226, "grad_norm": 0.027434993535280228, "learning_rate": 7.618779164903988e-06, "loss": 0.0043, "step": 5870 }, { "epoch": 4.196604110813226, "eval_loss": 0.007215645629912615, "eval_runtime": 4.5791, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 5870 }, { "epoch": 4.197319034852547, "grad_norm": 0.02629953809082508, "learning_rate": 7.60553601039245e-06, "loss": 0.0041, "step": 5871 }, { "epoch": 
4.198033958891868, "grad_norm": 0.030343342572450638, "learning_rate": 7.5923034281818015e-06, "loss": 0.0046, "step": 5872 }, { "epoch": 4.198748882931189, "grad_norm": 0.026285480707883835, "learning_rate": 7.579081421571976e-06, "loss": 0.0038, "step": 5873 }, { "epoch": 4.19946380697051, "grad_norm": 0.039523012936115265, "learning_rate": 7.565869993860269e-06, "loss": 0.0021, "step": 5874 }, { "epoch": 4.200178731009831, "grad_norm": 0.03097582422196865, "learning_rate": 7.552669148341329e-06, "loss": 0.0065, "step": 5875 }, { "epoch": 4.200178731009831, "eval_loss": 0.007205529138445854, "eval_runtime": 4.5981, "eval_samples_per_second": 10.874, "eval_steps_per_second": 2.827, "step": 5875 }, { "epoch": 4.200893655049151, "grad_norm": 0.02710711397230625, "learning_rate": 7.539478888307172e-06, "loss": 0.0036, "step": 5876 }, { "epoch": 4.201608579088472, "grad_norm": 0.022284414619207382, "learning_rate": 7.526299217047194e-06, "loss": 0.0031, "step": 5877 }, { "epoch": 4.202323503127793, "grad_norm": 0.022818151861429214, "learning_rate": 7.5131301378481015e-06, "loss": 0.0039, "step": 5878 }, { "epoch": 4.2030384271671135, "grad_norm": 0.031558334827423096, "learning_rate": 7.499971653994026e-06, "loss": 0.0027, "step": 5879 }, { "epoch": 4.2037533512064345, "grad_norm": 0.02602611482143402, "learning_rate": 7.486823768766388e-06, "loss": 0.0019, "step": 5880 }, { "epoch": 4.2037533512064345, "eval_loss": 0.007268418557941914, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 5880 }, { "epoch": 4.204468275245755, "grad_norm": 0.02536638453602791, "learning_rate": 7.473686485444009e-06, "loss": 0.0028, "step": 5881 }, { "epoch": 4.205183199285076, "grad_norm": 0.01821356825530529, "learning_rate": 7.460559807303069e-06, "loss": 0.0022, "step": 5882 }, { "epoch": 4.205898123324396, "grad_norm": 0.024901214987039566, "learning_rate": 7.447443737617066e-06, "loss": 0.0034, "step": 5883 }, { "epoch": 4.206613047363717, "grad_norm": 0.020242959260940552, "learning_rate": 7.434338279656888e-06, "loss": 0.002, "step": 5884 }, { "epoch": 4.207327971403038, "grad_norm": 0.025930535048246384, "learning_rate": 7.421243436690778e-06, "loss": 0.0031, "step": 5885 }, { "epoch": 4.207327971403038, "eval_loss": 0.007331493776291609, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 5885 }, { "epoch": 4.208042895442359, "grad_norm": 0.023320816457271576, "learning_rate": 7.408159211984295e-06, "loss": 0.0027, "step": 5886 }, { "epoch": 4.20875781948168, "grad_norm": 0.02076614275574684, "learning_rate": 7.395085608800384e-06, "loss": 0.0025, "step": 5887 }, { "epoch": 4.209472743521001, "grad_norm": 0.024705009534955025, "learning_rate": 7.382022630399338e-06, "loss": 0.0036, "step": 5888 }, { "epoch": 4.210187667560322, "grad_norm": 0.02561981976032257, "learning_rate": 7.368970280038779e-06, "loss": 0.0027, "step": 5889 }, { "epoch": 4.210902591599643, "grad_norm": 0.02553432434797287, "learning_rate": 7.355928560973707e-06, "loss": 0.0043, "step": 5890 }, { "epoch": 4.210902591599643, "eval_loss": 0.007339932955801487, "eval_runtime": 4.5801, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 5890 }, { "epoch": 4.211617515638963, "grad_norm": 0.033642806112766266, "learning_rate": 7.342897476456439e-06, "loss": 0.005, "step": 5891 }, { "epoch": 4.212332439678284, "grad_norm": 0.02951042726635933, "learning_rate": 7.329877029736665e-06, "loss": 0.0034, "step": 5892 }, { "epoch": 
4.213047363717605, "grad_norm": 0.02608702890574932, "learning_rate": 7.31686722406143e-06, "loss": 0.003, "step": 5893 }, { "epoch": 4.213762287756926, "grad_norm": 0.026933016255497932, "learning_rate": 7.3038680626750734e-06, "loss": 0.0038, "step": 5894 }, { "epoch": 4.214477211796247, "grad_norm": 0.021035131067037582, "learning_rate": 7.290879548819363e-06, "loss": 0.0021, "step": 5895 }, { "epoch": 4.214477211796247, "eval_loss": 0.007323861122131348, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 5895 }, { "epoch": 4.215192135835568, "grad_norm": 0.028927147388458252, "learning_rate": 7.277901685733335e-06, "loss": 0.0041, "step": 5896 }, { "epoch": 4.215907059874889, "grad_norm": 0.02925601601600647, "learning_rate": 7.264934476653401e-06, "loss": 0.0039, "step": 5897 }, { "epoch": 4.216621983914209, "grad_norm": 0.02557559683918953, "learning_rate": 7.251977924813336e-06, "loss": 0.0034, "step": 5898 }, { "epoch": 4.21733690795353, "grad_norm": 0.026480717584490776, "learning_rate": 7.2390320334442045e-06, "loss": 0.0031, "step": 5899 }, { "epoch": 4.218051831992851, "grad_norm": 0.029857570305466652, "learning_rate": 7.226096805774463e-06, "loss": 0.0041, "step": 5900 }, { "epoch": 4.218051831992851, "eval_loss": 0.007389609701931477, "eval_runtime": 4.5837, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 5900 }, { "epoch": 4.2187667560321715, "grad_norm": 0.026134926825761795, "learning_rate": 7.2131722450298925e-06, "loss": 0.003, "step": 5901 }, { "epoch": 4.2194816800714925, "grad_norm": 0.025266116484999657, "learning_rate": 7.200258354433593e-06, "loss": 0.0029, "step": 5902 }, { "epoch": 4.220196604110813, "grad_norm": 0.03355015441775322, "learning_rate": 7.187355137206042e-06, "loss": 0.0033, "step": 5903 }, { "epoch": 4.220911528150134, "grad_norm": 0.026856690645217896, "learning_rate": 7.174462596565012e-06, "loss": 0.0021, "step": 5904 }, { "epoch": 4.221626452189454, "grad_norm": 0.03637417033314705, "learning_rate": 7.161580735725648e-06, "loss": 0.0025, "step": 5905 }, { "epoch": 4.221626452189454, "eval_loss": 0.007461750879883766, "eval_runtime": 4.5833, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5905 }, { "epoch": 4.222341376228775, "grad_norm": 0.030683772638440132, "learning_rate": 7.14870955790043e-06, "loss": 0.0026, "step": 5906 }, { "epoch": 4.223056300268096, "grad_norm": 0.0376545712351799, "learning_rate": 7.135849066299144e-06, "loss": 0.0062, "step": 5907 }, { "epoch": 4.223771224307417, "grad_norm": 0.029333747923374176, "learning_rate": 7.122999264128932e-06, "loss": 0.0029, "step": 5908 }, { "epoch": 4.224486148346738, "grad_norm": 0.04089578241109848, "learning_rate": 7.110160154594286e-06, "loss": 0.0079, "step": 5909 }, { "epoch": 4.225201072386059, "grad_norm": 0.0332692414522171, "learning_rate": 7.097331740896995e-06, "loss": 0.0057, "step": 5910 }, { "epoch": 4.225201072386059, "eval_loss": 0.007390712387859821, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5910 }, { "epoch": 4.22591599642538, "grad_norm": 0.027553701773285866, "learning_rate": 7.0845140262362e-06, "loss": 0.0024, "step": 5911 }, { "epoch": 4.226630920464701, "grad_norm": 0.023503776639699936, "learning_rate": 7.071707013808398e-06, "loss": 0.0036, "step": 5912 }, { "epoch": 4.227345844504021, "grad_norm": 0.025209791958332062, "learning_rate": 7.058910706807359e-06, "loss": 0.0026, "step": 5913 }, { "epoch": 
4.228060768543342, "grad_norm": 0.028651421889662743, "learning_rate": 7.046125108424245e-06, "loss": 0.0047, "step": 5914 }, { "epoch": 4.228775692582663, "grad_norm": 0.03320781886577606, "learning_rate": 7.0333502218474955e-06, "loss": 0.0038, "step": 5915 }, { "epoch": 4.228775692582663, "eval_loss": 0.007299493532627821, "eval_runtime": 4.5772, "eval_samples_per_second": 10.924, "eval_steps_per_second": 2.84, "step": 5915 }, { "epoch": 4.229490616621984, "grad_norm": 0.022939197719097137, "learning_rate": 7.020586050262912e-06, "loss": 0.003, "step": 5916 }, { "epoch": 4.230205540661305, "grad_norm": 0.021063178777694702, "learning_rate": 7.00783259685362e-06, "loss": 0.0025, "step": 5917 }, { "epoch": 4.230920464700626, "grad_norm": 0.03561911731958389, "learning_rate": 6.995089864800059e-06, "loss": 0.003, "step": 5918 }, { "epoch": 4.231635388739947, "grad_norm": 0.03288431093096733, "learning_rate": 6.98235785728002e-06, "loss": 0.0031, "step": 5919 }, { "epoch": 4.232350312779268, "grad_norm": 0.016645699739456177, "learning_rate": 6.9696365774685746e-06, "loss": 0.0017, "step": 5920 }, { "epoch": 4.232350312779268, "eval_loss": 0.007358471862971783, "eval_runtime": 4.6094, "eval_samples_per_second": 10.847, "eval_steps_per_second": 2.82, "step": 5920 }, { "epoch": 4.233065236818588, "grad_norm": 0.02718031033873558, "learning_rate": 6.9569260285381635e-06, "loss": 0.0035, "step": 5921 }, { "epoch": 4.233780160857909, "grad_norm": 0.024560723453760147, "learning_rate": 6.944226213658533e-06, "loss": 0.0027, "step": 5922 }, { "epoch": 4.23449508489723, "grad_norm": 0.022520065307617188, "learning_rate": 6.931537135996746e-06, "loss": 0.0046, "step": 5923 }, { "epoch": 4.2352100089365505, "grad_norm": 0.023316631093621254, "learning_rate": 6.918858798717204e-06, "loss": 0.0021, "step": 5924 }, { "epoch": 4.2359249329758715, "grad_norm": 0.027668217197060585, "learning_rate": 6.906191204981621e-06, "loss": 0.0034, "step": 5925 }, { "epoch": 4.2359249329758715, "eval_loss": 0.007333238609135151, "eval_runtime": 4.575, "eval_samples_per_second": 10.929, "eval_steps_per_second": 2.842, "step": 5925 }, { "epoch": 4.236639857015192, "grad_norm": 0.020663360133767128, "learning_rate": 6.893534357949022e-06, "loss": 0.0019, "step": 5926 }, { "epoch": 4.237354781054513, "grad_norm": 0.027202589437365532, "learning_rate": 6.880888260775786e-06, "loss": 0.0037, "step": 5927 }, { "epoch": 4.238069705093833, "grad_norm": 0.027576245367527008, "learning_rate": 6.868252916615553e-06, "loss": 0.0038, "step": 5928 }, { "epoch": 4.238784629133154, "grad_norm": 0.021270016208291054, "learning_rate": 6.85562832861934e-06, "loss": 0.003, "step": 5929 }, { "epoch": 4.239499553172475, "grad_norm": 0.03141560032963753, "learning_rate": 6.843014499935463e-06, "loss": 0.0047, "step": 5930 }, { "epoch": 4.239499553172475, "eval_loss": 0.007421288173645735, "eval_runtime": 4.584, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5930 }, { "epoch": 4.240214477211796, "grad_norm": 0.026006951928138733, "learning_rate": 6.8304114337095275e-06, "loss": 0.0027, "step": 5931 }, { "epoch": 4.240929401251117, "grad_norm": 0.026185549795627594, "learning_rate": 6.817819133084486e-06, "loss": 0.0032, "step": 5932 }, { "epoch": 4.241644325290438, "grad_norm": 0.025225268676877022, "learning_rate": 6.805237601200615e-06, "loss": 0.0029, "step": 5933 }, { "epoch": 4.242359249329759, "grad_norm": 0.025884633883833885, "learning_rate": 6.792666841195455e-06, "loss": 0.0031, "step": 5934 }, { 
"epoch": 4.243074173369079, "grad_norm": 0.0316263772547245, "learning_rate": 6.780106856203916e-06, "loss": 0.0024, "step": 5935 }, { "epoch": 4.243074173369079, "eval_loss": 0.007372036576271057, "eval_runtime": 4.5854, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5935 }, { "epoch": 4.2437890974084, "grad_norm": 0.02394098974764347, "learning_rate": 6.767557649358203e-06, "loss": 0.0035, "step": 5936 }, { "epoch": 4.244504021447721, "grad_norm": 0.024762580171227455, "learning_rate": 6.755019223787806e-06, "loss": 0.0034, "step": 5937 }, { "epoch": 4.245218945487042, "grad_norm": 0.034075986593961716, "learning_rate": 6.742491582619559e-06, "loss": 0.005, "step": 5938 }, { "epoch": 4.245933869526363, "grad_norm": 0.03521861135959625, "learning_rate": 6.7299747289776e-06, "loss": 0.0043, "step": 5939 }, { "epoch": 4.246648793565684, "grad_norm": 0.028884151950478554, "learning_rate": 6.717468665983384e-06, "loss": 0.0033, "step": 5940 }, { "epoch": 4.246648793565684, "eval_loss": 0.007342441938817501, "eval_runtime": 4.5962, "eval_samples_per_second": 10.879, "eval_steps_per_second": 2.828, "step": 5940 }, { "epoch": 4.247363717605005, "grad_norm": 0.027080800384283066, "learning_rate": 6.704973396755637e-06, "loss": 0.0033, "step": 5941 }, { "epoch": 4.248078641644326, "grad_norm": 0.024258486926555634, "learning_rate": 6.692488924410434e-06, "loss": 0.0037, "step": 5942 }, { "epoch": 4.248793565683646, "grad_norm": 0.03073773719370365, "learning_rate": 6.68001525206115e-06, "loss": 0.0027, "step": 5943 }, { "epoch": 4.249508489722967, "grad_norm": 0.027871521189808846, "learning_rate": 6.667552382818448e-06, "loss": 0.0035, "step": 5944 }, { "epoch": 4.250223413762288, "grad_norm": 0.030205782502889633, "learning_rate": 6.655100319790314e-06, "loss": 0.0035, "step": 5945 }, { "epoch": 4.250223413762288, "eval_loss": 0.007356594782322645, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 5945 }, { "epoch": 4.250938337801609, "grad_norm": 0.02132536843419075, "learning_rate": 6.6426590660820465e-06, "loss": 0.0033, "step": 5946 }, { "epoch": 4.2516532618409295, "grad_norm": 0.021582534536719322, "learning_rate": 6.6302286247962145e-06, "loss": 0.0027, "step": 5947 }, { "epoch": 4.2523681858802505, "grad_norm": 0.031002355739474297, "learning_rate": 6.6178089990327265e-06, "loss": 0.0037, "step": 5948 }, { "epoch": 4.253083109919571, "grad_norm": 0.028246261179447174, "learning_rate": 6.605400191888783e-06, "loss": 0.0035, "step": 5949 }, { "epoch": 4.253798033958892, "grad_norm": 0.02539791353046894, "learning_rate": 6.593002206458871e-06, "loss": 0.0022, "step": 5950 }, { "epoch": 4.253798033958892, "eval_loss": 0.007366883102804422, "eval_runtime": 4.5855, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5950 }, { "epoch": 4.254512957998212, "grad_norm": 0.022957414388656616, "learning_rate": 6.580615045834804e-06, "loss": 0.003, "step": 5951 }, { "epoch": 4.255227882037533, "grad_norm": 0.025887656956911087, "learning_rate": 6.5682387131056676e-06, "loss": 0.0029, "step": 5952 }, { "epoch": 4.255942806076854, "grad_norm": 0.02903703786432743, "learning_rate": 6.555873211357871e-06, "loss": 0.0051, "step": 5953 }, { "epoch": 4.256657730116175, "grad_norm": 0.02520645409822464, "learning_rate": 6.543518543675131e-06, "loss": 0.0025, "step": 5954 }, { "epoch": 4.257372654155496, "grad_norm": 0.02575562708079815, "learning_rate": 6.531174713138416e-06, "loss": 0.0034, "step": 5955 }, { 
"epoch": 4.257372654155496, "eval_loss": 0.007431562524288893, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 5955 }, { "epoch": 4.258087578194817, "grad_norm": 0.02702287957072258, "learning_rate": 6.5188417228260304e-06, "loss": 0.0026, "step": 5956 }, { "epoch": 4.258802502234138, "grad_norm": 0.03155507892370224, "learning_rate": 6.506519575813591e-06, "loss": 0.005, "step": 5957 }, { "epoch": 4.259517426273458, "grad_norm": 0.021098880097270012, "learning_rate": 6.494208275173947e-06, "loss": 0.0019, "step": 5958 }, { "epoch": 4.260232350312779, "grad_norm": 0.024807244539260864, "learning_rate": 6.4819078239773064e-06, "loss": 0.0036, "step": 5959 }, { "epoch": 4.2609472743521, "grad_norm": 0.02827535755932331, "learning_rate": 6.469618225291141e-06, "loss": 0.0034, "step": 5960 }, { "epoch": 4.2609472743521, "eval_loss": 0.007440618705004454, "eval_runtime": 4.5915, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 5960 }, { "epoch": 4.261662198391421, "grad_norm": 0.026958143338561058, "learning_rate": 6.457339482180219e-06, "loss": 0.0024, "step": 5961 }, { "epoch": 4.262377122430742, "grad_norm": 0.03225865215063095, "learning_rate": 6.44507159770662e-06, "loss": 0.0025, "step": 5962 }, { "epoch": 4.263092046470063, "grad_norm": 0.0281483493745327, "learning_rate": 6.43281457492968e-06, "loss": 0.0025, "step": 5963 }, { "epoch": 4.263806970509384, "grad_norm": 0.031172553077340126, "learning_rate": 6.420568416906059e-06, "loss": 0.0051, "step": 5964 }, { "epoch": 4.264521894548704, "grad_norm": 0.029605615884065628, "learning_rate": 6.408333126689687e-06, "loss": 0.004, "step": 5965 }, { "epoch": 4.264521894548704, "eval_loss": 0.0074111600406467915, "eval_runtime": 4.6176, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.815, "step": 5965 }, { "epoch": 4.265236818588025, "grad_norm": 0.027913063764572144, "learning_rate": 6.396108707331794e-06, "loss": 0.0041, "step": 5966 }, { "epoch": 4.265951742627346, "grad_norm": 0.031524620950222015, "learning_rate": 6.38389516188091e-06, "loss": 0.0036, "step": 5967 }, { "epoch": 4.266666666666667, "grad_norm": 0.03528693690896034, "learning_rate": 6.371692493382814e-06, "loss": 0.004, "step": 5968 }, { "epoch": 4.2673815907059875, "grad_norm": 0.029842445626854897, "learning_rate": 6.359500704880617e-06, "loss": 0.0053, "step": 5969 }, { "epoch": 4.2680965147453085, "grad_norm": 0.02615230530500412, "learning_rate": 6.347319799414703e-06, "loss": 0.0037, "step": 5970 }, { "epoch": 4.2680965147453085, "eval_loss": 0.00731690926477313, "eval_runtime": 4.6022, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 5970 }, { "epoch": 4.268811438784629, "grad_norm": 0.03194921091198921, "learning_rate": 6.33514978002272e-06, "loss": 0.0046, "step": 5971 }, { "epoch": 4.2695263628239495, "grad_norm": 0.0307309553027153, "learning_rate": 6.322990649739624e-06, "loss": 0.004, "step": 5972 }, { "epoch": 4.27024128686327, "grad_norm": 0.030167344957590103, "learning_rate": 6.310842411597667e-06, "loss": 0.0025, "step": 5973 }, { "epoch": 4.270956210902591, "grad_norm": 0.034487493336200714, "learning_rate": 6.298705068626348e-06, "loss": 0.0041, "step": 5974 }, { "epoch": 4.271671134941912, "grad_norm": 0.030169887468218803, "learning_rate": 6.286578623852485e-06, "loss": 0.0035, "step": 5975 }, { "epoch": 4.271671134941912, "eval_loss": 0.0073048691265285015, "eval_runtime": 4.581, "eval_samples_per_second": 10.915, 
"eval_steps_per_second": 2.838, "step": 5975 }, { "epoch": 4.272386058981233, "grad_norm": 0.02212505415081978, "learning_rate": 6.274463080300142e-06, "loss": 0.002, "step": 5976 }, { "epoch": 4.273100983020554, "grad_norm": 0.034588128328323364, "learning_rate": 6.26235844099069e-06, "loss": 0.0038, "step": 5977 }, { "epoch": 4.273815907059875, "grad_norm": 0.025838760659098625, "learning_rate": 6.250264708942799e-06, "loss": 0.0041, "step": 5978 }, { "epoch": 4.274530831099196, "grad_norm": 0.026902368292212486, "learning_rate": 6.238181887172362e-06, "loss": 0.0032, "step": 5979 }, { "epoch": 4.275245755138516, "grad_norm": 0.029669582843780518, "learning_rate": 6.226109978692596e-06, "loss": 0.0044, "step": 5980 }, { "epoch": 4.275245755138516, "eval_loss": 0.007341836579144001, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 5980 }, { "epoch": 4.275960679177837, "grad_norm": 0.027113761752843857, "learning_rate": 6.214048986514004e-06, "loss": 0.0043, "step": 5981 }, { "epoch": 4.276675603217158, "grad_norm": 0.02614571712911129, "learning_rate": 6.201998913644319e-06, "loss": 0.0029, "step": 5982 }, { "epoch": 4.277390527256479, "grad_norm": 0.026641853153705597, "learning_rate": 6.189959763088593e-06, "loss": 0.0039, "step": 5983 }, { "epoch": 4.2781054512958, "grad_norm": 0.033498525619506836, "learning_rate": 6.177931537849141e-06, "loss": 0.0028, "step": 5984 }, { "epoch": 4.278820375335121, "grad_norm": 0.02950974367558956, "learning_rate": 6.165914240925547e-06, "loss": 0.0034, "step": 5985 }, { "epoch": 4.278820375335121, "eval_loss": 0.007374745327979326, "eval_runtime": 4.6352, "eval_samples_per_second": 10.787, "eval_steps_per_second": 2.805, "step": 5985 }, { "epoch": 4.279535299374442, "grad_norm": 0.02350798435509205, "learning_rate": 6.153907875314696e-06, "loss": 0.0035, "step": 5986 }, { "epoch": 4.280250223413763, "grad_norm": 0.021999463438987732, "learning_rate": 6.141912444010695e-06, "loss": 0.0023, "step": 5987 }, { "epoch": 4.280965147453083, "grad_norm": 0.03171335533261299, "learning_rate": 6.129927950004988e-06, "loss": 0.0027, "step": 5988 }, { "epoch": 4.281680071492404, "grad_norm": 0.021488729864358902, "learning_rate": 6.117954396286235e-06, "loss": 0.0025, "step": 5989 }, { "epoch": 4.282394995531725, "grad_norm": 0.02204546518623829, "learning_rate": 6.105991785840398e-06, "loss": 0.0024, "step": 5990 }, { "epoch": 4.282394995531725, "eval_loss": 0.007441294379532337, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 5990 }, { "epoch": 4.283109919571046, "grad_norm": 0.027743663638830185, "learning_rate": 6.094040121650718e-06, "loss": 0.0049, "step": 5991 }, { "epoch": 4.2838248436103665, "grad_norm": 0.021015316247940063, "learning_rate": 6.082099406697673e-06, "loss": 0.0022, "step": 5992 }, { "epoch": 4.2845397676496875, "grad_norm": 0.019533976912498474, "learning_rate": 6.070169643959034e-06, "loss": 0.0021, "step": 5993 }, { "epoch": 4.285254691689008, "grad_norm": 0.025525039061903954, "learning_rate": 6.058250836409856e-06, "loss": 0.0025, "step": 5994 }, { "epoch": 4.2859696157283285, "grad_norm": 0.026452355086803436, "learning_rate": 6.046342987022419e-06, "loss": 0.0023, "step": 5995 }, { "epoch": 4.2859696157283285, "eval_loss": 0.007485551759600639, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 5995 }, { "epoch": 4.286684539767649, "grad_norm": 0.023952508345246315, "learning_rate": 
6.0344460987663e-06, "loss": 0.0015, "step": 5996 }, { "epoch": 4.28739946380697, "grad_norm": 0.02311859279870987, "learning_rate": 6.0225601746083495e-06, "loss": 0.0036, "step": 5997 }, { "epoch": 4.288114387846291, "grad_norm": 0.033617980778217316, "learning_rate": 6.010685217512646e-06, "loss": 0.0043, "step": 5998 }, { "epoch": 4.288829311885612, "grad_norm": 0.022123869508504868, "learning_rate": 5.998821230440588e-06, "loss": 0.0022, "step": 5999 }, { "epoch": 4.289544235924933, "grad_norm": 0.03446776419878006, "learning_rate": 5.9869682163507855e-06, "loss": 0.0045, "step": 6000 }, { "epoch": 4.289544235924933, "eval_loss": 0.007492502219974995, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6000 }, { "epoch": 4.290259159964254, "grad_norm": 0.02456563524901867, "learning_rate": 5.975126178199136e-06, "loss": 0.0021, "step": 6001 }, { "epoch": 4.290974084003574, "grad_norm": 0.026428498327732086, "learning_rate": 5.963295118938816e-06, "loss": 0.0029, "step": 6002 }, { "epoch": 4.291689008042895, "grad_norm": 0.026376323774456978, "learning_rate": 5.9514750415202215e-06, "loss": 0.0029, "step": 6003 }, { "epoch": 4.292403932082216, "grad_norm": 0.02469492144882679, "learning_rate": 5.939665948891049e-06, "loss": 0.0034, "step": 6004 }, { "epoch": 4.293118856121537, "grad_norm": 0.031984493136405945, "learning_rate": 5.927867843996243e-06, "loss": 0.0044, "step": 6005 }, { "epoch": 4.293118856121537, "eval_loss": 0.007456004153937101, "eval_runtime": 4.5896, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 6005 }, { "epoch": 4.293833780160858, "grad_norm": 0.026664400473237038, "learning_rate": 5.9160807297780005e-06, "loss": 0.0023, "step": 6006 }, { "epoch": 4.294548704200179, "grad_norm": 0.03076602891087532, "learning_rate": 5.904304609175798e-06, "loss": 0.0037, "step": 6007 }, { "epoch": 4.2952636282395, "grad_norm": 0.021685102954506874, "learning_rate": 5.892539485126331e-06, "loss": 0.0021, "step": 6008 }, { "epoch": 4.295978552278821, "grad_norm": 0.020308079198002815, "learning_rate": 5.880785360563595e-06, "loss": 0.0023, "step": 6009 }, { "epoch": 4.296693476318141, "grad_norm": 0.03547128289937973, "learning_rate": 5.869042238418832e-06, "loss": 0.0043, "step": 6010 }, { "epoch": 4.296693476318141, "eval_loss": 0.0073783425614237785, "eval_runtime": 4.5795, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 6010 }, { "epoch": 4.297408400357462, "grad_norm": 0.028139203786849976, "learning_rate": 5.857310121620513e-06, "loss": 0.0039, "step": 6011 }, { "epoch": 4.298123324396783, "grad_norm": 0.020442118868231773, "learning_rate": 5.845589013094405e-06, "loss": 0.0025, "step": 6012 }, { "epoch": 4.298838248436104, "grad_norm": 0.02641214057803154, "learning_rate": 5.833878915763485e-06, "loss": 0.003, "step": 6013 }, { "epoch": 4.299553172475425, "grad_norm": 0.02263258770108223, "learning_rate": 5.822179832548025e-06, "loss": 0.0028, "step": 6014 }, { "epoch": 4.3002680965147455, "grad_norm": 0.028971409425139427, "learning_rate": 5.810491766365544e-06, "loss": 0.0032, "step": 6015 }, { "epoch": 4.3002680965147455, "eval_loss": 0.0073651764541864395, "eval_runtime": 4.5937, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 6015 }, { "epoch": 4.3009830205540664, "grad_norm": 0.02545134164392948, "learning_rate": 5.79881472013078e-06, "loss": 0.0035, "step": 6016 }, { "epoch": 4.301697944593387, "grad_norm": 0.02789938822388649, 
"learning_rate": 5.787148696755756e-06, "loss": 0.0029, "step": 6017 }, { "epoch": 4.302412868632707, "grad_norm": 0.02741112746298313, "learning_rate": 5.775493699149753e-06, "loss": 0.0055, "step": 6018 }, { "epoch": 4.303127792672028, "grad_norm": 0.032121505588293076, "learning_rate": 5.7638497302192566e-06, "loss": 0.0034, "step": 6019 }, { "epoch": 4.303842716711349, "grad_norm": 0.025579530745744705, "learning_rate": 5.752216792868048e-06, "loss": 0.0028, "step": 6020 }, { "epoch": 4.303842716711349, "eval_loss": 0.007304919417947531, "eval_runtime": 4.5913, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 6020 }, { "epoch": 4.30455764075067, "grad_norm": 0.02810940518975258, "learning_rate": 5.740594889997148e-06, "loss": 0.0049, "step": 6021 }, { "epoch": 4.305272564789991, "grad_norm": 0.0313456729054451, "learning_rate": 5.728984024504796e-06, "loss": 0.0035, "step": 6022 }, { "epoch": 4.305987488829312, "grad_norm": 0.03977878391742706, "learning_rate": 5.71738419928653e-06, "loss": 0.0042, "step": 6023 }, { "epoch": 4.306702412868633, "grad_norm": 0.028482619673013687, "learning_rate": 5.705795417235077e-06, "loss": 0.0038, "step": 6024 }, { "epoch": 4.307417336907953, "grad_norm": 0.030283436179161072, "learning_rate": 5.6942176812404546e-06, "loss": 0.0033, "step": 6025 }, { "epoch": 4.307417336907953, "eval_loss": 0.007251562550663948, "eval_runtime": 4.5843, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6025 }, { "epoch": 4.308132260947274, "grad_norm": 0.029021291062235832, "learning_rate": 5.682650994189914e-06, "loss": 0.0038, "step": 6026 }, { "epoch": 4.308847184986595, "grad_norm": 0.027477223426103592, "learning_rate": 5.671095358967926e-06, "loss": 0.0024, "step": 6027 }, { "epoch": 4.309562109025916, "grad_norm": 0.030933795496821404, "learning_rate": 5.659550778456258e-06, "loss": 0.0029, "step": 6028 }, { "epoch": 4.310277033065237, "grad_norm": 0.03445263206958771, "learning_rate": 5.648017255533866e-06, "loss": 0.0073, "step": 6029 }, { "epoch": 4.310991957104558, "grad_norm": 0.019255150109529495, "learning_rate": 5.636494793076974e-06, "loss": 0.002, "step": 6030 }, { "epoch": 4.310991957104558, "eval_loss": 0.007250187452882528, "eval_runtime": 4.5804, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 6030 }, { "epoch": 4.311706881143879, "grad_norm": 0.026682205498218536, "learning_rate": 5.6249833939590655e-06, "loss": 0.0052, "step": 6031 }, { "epoch": 4.312421805183199, "grad_norm": 0.03128250688314438, "learning_rate": 5.613483061050817e-06, "loss": 0.0026, "step": 6032 }, { "epoch": 4.31313672922252, "grad_norm": 0.03139358013868332, "learning_rate": 5.60199379722019e-06, "loss": 0.0049, "step": 6033 }, { "epoch": 4.313851653261841, "grad_norm": 0.028363607823848724, "learning_rate": 5.59051560533237e-06, "loss": 0.0023, "step": 6034 }, { "epoch": 4.314566577301162, "grad_norm": 0.03022068366408348, "learning_rate": 5.579048488249766e-06, "loss": 0.0048, "step": 6035 }, { "epoch": 4.314566577301162, "eval_loss": 0.007244376465678215, "eval_runtime": 4.6209, "eval_samples_per_second": 10.82, "eval_steps_per_second": 2.813, "step": 6035 }, { "epoch": 4.315281501340483, "grad_norm": 0.023401105776429176, "learning_rate": 5.5675924488320595e-06, "loss": 0.0025, "step": 6036 }, { "epoch": 4.3159964253798035, "grad_norm": 0.029822485521435738, "learning_rate": 5.556147489936131e-06, "loss": 0.0052, "step": 6037 }, { "epoch": 4.3167113494191245, "grad_norm": 0.02028927206993103, 
"learning_rate": 5.544713614416119e-06, "loss": 0.0016, "step": 6038 }, { "epoch": 4.317426273458445, "grad_norm": 0.025724751874804497, "learning_rate": 5.53329082512341e-06, "loss": 0.0046, "step": 6039 }, { "epoch": 4.3181411974977655, "grad_norm": 0.02789846807718277, "learning_rate": 5.5218791249065875e-06, "loss": 0.0026, "step": 6040 }, { "epoch": 4.3181411974977655, "eval_loss": 0.007265615742653608, "eval_runtime": 4.6056, "eval_samples_per_second": 10.856, "eval_steps_per_second": 2.823, "step": 6040 }, { "epoch": 4.318856121537086, "grad_norm": 0.029148893430829048, "learning_rate": 5.510478516611512e-06, "loss": 0.0053, "step": 6041 }, { "epoch": 4.319571045576407, "grad_norm": 0.02275710366666317, "learning_rate": 5.499089003081259e-06, "loss": 0.0029, "step": 6042 }, { "epoch": 4.320285969615728, "grad_norm": 0.03991260007023811, "learning_rate": 5.4877105871561196e-06, "loss": 0.0034, "step": 6043 }, { "epoch": 4.321000893655049, "grad_norm": 0.025692729279398918, "learning_rate": 5.476343271673651e-06, "loss": 0.0038, "step": 6044 }, { "epoch": 4.32171581769437, "grad_norm": 0.02403969131410122, "learning_rate": 5.464987059468629e-06, "loss": 0.0027, "step": 6045 }, { "epoch": 4.32171581769437, "eval_loss": 0.00721962284296751, "eval_runtime": 4.6017, "eval_samples_per_second": 10.866, "eval_steps_per_second": 2.825, "step": 6045 }, { "epoch": 4.322430741733691, "grad_norm": 0.027364669367671013, "learning_rate": 5.45364195337304e-06, "loss": 0.0033, "step": 6046 }, { "epoch": 4.323145665773012, "grad_norm": 0.03268884867429733, "learning_rate": 5.442307956216142e-06, "loss": 0.004, "step": 6047 }, { "epoch": 4.323860589812332, "grad_norm": 0.023614609614014626, "learning_rate": 5.4309850708243736e-06, "loss": 0.0026, "step": 6048 }, { "epoch": 4.324575513851653, "grad_norm": 0.03167151287198067, "learning_rate": 5.4196733000214275e-06, "loss": 0.0046, "step": 6049 }, { "epoch": 4.325290437890974, "grad_norm": 0.028008505702018738, "learning_rate": 5.408372646628257e-06, "loss": 0.0034, "step": 6050 }, { "epoch": 4.325290437890974, "eval_loss": 0.007273905444890261, "eval_runtime": 4.58, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6050 }, { "epoch": 4.326005361930295, "grad_norm": 0.03276285156607628, "learning_rate": 5.397083113462986e-06, "loss": 0.0042, "step": 6051 }, { "epoch": 4.326720285969616, "grad_norm": 0.027031265199184418, "learning_rate": 5.385804703341002e-06, "loss": 0.0025, "step": 6052 }, { "epoch": 4.327435210008937, "grad_norm": 0.022846929728984833, "learning_rate": 5.374537419074893e-06, "loss": 0.0021, "step": 6053 }, { "epoch": 4.328150134048258, "grad_norm": 0.03044930286705494, "learning_rate": 5.36328126347449e-06, "loss": 0.0027, "step": 6054 }, { "epoch": 4.328865058087578, "grad_norm": 0.029896214604377747, "learning_rate": 5.352036239346858e-06, "loss": 0.004, "step": 6055 }, { "epoch": 4.328865058087578, "eval_loss": 0.007216135039925575, "eval_runtime": 4.5757, "eval_samples_per_second": 10.927, "eval_steps_per_second": 2.841, "step": 6055 }, { "epoch": 4.329579982126899, "grad_norm": 0.034001488238573074, "learning_rate": 5.340802349496254e-06, "loss": 0.0057, "step": 6056 }, { "epoch": 4.33029490616622, "grad_norm": 0.02940990962088108, "learning_rate": 5.329579596724188e-06, "loss": 0.0077, "step": 6057 }, { "epoch": 4.331009830205541, "grad_norm": 0.025526273995637894, "learning_rate": 5.318367983829392e-06, "loss": 0.0039, "step": 6058 }, { "epoch": 4.331724754244862, "grad_norm": 0.02409251220524311, 
"learning_rate": 5.307167513607786e-06, "loss": 0.0019, "step": 6059 }, { "epoch": 4.3324396782841825, "grad_norm": 0.02304650843143463, "learning_rate": 5.295978188852557e-06, "loss": 0.0021, "step": 6060 }, { "epoch": 4.3324396782841825, "eval_loss": 0.007186388596892357, "eval_runtime": 4.5953, "eval_samples_per_second": 10.881, "eval_steps_per_second": 2.829, "step": 6060 }, { "epoch": 4.3331546023235035, "grad_norm": 0.020068533718585968, "learning_rate": 5.284800012354074e-06, "loss": 0.0033, "step": 6061 }, { "epoch": 4.3338695263628235, "grad_norm": 0.03486838936805725, "learning_rate": 5.273632986899951e-06, "loss": 0.0043, "step": 6062 }, { "epoch": 4.3345844504021445, "grad_norm": 0.025326792150735855, "learning_rate": 5.2624771152750215e-06, "loss": 0.0023, "step": 6063 }, { "epoch": 4.335299374441465, "grad_norm": 0.02963690645992756, "learning_rate": 5.25133240026131e-06, "loss": 0.0063, "step": 6064 }, { "epoch": 4.336014298480786, "grad_norm": 0.035597216337919235, "learning_rate": 5.240198844638083e-06, "loss": 0.0035, "step": 6065 }, { "epoch": 4.336014298480786, "eval_loss": 0.007180005311965942, "eval_runtime": 4.6126, "eval_samples_per_second": 10.84, "eval_steps_per_second": 2.818, "step": 6065 }, { "epoch": 4.336729222520107, "grad_norm": 0.02492886409163475, "learning_rate": 5.229076451181836e-06, "loss": 0.0025, "step": 6066 }, { "epoch": 4.337444146559428, "grad_norm": 0.024477656930685043, "learning_rate": 5.217965222666238e-06, "loss": 0.0039, "step": 6067 }, { "epoch": 4.338159070598749, "grad_norm": 0.026809565722942352, "learning_rate": 5.206865161862212e-06, "loss": 0.0037, "step": 6068 }, { "epoch": 4.33887399463807, "grad_norm": 0.02504148706793785, "learning_rate": 5.195776271537894e-06, "loss": 0.0044, "step": 6069 }, { "epoch": 4.33958891867739, "grad_norm": 0.026276830583810806, "learning_rate": 5.184698554458595e-06, "loss": 0.0028, "step": 6070 }, { "epoch": 4.33958891867739, "eval_loss": 0.007190997712314129, "eval_runtime": 4.5856, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 6070 }, { "epoch": 4.340303842716711, "grad_norm": 0.025646749883890152, "learning_rate": 5.173632013386892e-06, "loss": 0.0039, "step": 6071 }, { "epoch": 4.341018766756032, "grad_norm": 0.021056555211544037, "learning_rate": 5.1625766510825404e-06, "loss": 0.0022, "step": 6072 }, { "epoch": 4.341733690795353, "grad_norm": 0.034965578466653824, "learning_rate": 5.151532470302523e-06, "loss": 0.0068, "step": 6073 }, { "epoch": 4.342448614834674, "grad_norm": 0.027481112629175186, "learning_rate": 5.140499473801036e-06, "loss": 0.0043, "step": 6074 }, { "epoch": 4.343163538873995, "grad_norm": 0.025980770587921143, "learning_rate": 5.129477664329463e-06, "loss": 0.0039, "step": 6075 }, { "epoch": 4.343163538873995, "eval_loss": 0.007182017900049686, "eval_runtime": 4.6118, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 6075 }, { "epoch": 4.343878462913316, "grad_norm": 0.029546210542321205, "learning_rate": 5.118467044636438e-06, "loss": 0.0033, "step": 6076 }, { "epoch": 4.344593386952637, "grad_norm": 0.030718745663762093, "learning_rate": 5.107467617467754e-06, "loss": 0.0053, "step": 6077 }, { "epoch": 4.345308310991957, "grad_norm": 0.025992272421717644, "learning_rate": 5.096479385566455e-06, "loss": 0.0033, "step": 6078 }, { "epoch": 4.346023235031278, "grad_norm": 0.02841196395456791, "learning_rate": 5.0855023516727885e-06, "loss": 0.0027, "step": 6079 }, { "epoch": 4.346738159070599, "grad_norm": 
0.02899874374270439, "learning_rate": 5.074536518524175e-06, "loss": 0.0032, "step": 6080 }, { "epoch": 4.346738159070599, "eval_loss": 0.007215059828013182, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 6080 }, { "epoch": 4.34745308310992, "grad_norm": 0.02670358307659626, "learning_rate": 5.063581888855284e-06, "loss": 0.0032, "step": 6081 }, { "epoch": 4.348168007149241, "grad_norm": 0.0219917930662632, "learning_rate": 5.05263846539798e-06, "loss": 0.0026, "step": 6082 }, { "epoch": 4.3488829311885615, "grad_norm": 0.03274204954504967, "learning_rate": 5.0417062508813005e-06, "loss": 0.0049, "step": 6083 }, { "epoch": 4.3495978552278824, "grad_norm": 0.029175039380788803, "learning_rate": 5.030785248031533e-06, "loss": 0.0039, "step": 6084 }, { "epoch": 4.3503127792672025, "grad_norm": 0.02585165575146675, "learning_rate": 5.019875459572143e-06, "loss": 0.0036, "step": 6085 }, { "epoch": 4.3503127792672025, "eval_loss": 0.007208025082945824, "eval_runtime": 4.5905, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 6085 }, { "epoch": 4.351027703306523, "grad_norm": 0.027851223945617676, "learning_rate": 5.008976888223799e-06, "loss": 0.0034, "step": 6086 }, { "epoch": 4.351742627345844, "grad_norm": 0.024726953357458115, "learning_rate": 4.9980895367043975e-06, "loss": 0.004, "step": 6087 }, { "epoch": 4.352457551385165, "grad_norm": 0.028289208188652992, "learning_rate": 4.987213407729003e-06, "loss": 0.0042, "step": 6088 }, { "epoch": 4.353172475424486, "grad_norm": 0.026194315403699875, "learning_rate": 4.9763485040098975e-06, "loss": 0.0026, "step": 6089 }, { "epoch": 4.353887399463807, "grad_norm": 0.03396173194050789, "learning_rate": 4.965494828256573e-06, "loss": 0.0043, "step": 6090 }, { "epoch": 4.353887399463807, "eval_loss": 0.007237749174237251, "eval_runtime": 4.6099, "eval_samples_per_second": 10.846, "eval_steps_per_second": 2.82, "step": 6090 }, { "epoch": 4.354602323503128, "grad_norm": 0.02792229689657688, "learning_rate": 4.954652383175695e-06, "loss": 0.0046, "step": 6091 }, { "epoch": 4.355317247542448, "grad_norm": 0.026405885815620422, "learning_rate": 4.943821171471158e-06, "loss": 0.0025, "step": 6092 }, { "epoch": 4.356032171581769, "grad_norm": 0.03529457747936249, "learning_rate": 4.933001195844034e-06, "loss": 0.0046, "step": 6093 }, { "epoch": 4.35674709562109, "grad_norm": 0.026236608624458313, "learning_rate": 4.922192458992608e-06, "loss": 0.0028, "step": 6094 }, { "epoch": 4.357462019660411, "grad_norm": 0.02382282353937626, "learning_rate": 4.911394963612359e-06, "loss": 0.0025, "step": 6095 }, { "epoch": 4.357462019660411, "eval_loss": 0.0071966578252613544, "eval_runtime": 4.6102, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 6095 }, { "epoch": 4.358176943699732, "grad_norm": 0.025155313313007355, "learning_rate": 4.900608712395943e-06, "loss": 0.0034, "step": 6096 }, { "epoch": 4.358891867739053, "grad_norm": 0.020982451736927032, "learning_rate": 4.889833708033248e-06, "loss": 0.0026, "step": 6097 }, { "epoch": 4.359606791778374, "grad_norm": 0.02474331110715866, "learning_rate": 4.879069953211313e-06, "loss": 0.0025, "step": 6098 }, { "epoch": 4.360321715817694, "grad_norm": 0.03801548853516579, "learning_rate": 4.868317450614407e-06, "loss": 0.0037, "step": 6099 }, { "epoch": 4.361036639857015, "grad_norm": 0.022563515231013298, "learning_rate": 4.857576202923986e-06, "loss": 0.003, "step": 6100 }, { "epoch": 4.361036639857015, "eval_loss": 
0.007221470586955547, "eval_runtime": 4.613, "eval_samples_per_second": 10.839, "eval_steps_per_second": 2.818, "step": 6100 }, { "epoch": 4.361751563896336, "grad_norm": 0.026228290051221848, "learning_rate": 4.846846212818684e-06, "loss": 0.0036, "step": 6101 }, { "epoch": 4.362466487935657, "grad_norm": 0.02960488758981228, "learning_rate": 4.836127482974345e-06, "loss": 0.0041, "step": 6102 }, { "epoch": 4.363181411974978, "grad_norm": 0.03314942866563797, "learning_rate": 4.825420016064009e-06, "loss": 0.0064, "step": 6103 }, { "epoch": 4.363896336014299, "grad_norm": 0.024127431213855743, "learning_rate": 4.814723814757871e-06, "loss": 0.003, "step": 6104 }, { "epoch": 4.3646112600536195, "grad_norm": 0.024583963677287102, "learning_rate": 4.80403888172336e-06, "loss": 0.0024, "step": 6105 }, { "epoch": 4.3646112600536195, "eval_loss": 0.007195612415671349, "eval_runtime": 4.579, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 6105 }, { "epoch": 4.3653261840929405, "grad_norm": 0.022926440462470055, "learning_rate": 4.793365219625079e-06, "loss": 0.0022, "step": 6106 }, { "epoch": 4.3660411081322605, "grad_norm": 0.026070987805724144, "learning_rate": 4.7827028311248024e-06, "loss": 0.0033, "step": 6107 }, { "epoch": 4.3667560321715815, "grad_norm": 0.03119768016040325, "learning_rate": 4.772051718881532e-06, "loss": 0.0039, "step": 6108 }, { "epoch": 4.367470956210902, "grad_norm": 0.03008115477859974, "learning_rate": 4.761411885551409e-06, "loss": 0.0025, "step": 6109 }, { "epoch": 4.368185880250223, "grad_norm": 0.02951090969145298, "learning_rate": 4.750783333787795e-06, "loss": 0.0047, "step": 6110 }, { "epoch": 4.368185880250223, "eval_loss": 0.0072181979194283485, "eval_runtime": 4.6014, "eval_samples_per_second": 10.866, "eval_steps_per_second": 2.825, "step": 6110 }, { "epoch": 4.368900804289544, "grad_norm": 0.031075600534677505, "learning_rate": 4.740166066241247e-06, "loss": 0.0039, "step": 6111 }, { "epoch": 4.369615728328865, "grad_norm": 0.023188503459095955, "learning_rate": 4.729560085559476e-06, "loss": 0.0022, "step": 6112 }, { "epoch": 4.370330652368186, "grad_norm": 0.03555607795715332, "learning_rate": 4.718965394387387e-06, "loss": 0.0023, "step": 6113 }, { "epoch": 4.371045576407507, "grad_norm": 0.030095694586634636, "learning_rate": 4.7083819953671e-06, "loss": 0.0033, "step": 6114 }, { "epoch": 4.371760500446827, "grad_norm": 0.03891676664352417, "learning_rate": 4.697809891137877e-06, "loss": 0.0026, "step": 6115 }, { "epoch": 4.371760500446827, "eval_loss": 0.0071942913345992565, "eval_runtime": 4.5857, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 6115 }, { "epoch": 4.372475424486148, "grad_norm": 0.02946421317756176, "learning_rate": 4.687249084336182e-06, "loss": 0.0029, "step": 6116 }, { "epoch": 4.373190348525469, "grad_norm": 0.023475656285881996, "learning_rate": 4.676699577595667e-06, "loss": 0.0023, "step": 6117 }, { "epoch": 4.37390527256479, "grad_norm": 0.031094815582036972, "learning_rate": 4.666161373547162e-06, "loss": 0.0064, "step": 6118 }, { "epoch": 4.374620196604111, "grad_norm": 0.02879992313683033, "learning_rate": 4.655634474818682e-06, "loss": 0.0034, "step": 6119 }, { "epoch": 4.375335120643432, "grad_norm": 0.028308173641562462, "learning_rate": 4.6451188840353985e-06, "loss": 0.0057, "step": 6120 }, { "epoch": 4.375335120643432, "eval_loss": 0.007174867670983076, "eval_runtime": 4.6105, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 6120 }, { 
"epoch": 4.376050044682753, "grad_norm": 0.02317548170685768, "learning_rate": 4.634614603819704e-06, "loss": 0.0025, "step": 6121 }, { "epoch": 4.376764968722073, "grad_norm": 0.02380523458123207, "learning_rate": 4.624121636791129e-06, "loss": 0.0028, "step": 6122 }, { "epoch": 4.377479892761394, "grad_norm": 0.02514830231666565, "learning_rate": 4.613639985566409e-06, "loss": 0.0035, "step": 6123 }, { "epoch": 4.378194816800715, "grad_norm": 0.032240066677331924, "learning_rate": 4.603169652759465e-06, "loss": 0.005, "step": 6124 }, { "epoch": 4.378909740840036, "grad_norm": 0.028875377029180527, "learning_rate": 4.592710640981352e-06, "loss": 0.0024, "step": 6125 }, { "epoch": 4.378909740840036, "eval_loss": 0.007218371145427227, "eval_runtime": 4.5825, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6125 }, { "epoch": 4.379624664879357, "grad_norm": 0.020841659978032112, "learning_rate": 4.5822629528403545e-06, "loss": 0.0024, "step": 6126 }, { "epoch": 4.380339588918678, "grad_norm": 0.024649037048220634, "learning_rate": 4.5718265909419075e-06, "loss": 0.0024, "step": 6127 }, { "epoch": 4.3810545129579985, "grad_norm": 0.025172889232635498, "learning_rate": 4.561401557888606e-06, "loss": 0.0024, "step": 6128 }, { "epoch": 4.381769436997319, "grad_norm": 0.030305735766887665, "learning_rate": 4.550987856280253e-06, "loss": 0.0029, "step": 6129 }, { "epoch": 4.3824843610366395, "grad_norm": 0.025836138054728508, "learning_rate": 4.540585488713817e-06, "loss": 0.0036, "step": 6130 }, { "epoch": 4.3824843610366395, "eval_loss": 0.007225135341286659, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6130 }, { "epoch": 4.3831992850759605, "grad_norm": 0.026671001687645912, "learning_rate": 4.53019445778341e-06, "loss": 0.0042, "step": 6131 }, { "epoch": 4.383914209115281, "grad_norm": 0.027074165642261505, "learning_rate": 4.5198147660803605e-06, "loss": 0.0032, "step": 6132 }, { "epoch": 4.384629133154602, "grad_norm": 0.030329477041959763, "learning_rate": 4.5094464161931305e-06, "loss": 0.0024, "step": 6133 }, { "epoch": 4.385344057193923, "grad_norm": 0.02320021763443947, "learning_rate": 4.499089410707385e-06, "loss": 0.0035, "step": 6134 }, { "epoch": 4.386058981233244, "grad_norm": 0.03162852302193642, "learning_rate": 4.488743752205949e-06, "loss": 0.0036, "step": 6135 }, { "epoch": 4.386058981233244, "eval_loss": 0.007202134467661381, "eval_runtime": 4.5858, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 6135 }, { "epoch": 4.386773905272565, "grad_norm": 0.026605775579810143, "learning_rate": 4.4784094432688e-06, "loss": 0.0025, "step": 6136 }, { "epoch": 4.387488829311885, "grad_norm": 0.035729438066482544, "learning_rate": 4.468086486473111e-06, "loss": 0.0063, "step": 6137 }, { "epoch": 4.388203753351206, "grad_norm": 0.02478761598467827, "learning_rate": 4.457774884393207e-06, "loss": 0.0019, "step": 6138 }, { "epoch": 4.388918677390527, "grad_norm": 0.023109570145606995, "learning_rate": 4.4474746396005965e-06, "loss": 0.0031, "step": 6139 }, { "epoch": 4.389633601429848, "grad_norm": 0.024631792679429054, "learning_rate": 4.437185754663953e-06, "loss": 0.0029, "step": 6140 }, { "epoch": 4.389633601429848, "eval_loss": 0.007202628068625927, "eval_runtime": 4.5799, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6140 }, { "epoch": 4.390348525469169, "grad_norm": 0.025627097114920616, "learning_rate": 4.426908232149091e-06, "loss": 0.0042, "step": 
6141 }, { "epoch": 4.39106344950849, "grad_norm": 0.0296808909624815, "learning_rate": 4.41664207461902e-06, "loss": 0.005, "step": 6142 }, { "epoch": 4.391778373547811, "grad_norm": 0.01944749429821968, "learning_rate": 4.406387284633923e-06, "loss": 0.0015, "step": 6143 }, { "epoch": 4.392493297587132, "grad_norm": 0.016933538019657135, "learning_rate": 4.3961438647511065e-06, "loss": 0.0017, "step": 6144 }, { "epoch": 4.393208221626452, "grad_norm": 0.0290033258497715, "learning_rate": 4.38591181752509e-06, "loss": 0.0031, "step": 6145 }, { "epoch": 4.393208221626452, "eval_loss": 0.007239548023790121, "eval_runtime": 4.5865, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.834, "step": 6145 }, { "epoch": 4.393923145665773, "grad_norm": 0.03024812415242195, "learning_rate": 4.375691145507516e-06, "loss": 0.0067, "step": 6146 }, { "epoch": 4.394638069705094, "grad_norm": 0.04401356354355812, "learning_rate": 4.36548185124721e-06, "loss": 0.0044, "step": 6147 }, { "epoch": 4.395352993744415, "grad_norm": 0.024562012404203415, "learning_rate": 4.3552839372901775e-06, "loss": 0.0027, "step": 6148 }, { "epoch": 4.396067917783736, "grad_norm": 0.027805421501398087, "learning_rate": 4.345097406179543e-06, "loss": 0.0036, "step": 6149 }, { "epoch": 4.396782841823057, "grad_norm": 0.026966681703925133, "learning_rate": 4.334922260455626e-06, "loss": 0.0019, "step": 6150 }, { "epoch": 4.396782841823057, "eval_loss": 0.007285743951797485, "eval_runtime": 4.5814, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 6150 }, { "epoch": 4.3974977658623775, "grad_norm": 0.030561141669750214, "learning_rate": 4.324758502655907e-06, "loss": 0.0047, "step": 6151 }, { "epoch": 4.398212689901698, "grad_norm": 0.031599853187799454, "learning_rate": 4.314606135314997e-06, "loss": 0.0052, "step": 6152 }, { "epoch": 4.3989276139410185, "grad_norm": 0.032108500599861145, "learning_rate": 4.304465160964699e-06, "loss": 0.0046, "step": 6153 }, { "epoch": 4.399642537980339, "grad_norm": 0.026500577107071877, "learning_rate": 4.294335582133968e-06, "loss": 0.0038, "step": 6154 }, { "epoch": 4.40035746201966, "grad_norm": 0.02762567065656185, "learning_rate": 4.284217401348889e-06, "loss": 0.0029, "step": 6155 }, { "epoch": 4.40035746201966, "eval_loss": 0.007204790599644184, "eval_runtime": 4.6184, "eval_samples_per_second": 10.826, "eval_steps_per_second": 2.815, "step": 6155 }, { "epoch": 4.401072386058981, "grad_norm": 0.022127121686935425, "learning_rate": 4.274110621132754e-06, "loss": 0.0022, "step": 6156 }, { "epoch": 4.401787310098302, "grad_norm": 0.031932469457387924, "learning_rate": 4.264015244005959e-06, "loss": 0.0032, "step": 6157 }, { "epoch": 4.402502234137623, "grad_norm": 0.023708045482635498, "learning_rate": 4.253931272486094e-06, "loss": 0.0021, "step": 6158 }, { "epoch": 4.403217158176943, "grad_norm": 0.025987805798649788, "learning_rate": 4.2438587090879e-06, "loss": 0.0036, "step": 6159 }, { "epoch": 4.403932082216264, "grad_norm": 0.024010401219129562, "learning_rate": 4.233797556323243e-06, "loss": 0.0033, "step": 6160 }, { "epoch": 4.403932082216264, "eval_loss": 0.00721084326505661, "eval_runtime": 4.5846, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 6160 }, { "epoch": 4.404647006255585, "grad_norm": 0.03477980196475983, "learning_rate": 4.223747816701196e-06, "loss": 0.0063, "step": 6161 }, { "epoch": 4.405361930294906, "grad_norm": 0.03380696848034859, "learning_rate": 4.21370949272793e-06, "loss": 0.0039, "step": 6162 
}, { "epoch": 4.406076854334227, "grad_norm": 0.03149792179465294, "learning_rate": 4.20368258690681e-06, "loss": 0.0033, "step": 6163 }, { "epoch": 4.406791778373548, "grad_norm": 0.03053572028875351, "learning_rate": 4.193667101738336e-06, "loss": 0.0043, "step": 6164 }, { "epoch": 4.407506702412869, "grad_norm": 0.024395186454057693, "learning_rate": 4.183663039720159e-06, "loss": 0.0039, "step": 6165 }, { "epoch": 4.407506702412869, "eval_loss": 0.007181425578892231, "eval_runtime": 4.5946, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 6165 }, { "epoch": 4.40822162645219, "grad_norm": 0.023775646463036537, "learning_rate": 4.17367040334708e-06, "loss": 0.0026, "step": 6166 }, { "epoch": 4.40893655049151, "grad_norm": 0.029457177966833115, "learning_rate": 4.163689195111076e-06, "loss": 0.0054, "step": 6167 }, { "epoch": 4.409651474530831, "grad_norm": 0.023847362026572227, "learning_rate": 4.153719417501223e-06, "loss": 0.0022, "step": 6168 }, { "epoch": 4.410366398570152, "grad_norm": 0.033984120935201645, "learning_rate": 4.14376107300381e-06, "loss": 0.0038, "step": 6169 }, { "epoch": 4.411081322609473, "grad_norm": 0.023484567180275917, "learning_rate": 4.133814164102212e-06, "loss": 0.0019, "step": 6170 }, { "epoch": 4.411081322609473, "eval_loss": 0.007154523395001888, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 6170 }, { "epoch": 4.411796246648794, "grad_norm": 0.02995440550148487, "learning_rate": 4.123878693276994e-06, "loss": 0.0037, "step": 6171 }, { "epoch": 4.412511170688115, "grad_norm": 0.023821447044610977, "learning_rate": 4.113954663005864e-06, "loss": 0.0024, "step": 6172 }, { "epoch": 4.4132260947274355, "grad_norm": 0.026901138946413994, "learning_rate": 4.104042075763659e-06, "loss": 0.0054, "step": 6173 }, { "epoch": 4.4139410187667565, "grad_norm": 0.029550211504101753, "learning_rate": 4.094140934022378e-06, "loss": 0.0029, "step": 6174 }, { "epoch": 4.4146559428060765, "grad_norm": 0.021958963945508003, "learning_rate": 4.084251240251163e-06, "loss": 0.0018, "step": 6175 }, { "epoch": 4.4146559428060765, "eval_loss": 0.007150876801460981, "eval_runtime": 4.5792, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 6175 }, { "epoch": 4.4153708668453975, "grad_norm": 0.0306364968419075, "learning_rate": 4.0743729969162925e-06, "loss": 0.0026, "step": 6176 }, { "epoch": 4.416085790884718, "grad_norm": 0.021342923864722252, "learning_rate": 4.064506206481195e-06, "loss": 0.0019, "step": 6177 }, { "epoch": 4.416800714924039, "grad_norm": 0.02258634939789772, "learning_rate": 4.054650871406451e-06, "loss": 0.0026, "step": 6178 }, { "epoch": 4.41751563896336, "grad_norm": 0.022842122241854668, "learning_rate": 4.044806994149769e-06, "loss": 0.002, "step": 6179 }, { "epoch": 4.418230563002681, "grad_norm": 0.02842799201607704, "learning_rate": 4.034974577166023e-06, "loss": 0.0047, "step": 6180 }, { "epoch": 4.418230563002681, "eval_loss": 0.007159036118537188, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 6180 }, { "epoch": 4.418945487042002, "grad_norm": 0.02819244936108589, "learning_rate": 4.025153622907191e-06, "loss": 0.0027, "step": 6181 }, { "epoch": 4.419660411081322, "grad_norm": 0.03171747922897339, "learning_rate": 4.015344133822418e-06, "loss": 0.004, "step": 6182 }, { "epoch": 4.420375335120643, "grad_norm": 0.04445064440369606, "learning_rate": 4.005546112358016e-06, "loss": 0.0065, "step": 6183 
}, { "epoch": 4.421090259159964, "grad_norm": 0.02369259111583233, "learning_rate": 3.995759560957379e-06, "loss": 0.0022, "step": 6184 }, { "epoch": 4.421805183199285, "grad_norm": 0.02696416527032852, "learning_rate": 3.9859844820610896e-06, "loss": 0.0024, "step": 6185 }, { "epoch": 4.421805183199285, "eval_loss": 0.007188236340880394, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6185 }, { "epoch": 4.422520107238606, "grad_norm": 0.036213990300893784, "learning_rate": 3.9762208781068285e-06, "loss": 0.0058, "step": 6186 }, { "epoch": 4.423235031277927, "grad_norm": 0.02646007388830185, "learning_rate": 3.9664687515294565e-06, "loss": 0.0036, "step": 6187 }, { "epoch": 4.423949955317248, "grad_norm": 0.025758489966392517, "learning_rate": 3.956728104760943e-06, "loss": 0.0029, "step": 6188 }, { "epoch": 4.424664879356568, "grad_norm": 0.02865190990269184, "learning_rate": 3.946998940230401e-06, "loss": 0.0028, "step": 6189 }, { "epoch": 4.425379803395889, "grad_norm": 0.019528202712535858, "learning_rate": 3.937281260364084e-06, "loss": 0.0018, "step": 6190 }, { "epoch": 4.425379803395889, "eval_loss": 0.0071350461803376675, "eval_runtime": 4.5852, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 6190 }, { "epoch": 4.42609472743521, "grad_norm": 0.0271197147667408, "learning_rate": 3.927575067585393e-06, "loss": 0.002, "step": 6191 }, { "epoch": 4.426809651474531, "grad_norm": 0.024152159690856934, "learning_rate": 3.91788036431483e-06, "loss": 0.0023, "step": 6192 }, { "epoch": 4.427524575513852, "grad_norm": 0.03284483402967453, "learning_rate": 3.908197152970072e-06, "loss": 0.0048, "step": 6193 }, { "epoch": 4.428239499553173, "grad_norm": 0.026249583810567856, "learning_rate": 3.898525435965894e-06, "loss": 0.0032, "step": 6194 }, { "epoch": 4.428954423592494, "grad_norm": 0.03276562690734863, "learning_rate": 3.8888652157142324e-06, "loss": 0.0047, "step": 6195 }, { "epoch": 4.428954423592494, "eval_loss": 0.007150932680815458, "eval_runtime": 4.6125, "eval_samples_per_second": 10.84, "eval_steps_per_second": 2.818, "step": 6195 }, { "epoch": 4.4296693476318145, "grad_norm": 0.026001950725913048, "learning_rate": 3.87921649462415e-06, "loss": 0.0035, "step": 6196 }, { "epoch": 4.430384271671135, "grad_norm": 0.035985253751277924, "learning_rate": 3.869579275101826e-06, "loss": 0.0059, "step": 6197 }, { "epoch": 4.4310991957104555, "grad_norm": 0.03361751511693001, "learning_rate": 3.8599535595505884e-06, "loss": 0.0052, "step": 6198 }, { "epoch": 4.4318141197497765, "grad_norm": 0.02803066000342369, "learning_rate": 3.850339350370896e-06, "loss": 0.0046, "step": 6199 }, { "epoch": 4.432529043789097, "grad_norm": 0.027508046478033066, "learning_rate": 3.840736649960325e-06, "loss": 0.0018, "step": 6200 }, { "epoch": 4.432529043789097, "eval_loss": 0.0070962985046207905, "eval_runtime": 4.6087, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 6200 }, { "epoch": 4.433243967828418, "grad_norm": 0.02744363807141781, "learning_rate": 3.831145460713592e-06, "loss": 0.0024, "step": 6201 }, { "epoch": 4.433958891867739, "grad_norm": 0.024622460827231407, "learning_rate": 3.821565785022552e-06, "loss": 0.0029, "step": 6202 }, { "epoch": 4.43467381590706, "grad_norm": 0.022869501262903214, "learning_rate": 3.8119976252761648e-06, "loss": 0.0024, "step": 6203 }, { "epoch": 4.435388739946381, "grad_norm": 0.022725338116288185, "learning_rate": 3.802440983860528e-06, "loss": 0.0027, "step": 
6204 }, { "epoch": 4.436103663985701, "grad_norm": 0.027030099183321, "learning_rate": 3.7928958631588797e-06, "loss": 0.0022, "step": 6205 }, { "epoch": 4.436103663985701, "eval_loss": 0.007065132725983858, "eval_runtime": 4.5974, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 6205 }, { "epoch": 4.436818588025022, "grad_norm": 0.026230763643980026, "learning_rate": 3.7833622655515767e-06, "loss": 0.0034, "step": 6206 }, { "epoch": 4.437533512064343, "grad_norm": 0.03346811980009079, "learning_rate": 3.7738401934161005e-06, "loss": 0.0066, "step": 6207 }, { "epoch": 4.438248436103664, "grad_norm": 0.031450577080249786, "learning_rate": 3.7643296491270465e-06, "loss": 0.007, "step": 6208 }, { "epoch": 4.438963360142985, "grad_norm": 0.021568378433585167, "learning_rate": 3.7548306350561666e-06, "loss": 0.0024, "step": 6209 }, { "epoch": 4.439678284182306, "grad_norm": 0.029898973181843758, "learning_rate": 3.745343153572295e-06, "loss": 0.0057, "step": 6210 }, { "epoch": 4.439678284182306, "eval_loss": 0.007063729222863913, "eval_runtime": 4.5752, "eval_samples_per_second": 10.929, "eval_steps_per_second": 2.841, "step": 6210 }, { "epoch": 4.440393208221627, "grad_norm": 0.026340194046497345, "learning_rate": 3.735867207041427e-06, "loss": 0.0029, "step": 6211 }, { "epoch": 4.441108132260947, "grad_norm": 0.02619379758834839, "learning_rate": 3.7264027978266723e-06, "loss": 0.0029, "step": 6212 }, { "epoch": 4.441823056300268, "grad_norm": 0.02372012287378311, "learning_rate": 3.716949928288244e-06, "loss": 0.0022, "step": 6213 }, { "epoch": 4.442537980339589, "grad_norm": 0.025198524817824364, "learning_rate": 3.707508600783499e-06, "loss": 0.0033, "step": 6214 }, { "epoch": 4.44325290437891, "grad_norm": 0.021328197792172432, "learning_rate": 3.6980788176669223e-06, "loss": 0.0026, "step": 6215 }, { "epoch": 4.44325290437891, "eval_loss": 0.007032093591988087, "eval_runtime": 4.6104, "eval_samples_per_second": 10.845, "eval_steps_per_second": 2.82, "step": 6215 }, { "epoch": 4.443967828418231, "grad_norm": 0.022853132337331772, "learning_rate": 3.6886605812900766e-06, "loss": 0.0026, "step": 6216 }, { "epoch": 4.444682752457552, "grad_norm": 0.03526567667722702, "learning_rate": 3.679253894001705e-06, "loss": 0.0032, "step": 6217 }, { "epoch": 4.445397676496873, "grad_norm": 0.02135223150253296, "learning_rate": 3.669858758147621e-06, "loss": 0.0023, "step": 6218 }, { "epoch": 4.446112600536193, "grad_norm": 0.021354110911488533, "learning_rate": 3.660475176070777e-06, "loss": 0.0021, "step": 6219 }, { "epoch": 4.446827524575514, "grad_norm": 0.022455645725131035, "learning_rate": 3.6511031501112623e-06, "loss": 0.0034, "step": 6220 }, { "epoch": 4.446827524575514, "eval_loss": 0.0070450203493237495, "eval_runtime": 4.5927, "eval_samples_per_second": 10.887, "eval_steps_per_second": 2.831, "step": 6220 }, { "epoch": 4.4475424486148345, "grad_norm": 0.036630310118198395, "learning_rate": 3.6417426826062416e-06, "loss": 0.0057, "step": 6221 }, { "epoch": 4.448257372654155, "grad_norm": 0.03520859405398369, "learning_rate": 3.632393775890036e-06, "loss": 0.0048, "step": 6222 }, { "epoch": 4.448972296693476, "grad_norm": 0.025329947471618652, "learning_rate": 3.623056432294075e-06, "loss": 0.0036, "step": 6223 }, { "epoch": 4.449687220732797, "grad_norm": 0.030173631384968758, "learning_rate": 3.6137306541468796e-06, "loss": 0.003, "step": 6224 }, { "epoch": 4.450402144772118, "grad_norm": 0.033882394433021545, "learning_rate": 3.6044164437741167e-06, "loss": 
0.0031, "step": 6225 }, { "epoch": 4.450402144772118, "eval_loss": 0.007059859577566385, "eval_runtime": 4.5871, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 6225 }, { "epoch": 4.451117068811438, "grad_norm": 0.0332101546227932, "learning_rate": 3.5951138034985567e-06, "loss": 0.005, "step": 6226 }, { "epoch": 4.451831992850759, "grad_norm": 0.03640732541680336, "learning_rate": 3.5858227356400876e-06, "loss": 0.004, "step": 6227 }, { "epoch": 4.45254691689008, "grad_norm": 0.027424249798059464, "learning_rate": 3.576543242515712e-06, "loss": 0.0028, "step": 6228 }, { "epoch": 4.453261840929401, "grad_norm": 0.02990092523396015, "learning_rate": 3.567275326439534e-06, "loss": 0.004, "step": 6229 }, { "epoch": 4.453976764968722, "grad_norm": 0.024622362107038498, "learning_rate": 3.5580189897227777e-06, "loss": 0.003, "step": 6230 }, { "epoch": 4.453976764968722, "eval_loss": 0.007085258141160011, "eval_runtime": 4.6108, "eval_samples_per_second": 10.844, "eval_steps_per_second": 2.819, "step": 6230 }, { "epoch": 4.454691689008043, "grad_norm": 0.02701437659561634, "learning_rate": 3.5487742346738017e-06, "loss": 0.0025, "step": 6231 }, { "epoch": 4.455406613047364, "grad_norm": 0.022135360166430473, "learning_rate": 3.539541063598034e-06, "loss": 0.0018, "step": 6232 }, { "epoch": 4.456121537086685, "grad_norm": 0.026358241215348244, "learning_rate": 3.53031947879805e-06, "loss": 0.0046, "step": 6233 }, { "epoch": 4.456836461126005, "grad_norm": 0.02771388366818428, "learning_rate": 3.5211094825735145e-06, "loss": 0.0037, "step": 6234 }, { "epoch": 4.457551385165326, "grad_norm": 0.023383911699056625, "learning_rate": 3.511911077221208e-06, "loss": 0.0023, "step": 6235 }, { "epoch": 4.457551385165326, "eval_loss": 0.007083903532475233, "eval_runtime": 4.5792, "eval_samples_per_second": 10.919, "eval_steps_per_second": 2.839, "step": 6235 }, { "epoch": 4.458266309204647, "grad_norm": 0.026333507150411606, "learning_rate": 3.5027242650350343e-06, "loss": 0.0027, "step": 6236 }, { "epoch": 4.458981233243968, "grad_norm": 0.025562051683664322, "learning_rate": 3.4935490483059772e-06, "loss": 0.0032, "step": 6237 }, { "epoch": 4.459696157283289, "grad_norm": 0.031071266159415245, "learning_rate": 3.4843854293221512e-06, "loss": 0.0045, "step": 6238 }, { "epoch": 4.46041108132261, "grad_norm": 0.03182419762015343, "learning_rate": 3.475233410368789e-06, "loss": 0.0047, "step": 6239 }, { "epoch": 4.461126005361931, "grad_norm": 0.03077416680753231, "learning_rate": 3.4660929937281926e-06, "loss": 0.0045, "step": 6240 }, { "epoch": 4.461126005361931, "eval_loss": 0.0071056904271245, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6240 }, { "epoch": 4.4618409294012515, "grad_norm": 0.024482427164912224, "learning_rate": 3.4569641816798114e-06, "loss": 0.0019, "step": 6241 }, { "epoch": 4.462555853440572, "grad_norm": 0.03215309977531433, "learning_rate": 3.447846976500163e-06, "loss": 0.0022, "step": 6242 }, { "epoch": 4.4632707774798925, "grad_norm": 0.034093525260686874, "learning_rate": 3.4387413804628954e-06, "loss": 0.0031, "step": 6243 }, { "epoch": 4.4639857015192135, "grad_norm": 0.04083935171365738, "learning_rate": 3.42964739583877e-06, "loss": 0.0055, "step": 6244 }, { "epoch": 4.464700625558534, "grad_norm": 0.030319813638925552, "learning_rate": 3.4205650248956167e-06, "loss": 0.0048, "step": 6245 }, { "epoch": 4.464700625558534, "eval_loss": 0.007124540861696005, "eval_runtime": 4.5902, 
"eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 6245 }, { "epoch": 4.465415549597855, "grad_norm": 0.02412031963467598, "learning_rate": 3.4114942698984023e-06, "loss": 0.0021, "step": 6246 }, { "epoch": 4.466130473637176, "grad_norm": 0.022657975554466248, "learning_rate": 3.402435133109194e-06, "loss": 0.003, "step": 6247 }, { "epoch": 4.466845397676497, "grad_norm": 0.03063354268670082, "learning_rate": 3.3933876167871193e-06, "loss": 0.0035, "step": 6248 }, { "epoch": 4.467560321715817, "grad_norm": 0.0212885569781065, "learning_rate": 3.384351723188489e-06, "loss": 0.0035, "step": 6249 }, { "epoch": 4.468275245755138, "grad_norm": 0.02833453193306923, "learning_rate": 3.375327454566629e-06, "loss": 0.0037, "step": 6250 }, { "epoch": 4.468275245755138, "eval_loss": 0.007092955056577921, "eval_runtime": 4.6072, "eval_samples_per_second": 10.853, "eval_steps_per_second": 2.822, "step": 6250 }, { "epoch": 4.468990169794459, "grad_norm": 0.02888507768511772, "learning_rate": 3.3663148131720223e-06, "loss": 0.0059, "step": 6251 }, { "epoch": 4.46970509383378, "grad_norm": 0.029433289542794228, "learning_rate": 3.3573138012522375e-06, "loss": 0.0025, "step": 6252 }, { "epoch": 4.470420017873101, "grad_norm": 0.028789574280381203, "learning_rate": 3.3483244210519292e-06, "loss": 0.0024, "step": 6253 }, { "epoch": 4.471134941912422, "grad_norm": 0.025397194549441338, "learning_rate": 3.3393466748128654e-06, "loss": 0.0024, "step": 6254 }, { "epoch": 4.471849865951743, "grad_norm": 0.028686845675110817, "learning_rate": 3.330380564773922e-06, "loss": 0.0061, "step": 6255 }, { "epoch": 4.471849865951743, "eval_loss": 0.0070550027303397655, "eval_runtime": 4.5763, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.841, "step": 6255 }, { "epoch": 4.472564789991063, "grad_norm": 0.023312494158744812, "learning_rate": 3.321426093171043e-06, "loss": 0.0018, "step": 6256 }, { "epoch": 4.473279714030384, "grad_norm": 0.022054992616176605, "learning_rate": 3.3124832622373036e-06, "loss": 0.0028, "step": 6257 }, { "epoch": 4.473994638069705, "grad_norm": 0.025229327380657196, "learning_rate": 3.303552074202848e-06, "loss": 0.0037, "step": 6258 }, { "epoch": 4.474709562109026, "grad_norm": 0.027017204090952873, "learning_rate": 3.294632531294933e-06, "loss": 0.004, "step": 6259 }, { "epoch": 4.475424486148347, "grad_norm": 0.02650728076696396, "learning_rate": 3.285724635737919e-06, "loss": 0.0041, "step": 6260 }, { "epoch": 4.475424486148347, "eval_loss": 0.007095033302903175, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6260 }, { "epoch": 4.476139410187668, "grad_norm": 0.031718626618385315, "learning_rate": 3.276828389753234e-06, "loss": 0.0065, "step": 6261 }, { "epoch": 4.476854334226989, "grad_norm": 0.0269191712141037, "learning_rate": 3.267943795559425e-06, "loss": 0.0037, "step": 6262 }, { "epoch": 4.47756925826631, "grad_norm": 0.03827166184782982, "learning_rate": 3.2590708553721318e-06, "loss": 0.0059, "step": 6263 }, { "epoch": 4.47828418230563, "grad_norm": 0.0256330668926239, "learning_rate": 3.250209571404067e-06, "loss": 0.003, "step": 6264 }, { "epoch": 4.478999106344951, "grad_norm": 0.029430018737912178, "learning_rate": 3.2413599458650746e-06, "loss": 0.0044, "step": 6265 }, { "epoch": 4.478999106344951, "eval_loss": 0.007097229361534119, "eval_runtime": 4.5879, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.834, "step": 6265 }, { "epoch": 4.4797140303842715, "grad_norm": 
0.02275266870856285, "learning_rate": 3.232521980962039e-06, "loss": 0.0028, "step": 6266 }, { "epoch": 4.4804289544235925, "grad_norm": 0.02650543302297592, "learning_rate": 3.22369567889898e-06, "loss": 0.0032, "step": 6267 }, { "epoch": 4.481143878462913, "grad_norm": 0.027856577187776566, "learning_rate": 3.214881041877005e-06, "loss": 0.0036, "step": 6268 }, { "epoch": 4.481858802502234, "grad_norm": 0.037651631981134415, "learning_rate": 3.2060780720942762e-06, "loss": 0.0043, "step": 6269 }, { "epoch": 4.482573726541555, "grad_norm": 0.028884870931506157, "learning_rate": 3.197286771746094e-06, "loss": 0.0032, "step": 6270 }, { "epoch": 4.482573726541555, "eval_loss": 0.007053326349705458, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6270 }, { "epoch": 4.483288650580876, "grad_norm": 0.028368350118398666, "learning_rate": 3.1885071430248158e-06, "loss": 0.0033, "step": 6271 }, { "epoch": 4.484003574620196, "grad_norm": 0.03310739994049072, "learning_rate": 3.1797391881199014e-06, "loss": 0.002, "step": 6272 }, { "epoch": 4.484718498659517, "grad_norm": 0.027994254603981972, "learning_rate": 3.170982909217907e-06, "loss": 0.0023, "step": 6273 }, { "epoch": 4.485433422698838, "grad_norm": 0.025332080200314522, "learning_rate": 3.162238308502452e-06, "loss": 0.0028, "step": 6274 }, { "epoch": 4.486148346738159, "grad_norm": 0.02662665583193302, "learning_rate": 3.1535053881542655e-06, "loss": 0.0039, "step": 6275 }, { "epoch": 4.486148346738159, "eval_loss": 0.007062715943902731, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6275 }, { "epoch": 4.48686327077748, "grad_norm": 0.030798865482211113, "learning_rate": 3.14478415035116e-06, "loss": 0.0043, "step": 6276 }, { "epoch": 4.487578194816801, "grad_norm": 0.029047923162579536, "learning_rate": 3.136074597268024e-06, "loss": 0.0052, "step": 6277 }, { "epoch": 4.488293118856122, "grad_norm": 0.027687542140483856, "learning_rate": 3.1273767310768364e-06, "loss": 0.0033, "step": 6278 }, { "epoch": 4.489008042895442, "grad_norm": 0.020460709929466248, "learning_rate": 3.118690553946685e-06, "loss": 0.0018, "step": 6279 }, { "epoch": 4.489722966934763, "grad_norm": 0.03156713396310806, "learning_rate": 3.110016068043703e-06, "loss": 0.0061, "step": 6280 }, { "epoch": 4.489722966934763, "eval_loss": 0.007045576348900795, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 6280 }, { "epoch": 4.490437890974084, "grad_norm": 0.02635188214480877, "learning_rate": 3.101353275531138e-06, "loss": 0.0031, "step": 6281 }, { "epoch": 4.491152815013405, "grad_norm": 0.025210583582520485, "learning_rate": 3.092702178569301e-06, "loss": 0.0047, "step": 6282 }, { "epoch": 4.491867739052726, "grad_norm": 0.028730466961860657, "learning_rate": 3.0840627793155994e-06, "loss": 0.0043, "step": 6283 }, { "epoch": 4.492582663092047, "grad_norm": 0.028377845883369446, "learning_rate": 3.0754350799245323e-06, "loss": 0.0038, "step": 6284 }, { "epoch": 4.493297587131368, "grad_norm": 0.023456022143363953, "learning_rate": 3.0668190825476504e-06, "loss": 0.0022, "step": 6285 }, { "epoch": 4.493297587131368, "eval_loss": 0.007050015032291412, "eval_runtime": 4.6043, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 6285 }, { "epoch": 4.494012511170688, "grad_norm": 0.029227115213871002, "learning_rate": 3.058214789333613e-06, "loss": 0.0023, "step": 6286 }, { "epoch": 4.494727435210009, 
"grad_norm": 0.03468048572540283, "learning_rate": 3.0496222024281653e-06, "loss": 0.0034, "step": 6287 }, { "epoch": 4.4954423592493296, "grad_norm": 0.02372249960899353, "learning_rate": 3.0410413239740975e-06, "loss": 0.0025, "step": 6288 }, { "epoch": 4.4961572832886505, "grad_norm": 0.029684731736779213, "learning_rate": 3.0324721561113213e-06, "loss": 0.0028, "step": 6289 }, { "epoch": 4.496872207327971, "grad_norm": 0.031918834894895554, "learning_rate": 3.0239147009767987e-06, "loss": 0.0045, "step": 6290 }, { "epoch": 4.496872207327971, "eval_loss": 0.007100119721144438, "eval_runtime": 4.5789, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 6290 }, { "epoch": 4.497587131367292, "grad_norm": 0.024172428995370865, "learning_rate": 3.0153689607045845e-06, "loss": 0.0025, "step": 6291 }, { "epoch": 4.498302055406613, "grad_norm": 0.034460462629795074, "learning_rate": 3.0068349374258176e-06, "loss": 0.0067, "step": 6292 }, { "epoch": 4.499016979445934, "grad_norm": 0.02211238257586956, "learning_rate": 2.9983126332686794e-06, "loss": 0.0019, "step": 6293 }, { "epoch": 4.499731903485254, "grad_norm": 0.02234301157295704, "learning_rate": 2.989802050358498e-06, "loss": 0.0026, "step": 6294 }, { "epoch": 4.500446827524575, "grad_norm": 0.026565195992588997, "learning_rate": 2.9813031908176024e-06, "loss": 0.0022, "step": 6295 }, { "epoch": 4.500446827524575, "eval_loss": 0.007144973613321781, "eval_runtime": 4.5881, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 6295 }, { "epoch": 4.501161751563896, "grad_norm": 0.042375318706035614, "learning_rate": 2.9728160567654484e-06, "loss": 0.0077, "step": 6296 }, { "epoch": 4.501876675603217, "grad_norm": 0.033143550157547, "learning_rate": 2.964340650318548e-06, "loss": 0.0043, "step": 6297 }, { "epoch": 4.502591599642538, "grad_norm": 0.029389504343271255, "learning_rate": 2.955876973590488e-06, "loss": 0.0027, "step": 6298 }, { "epoch": 4.503306523681859, "grad_norm": 0.02693074196577072, "learning_rate": 2.947425028691936e-06, "loss": 0.0043, "step": 6299 }, { "epoch": 4.50402144772118, "grad_norm": 0.028375854715704918, "learning_rate": 2.938984817730639e-06, "loss": 0.0029, "step": 6300 }, { "epoch": 4.50402144772118, "eval_loss": 0.007129484787583351, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6300 }, { "epoch": 4.504736371760501, "grad_norm": 0.02480219677090645, "learning_rate": 2.9305563428113968e-06, "loss": 0.0025, "step": 6301 }, { "epoch": 4.505451295799821, "grad_norm": 0.026062365621328354, "learning_rate": 2.9221396060361063e-06, "loss": 0.0037, "step": 6302 }, { "epoch": 4.506166219839142, "grad_norm": 0.01974448747932911, "learning_rate": 2.913734609503732e-06, "loss": 0.002, "step": 6303 }, { "epoch": 4.506881143878463, "grad_norm": 0.025260306894779205, "learning_rate": 2.905341355310287e-06, "loss": 0.0032, "step": 6304 }, { "epoch": 4.507596067917784, "grad_norm": 0.0238971970975399, "learning_rate": 2.8969598455489022e-06, "loss": 0.0027, "step": 6305 }, { "epoch": 4.507596067917784, "eval_loss": 0.007147557567805052, "eval_runtime": 4.5775, "eval_samples_per_second": 10.923, "eval_steps_per_second": 2.84, "step": 6305 }, { "epoch": 4.508310991957105, "grad_norm": 0.022259658202528954, "learning_rate": 2.8885900823097222e-06, "loss": 0.0022, "step": 6306 }, { "epoch": 4.509025915996426, "grad_norm": 0.03046254627406597, "learning_rate": 2.880232067680011e-06, "loss": 0.0034, "step": 6307 }, { "epoch": 
4.509740840035747, "grad_norm": 0.021836455911397934, "learning_rate": 2.87188580374409e-06, "loss": 0.0033, "step": 6308 }, { "epoch": 4.510455764075067, "grad_norm": 0.03232728689908981, "learning_rate": 2.863551292583322e-06, "loss": 0.0053, "step": 6309 }, { "epoch": 4.511170688114388, "grad_norm": 0.018922897055745125, "learning_rate": 2.855228536276183e-06, "loss": 0.0018, "step": 6310 }, { "epoch": 4.511170688114388, "eval_loss": 0.007105090189725161, "eval_runtime": 4.6024, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 6310 }, { "epoch": 4.5118856121537085, "grad_norm": 0.023055143654346466, "learning_rate": 2.8469175368981915e-06, "loss": 0.0022, "step": 6311 }, { "epoch": 4.5126005361930295, "grad_norm": 0.02482178620994091, "learning_rate": 2.8386182965219223e-06, "loss": 0.0036, "step": 6312 }, { "epoch": 4.51331546023235, "grad_norm": 0.026915740221738815, "learning_rate": 2.8303308172170582e-06, "loss": 0.0018, "step": 6313 }, { "epoch": 4.514030384271671, "grad_norm": 0.020234929397702217, "learning_rate": 2.8220551010503084e-06, "loss": 0.0022, "step": 6314 }, { "epoch": 4.514745308310992, "grad_norm": 0.023011820390820503, "learning_rate": 2.813791150085454e-06, "loss": 0.0025, "step": 6315 }, { "epoch": 4.514745308310992, "eval_loss": 0.007129116915166378, "eval_runtime": 4.5825, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6315 }, { "epoch": 4.515460232350312, "grad_norm": 0.029531264677643776, "learning_rate": 2.805538966383392e-06, "loss": 0.004, "step": 6316 }, { "epoch": 4.516175156389633, "grad_norm": 0.019086331129074097, "learning_rate": 2.797298552002009e-06, "loss": 0.0018, "step": 6317 }, { "epoch": 4.516890080428954, "grad_norm": 0.029939275234937668, "learning_rate": 2.7890699089963223e-06, "loss": 0.0028, "step": 6318 }, { "epoch": 4.517605004468275, "grad_norm": 0.03086797520518303, "learning_rate": 2.7808530394183575e-06, "loss": 0.0054, "step": 6319 }, { "epoch": 4.518319928507596, "grad_norm": 0.028534293174743652, "learning_rate": 2.7726479453172415e-06, "loss": 0.0028, "step": 6320 }, { "epoch": 4.518319928507596, "eval_loss": 0.007145327981561422, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 6320 }, { "epoch": 4.519034852546917, "grad_norm": 0.025769533589482307, "learning_rate": 2.7644546287391715e-06, "loss": 0.0035, "step": 6321 }, { "epoch": 4.519749776586238, "grad_norm": 0.03157801181077957, "learning_rate": 2.7562730917273625e-06, "loss": 0.0038, "step": 6322 }, { "epoch": 4.520464700625558, "grad_norm": 0.04427046328783035, "learning_rate": 2.7481033363221387e-06, "loss": 0.0041, "step": 6323 }, { "epoch": 4.521179624664879, "grad_norm": 0.02725585550069809, "learning_rate": 2.7399453645608697e-06, "loss": 0.0033, "step": 6324 }, { "epoch": 4.5218945487042, "grad_norm": 0.024170907214283943, "learning_rate": 2.7317991784779727e-06, "loss": 0.0035, "step": 6325 }, { "epoch": 4.5218945487042, "eval_loss": 0.007166360039263964, "eval_runtime": 4.5835, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6325 }, { "epoch": 4.522609472743521, "grad_norm": 0.025312259793281555, "learning_rate": 2.7236647801049453e-06, "loss": 0.0031, "step": 6326 }, { "epoch": 4.523324396782842, "grad_norm": 0.028761737048625946, "learning_rate": 2.715542171470342e-06, "loss": 0.0039, "step": 6327 }, { "epoch": 4.524039320822163, "grad_norm": 0.03268183767795563, "learning_rate": 2.707431354599754e-06, "loss": 0.0053, "step": 6328 }, 
{ "epoch": 4.524754244861484, "grad_norm": 0.026734478771686554, "learning_rate": 2.69933233151588e-06, "loss": 0.0041, "step": 6329 }, { "epoch": 4.525469168900805, "grad_norm": 0.029261041432619095, "learning_rate": 2.6912451042384214e-06, "loss": 0.0039, "step": 6330 }, { "epoch": 4.525469168900805, "eval_loss": 0.0071192351169884205, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6330 }, { "epoch": 4.526184092940126, "grad_norm": 0.022151879966259003, "learning_rate": 2.6831696747841804e-06, "loss": 0.0023, "step": 6331 }, { "epoch": 4.526899016979446, "grad_norm": 0.024307535961270332, "learning_rate": 2.675106045167003e-06, "loss": 0.0032, "step": 6332 }, { "epoch": 4.527613941018767, "grad_norm": 0.02850949391722679, "learning_rate": 2.667054217397774e-06, "loss": 0.0053, "step": 6333 }, { "epoch": 4.5283288650580875, "grad_norm": 0.03499632328748703, "learning_rate": 2.6590141934844715e-06, "loss": 0.0044, "step": 6334 }, { "epoch": 4.5290437890974085, "grad_norm": 0.025769367814064026, "learning_rate": 2.6509859754321077e-06, "loss": 0.0029, "step": 6335 }, { "epoch": 4.5290437890974085, "eval_loss": 0.0070981672033667564, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6335 }, { "epoch": 4.529758713136729, "grad_norm": 0.026010755449533463, "learning_rate": 2.6429695652427477e-06, "loss": 0.0033, "step": 6336 }, { "epoch": 4.53047363717605, "grad_norm": 0.025851259008049965, "learning_rate": 2.634964964915515e-06, "loss": 0.0035, "step": 6337 }, { "epoch": 4.531188561215371, "grad_norm": 0.023607606068253517, "learning_rate": 2.6269721764466014e-06, "loss": 0.0034, "step": 6338 }, { "epoch": 4.531903485254691, "grad_norm": 0.026818882673978806, "learning_rate": 2.618991201829235e-06, "loss": 0.0028, "step": 6339 }, { "epoch": 4.532618409294012, "grad_norm": 0.017449596896767616, "learning_rate": 2.6110220430537126e-06, "loss": 0.0017, "step": 6340 }, { "epoch": 4.532618409294012, "eval_loss": 0.007115814369171858, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6340 }, { "epoch": 4.533333333333333, "grad_norm": 0.03122848831117153, "learning_rate": 2.6030647021073718e-06, "loss": 0.0059, "step": 6341 }, { "epoch": 4.534048257372654, "grad_norm": 0.0302316565066576, "learning_rate": 2.5951191809746144e-06, "loss": 0.0031, "step": 6342 }, { "epoch": 4.534763181411975, "grad_norm": 0.030890582129359245, "learning_rate": 2.587185481636878e-06, "loss": 0.0042, "step": 6343 }, { "epoch": 4.535478105451296, "grad_norm": 0.026405489072203636, "learning_rate": 2.579263606072668e-06, "loss": 0.0024, "step": 6344 }, { "epoch": 4.536193029490617, "grad_norm": 0.03140515834093094, "learning_rate": 2.5713535562575443e-06, "loss": 0.0047, "step": 6345 }, { "epoch": 4.536193029490617, "eval_loss": 0.007136113476008177, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 6345 }, { "epoch": 4.536907953529937, "grad_norm": 0.02556844986975193, "learning_rate": 2.5634553341640945e-06, "loss": 0.0032, "step": 6346 }, { "epoch": 4.537622877569258, "grad_norm": 0.029324905946850777, "learning_rate": 2.555568941761982e-06, "loss": 0.004, "step": 6347 }, { "epoch": 4.538337801608579, "grad_norm": 0.027967067435383797, "learning_rate": 2.5476943810179122e-06, "loss": 0.0036, "step": 6348 }, { "epoch": 4.5390527256479, "grad_norm": 0.028831008821725845, "learning_rate": 2.5398316538956244e-06, "loss": 0.0045, 
"step": 6349 }, { "epoch": 4.539767649687221, "grad_norm": 0.025780759751796722, "learning_rate": 2.5319807623559287e-06, "loss": 0.0027, "step": 6350 }, { "epoch": 4.539767649687221, "eval_loss": 0.007190987467765808, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6350 }, { "epoch": 4.540482573726542, "grad_norm": 0.025729980319738388, "learning_rate": 2.524141708356681e-06, "loss": 0.0025, "step": 6351 }, { "epoch": 4.541197497765863, "grad_norm": 0.023949839174747467, "learning_rate": 2.5163144938527673e-06, "loss": 0.0033, "step": 6352 }, { "epoch": 4.541912421805183, "grad_norm": 0.03150177374482155, "learning_rate": 2.5084991207961373e-06, "loss": 0.004, "step": 6353 }, { "epoch": 4.542627345844504, "grad_norm": 0.027334561571478844, "learning_rate": 2.500695591135782e-06, "loss": 0.002, "step": 6354 }, { "epoch": 4.543342269883825, "grad_norm": 0.03537203371524811, "learning_rate": 2.492903906817734e-06, "loss": 0.0038, "step": 6355 }, { "epoch": 4.543342269883825, "eval_loss": 0.00718738092109561, "eval_runtime": 4.58, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6355 }, { "epoch": 4.5440571939231456, "grad_norm": 0.031541526317596436, "learning_rate": 2.4851240697850997e-06, "loss": 0.0054, "step": 6356 }, { "epoch": 4.5447721179624665, "grad_norm": 0.029783356934785843, "learning_rate": 2.477356081977983e-06, "loss": 0.0035, "step": 6357 }, { "epoch": 4.545487042001787, "grad_norm": 0.030486533418297768, "learning_rate": 2.4695999453335672e-06, "loss": 0.0044, "step": 6358 }, { "epoch": 4.546201966041108, "grad_norm": 0.030429888516664505, "learning_rate": 2.4618556617860776e-06, "loss": 0.0041, "step": 6359 }, { "epoch": 4.546916890080429, "grad_norm": 0.02717489004135132, "learning_rate": 2.454123233266781e-06, "loss": 0.003, "step": 6360 }, { "epoch": 4.546916890080429, "eval_loss": 0.007168032694607973, "eval_runtime": 4.5851, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 6360 }, { "epoch": 4.54763181411975, "grad_norm": 0.025820806622505188, "learning_rate": 2.446402661703978e-06, "loss": 0.0023, "step": 6361 }, { "epoch": 4.54834673815907, "grad_norm": 0.03156621754169464, "learning_rate": 2.4386939490230186e-06, "loss": 0.0071, "step": 6362 }, { "epoch": 4.549061662198391, "grad_norm": 0.031509384512901306, "learning_rate": 2.430997097146298e-06, "loss": 0.0027, "step": 6363 }, { "epoch": 4.549776586237712, "grad_norm": 0.026853114366531372, "learning_rate": 2.4233121079932585e-06, "loss": 0.0024, "step": 6364 }, { "epoch": 4.550491510277033, "grad_norm": 0.024954475462436676, "learning_rate": 2.4156389834803616e-06, "loss": 0.0025, "step": 6365 }, { "epoch": 4.550491510277033, "eval_loss": 0.007168457843363285, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6365 }, { "epoch": 4.551206434316354, "grad_norm": 0.02274089865386486, "learning_rate": 2.4079777255211434e-06, "loss": 0.0024, "step": 6366 }, { "epoch": 4.551921358355675, "grad_norm": 0.02797696180641651, "learning_rate": 2.4003283360261475e-06, "loss": 0.0025, "step": 6367 }, { "epoch": 4.552636282394996, "grad_norm": 0.028065994381904602, "learning_rate": 2.392690816902976e-06, "loss": 0.0034, "step": 6368 }, { "epoch": 4.553351206434316, "grad_norm": 0.028747767210006714, "learning_rate": 2.385065170056283e-06, "loss": 0.0041, "step": 6369 }, { "epoch": 4.554066130473637, "grad_norm": 0.028002044185996056, "learning_rate": 2.3774513973877254e-06, 
"loss": 0.0038, "step": 6370 }, { "epoch": 4.554066130473637, "eval_loss": 0.007113579194992781, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6370 }, { "epoch": 4.554781054512958, "grad_norm": 0.028946273028850555, "learning_rate": 2.3698495007960286e-06, "loss": 0.0024, "step": 6371 }, { "epoch": 4.555495978552279, "grad_norm": 0.03315328434109688, "learning_rate": 2.3622594821769596e-06, "loss": 0.0046, "step": 6372 }, { "epoch": 4.5562109025916, "grad_norm": 0.021298082545399666, "learning_rate": 2.3546813434232927e-06, "loss": 0.002, "step": 6373 }, { "epoch": 4.556925826630921, "grad_norm": 0.03310447931289673, "learning_rate": 2.347115086424867e-06, "loss": 0.0056, "step": 6374 }, { "epoch": 4.557640750670242, "grad_norm": 0.02728324383497238, "learning_rate": 2.3395607130685615e-06, "loss": 0.0023, "step": 6375 }, { "epoch": 4.557640750670242, "eval_loss": 0.007077910006046295, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 6375 }, { "epoch": 4.558355674709562, "grad_norm": 0.020810812711715698, "learning_rate": 2.332018225238264e-06, "loss": 0.0019, "step": 6376 }, { "epoch": 4.559070598748883, "grad_norm": 0.02338842675089836, "learning_rate": 2.3244876248149193e-06, "loss": 0.0025, "step": 6377 }, { "epoch": 4.559785522788204, "grad_norm": 0.02498248778283596, "learning_rate": 2.316968913676504e-06, "loss": 0.0021, "step": 6378 }, { "epoch": 4.5605004468275245, "grad_norm": 0.026785360649228096, "learning_rate": 2.309462093698028e-06, "loss": 0.0027, "step": 6379 }, { "epoch": 4.5612153708668455, "grad_norm": 0.023307841271162033, "learning_rate": 2.301967166751545e-06, "loss": 0.0034, "step": 6380 }, { "epoch": 4.5612153708668455, "eval_loss": 0.00713927811011672, "eval_runtime": 4.6012, "eval_samples_per_second": 10.867, "eval_steps_per_second": 2.825, "step": 6380 }, { "epoch": 4.561930294906166, "grad_norm": 0.035712700337171555, "learning_rate": 2.294484134706115e-06, "loss": 0.0025, "step": 6381 }, { "epoch": 4.562645218945487, "grad_norm": 0.028780685737729073, "learning_rate": 2.2870129994278733e-06, "loss": 0.0036, "step": 6382 }, { "epoch": 4.563360142984807, "grad_norm": 0.03108803555369377, "learning_rate": 2.2795537627799513e-06, "loss": 0.0045, "step": 6383 }, { "epoch": 4.564075067024128, "grad_norm": 0.026272345334291458, "learning_rate": 2.2721064266225333e-06, "loss": 0.0027, "step": 6384 }, { "epoch": 4.564789991063449, "grad_norm": 0.0280915517359972, "learning_rate": 2.2646709928128394e-06, "loss": 0.0046, "step": 6385 }, { "epoch": 4.564789991063449, "eval_loss": 0.007099051959812641, "eval_runtime": 4.6004, "eval_samples_per_second": 10.869, "eval_steps_per_second": 2.826, "step": 6385 }, { "epoch": 4.56550491510277, "grad_norm": 0.03160857781767845, "learning_rate": 2.2572474632050977e-06, "loss": 0.0023, "step": 6386 }, { "epoch": 4.566219839142091, "grad_norm": 0.02622440829873085, "learning_rate": 2.249835839650588e-06, "loss": 0.0025, "step": 6387 }, { "epoch": 4.566934763181412, "grad_norm": 0.026560809463262558, "learning_rate": 2.242436123997621e-06, "loss": 0.0026, "step": 6388 }, { "epoch": 4.567649687220733, "grad_norm": 0.019882837310433388, "learning_rate": 2.2350483180915206e-06, "loss": 0.0019, "step": 6389 }, { "epoch": 4.568364611260054, "grad_norm": 0.02559405192732811, "learning_rate": 2.2276724237746683e-06, "loss": 0.0047, "step": 6390 }, { "epoch": 4.568364611260054, "eval_loss": 0.0070845624431967735, "eval_runtime": 4.5857, 
"eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 6390 }, { "epoch": 4.569079535299375, "grad_norm": 0.03199129179120064, "learning_rate": 2.220308442886443e-06, "loss": 0.0038, "step": 6391 }, { "epoch": 4.569794459338695, "grad_norm": 0.02538278140127659, "learning_rate": 2.2129563772632755e-06, "loss": 0.0027, "step": 6392 }, { "epoch": 4.570509383378016, "grad_norm": 0.029584527015686035, "learning_rate": 2.2056162287386327e-06, "loss": 0.0042, "step": 6393 }, { "epoch": 4.571224307417337, "grad_norm": 0.024519041180610657, "learning_rate": 2.1982879991429728e-06, "loss": 0.0024, "step": 6394 }, { "epoch": 4.571939231456658, "grad_norm": 0.030713576823472977, "learning_rate": 2.1909716903038115e-06, "loss": 0.0057, "step": 6395 }, { "epoch": 4.571939231456658, "eval_loss": 0.007111558690667152, "eval_runtime": 4.6215, "eval_samples_per_second": 10.819, "eval_steps_per_second": 2.813, "step": 6395 }, { "epoch": 4.572654155495979, "grad_norm": 0.02548370510339737, "learning_rate": 2.1836673040456944e-06, "loss": 0.0033, "step": 6396 }, { "epoch": 4.5733690795353, "grad_norm": 0.023522350937128067, "learning_rate": 2.1763748421901763e-06, "loss": 0.0025, "step": 6397 }, { "epoch": 4.574084003574621, "grad_norm": 0.028160687536001205, "learning_rate": 2.1690943065558412e-06, "loss": 0.0035, "step": 6398 }, { "epoch": 4.574798927613941, "grad_norm": 0.02545909397304058, "learning_rate": 2.1618256989583196e-06, "loss": 0.0027, "step": 6399 }, { "epoch": 4.575513851653262, "grad_norm": 0.02259010821580887, "learning_rate": 2.1545690212102343e-06, "loss": 0.0028, "step": 6400 }, { "epoch": 4.575513851653262, "eval_loss": 0.007115661632269621, "eval_runtime": 4.5802, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6400 }, { "epoch": 4.576228775692583, "grad_norm": 0.028131801635026932, "learning_rate": 2.1473242751212653e-06, "loss": 0.0024, "step": 6401 }, { "epoch": 4.5769436997319035, "grad_norm": 0.023507483303546906, "learning_rate": 2.140091462498084e-06, "loss": 0.0019, "step": 6402 }, { "epoch": 4.5776586237712245, "grad_norm": 0.03835359960794449, "learning_rate": 2.13287058514442e-06, "loss": 0.0055, "step": 6403 }, { "epoch": 4.578373547810545, "grad_norm": 0.03058980219066143, "learning_rate": 2.125661644860999e-06, "loss": 0.0039, "step": 6404 }, { "epoch": 4.579088471849866, "grad_norm": 0.0256726685911417, "learning_rate": 2.1184646434455947e-06, "loss": 0.0023, "step": 6405 }, { "epoch": 4.579088471849866, "eval_loss": 0.007094697095453739, "eval_runtime": 4.5809, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6405 }, { "epoch": 4.579803395889186, "grad_norm": 0.024126531556248665, "learning_rate": 2.111279582692982e-06, "loss": 0.0021, "step": 6406 }, { "epoch": 4.580518319928507, "grad_norm": 0.02600071206688881, "learning_rate": 2.104106464394967e-06, "loss": 0.0028, "step": 6407 }, { "epoch": 4.581233243967828, "grad_norm": 0.021058740094304085, "learning_rate": 2.0969452903403742e-06, "loss": 0.0021, "step": 6408 }, { "epoch": 4.581948168007149, "grad_norm": 0.033710330724716187, "learning_rate": 2.089796062315058e-06, "loss": 0.0029, "step": 6409 }, { "epoch": 4.58266309204647, "grad_norm": 0.03888450562953949, "learning_rate": 2.0826587821018817e-06, "loss": 0.004, "step": 6410 }, { "epoch": 4.58266309204647, "eval_loss": 0.007140099070966244, "eval_runtime": 4.5788, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 6410 }, { "epoch": 4.583378016085791, "grad_norm": 
0.024443604052066803, "learning_rate": 2.075533451480738e-06, "loss": 0.0032, "step": 6411 }, { "epoch": 4.584092940125112, "grad_norm": 0.023738259449601173, "learning_rate": 2.0684200722285383e-06, "loss": 0.0032, "step": 6412 }, { "epoch": 4.584807864164432, "grad_norm": 0.027216555550694466, "learning_rate": 2.061318646119209e-06, "loss": 0.0031, "step": 6413 }, { "epoch": 4.585522788203753, "grad_norm": 0.027847737073898315, "learning_rate": 2.0542291749237054e-06, "loss": 0.0034, "step": 6414 }, { "epoch": 4.586237712243074, "grad_norm": 0.025179505348205566, "learning_rate": 2.0471516604099792e-06, "loss": 0.0043, "step": 6415 }, { "epoch": 4.586237712243074, "eval_loss": 0.007100123446434736, "eval_runtime": 4.6582, "eval_samples_per_second": 10.734, "eval_steps_per_second": 2.791, "step": 6415 }, { "epoch": 4.586952636282395, "grad_norm": 0.02421646937727928, "learning_rate": 2.040086104343025e-06, "loss": 0.0023, "step": 6416 }, { "epoch": 4.587667560321716, "grad_norm": 0.022700924426317215, "learning_rate": 2.033032508484861e-06, "loss": 0.0023, "step": 6417 }, { "epoch": 4.588382484361037, "grad_norm": 0.026840725913643837, "learning_rate": 2.02599087459448e-06, "loss": 0.0028, "step": 6418 }, { "epoch": 4.589097408400358, "grad_norm": 0.02776055969297886, "learning_rate": 2.0189612044279384e-06, "loss": 0.0022, "step": 6419 }, { "epoch": 4.589812332439678, "grad_norm": 0.029351716861128807, "learning_rate": 2.0119434997382893e-06, "loss": 0.0026, "step": 6420 }, { "epoch": 4.589812332439678, "eval_loss": 0.007144683972001076, "eval_runtime": 4.5778, "eval_samples_per_second": 10.922, "eval_steps_per_second": 2.84, "step": 6420 }, { "epoch": 4.590527256478999, "grad_norm": 0.019776927307248116, "learning_rate": 2.004937762275588e-06, "loss": 0.0018, "step": 6421 }, { "epoch": 4.59124218051832, "grad_norm": 0.027155106887221336, "learning_rate": 1.9979439937869383e-06, "loss": 0.0025, "step": 6422 }, { "epoch": 4.591957104557641, "grad_norm": 0.026457881554961205, "learning_rate": 1.990962196016438e-06, "loss": 0.0027, "step": 6423 }, { "epoch": 4.5926720285969616, "grad_norm": 0.023178113624453545, "learning_rate": 1.98399237070519e-06, "loss": 0.0023, "step": 6424 }, { "epoch": 4.5933869526362825, "grad_norm": 0.022362850606441498, "learning_rate": 1.977034519591342e-06, "loss": 0.0022, "step": 6425 }, { "epoch": 4.5933869526362825, "eval_loss": 0.007152446545660496, "eval_runtime": 4.5889, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 6425 }, { "epoch": 4.594101876675603, "grad_norm": 0.03314327076077461, "learning_rate": 1.970088644410012e-06, "loss": 0.0059, "step": 6426 }, { "epoch": 4.594816800714924, "grad_norm": 0.02808530628681183, "learning_rate": 1.963154746893392e-06, "loss": 0.0028, "step": 6427 }, { "epoch": 4.595531724754245, "grad_norm": 0.02945554256439209, "learning_rate": 1.956232828770621e-06, "loss": 0.0035, "step": 6428 }, { "epoch": 4.596246648793565, "grad_norm": 0.03483498468995094, "learning_rate": 1.9493228917679017e-06, "loss": 0.0063, "step": 6429 }, { "epoch": 4.596961572832886, "grad_norm": 0.036092497408390045, "learning_rate": 1.942424937608428e-06, "loss": 0.0045, "step": 6430 }, { "epoch": 4.596961572832886, "eval_loss": 0.007147912867367268, "eval_runtime": 4.5816, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 6430 }, { "epoch": 4.597676496872207, "grad_norm": 0.019780077040195465, "learning_rate": 1.9355389680123957e-06, "loss": 0.0016, "step": 6431 }, { "epoch": 4.598391420911528, 
"grad_norm": 0.0258866585791111, "learning_rate": 1.9286649846970318e-06, "loss": 0.0034, "step": 6432 }, { "epoch": 4.599106344950849, "grad_norm": 0.022955553606152534, "learning_rate": 1.9218029893765643e-06, "loss": 0.0031, "step": 6433 }, { "epoch": 4.59982126899017, "grad_norm": 0.03127299249172211, "learning_rate": 1.9149529837622247e-06, "loss": 0.0049, "step": 6434 }, { "epoch": 4.600536193029491, "grad_norm": 0.022846076637506485, "learning_rate": 1.908114969562269e-06, "loss": 0.0021, "step": 6435 }, { "epoch": 4.600536193029491, "eval_loss": 0.007164465729147196, "eval_runtime": 4.588, "eval_samples_per_second": 10.898, "eval_steps_per_second": 2.833, "step": 6435 }, { "epoch": 4.601251117068811, "grad_norm": 0.0266538355499506, "learning_rate": 1.9012889484819662e-06, "loss": 0.0034, "step": 6436 }, { "epoch": 4.601966041108132, "grad_norm": 0.027574574574828148, "learning_rate": 1.8944749222235658e-06, "loss": 0.0026, "step": 6437 }, { "epoch": 4.602680965147453, "grad_norm": 0.025904051959514618, "learning_rate": 1.8876728924863528e-06, "loss": 0.0032, "step": 6438 }, { "epoch": 4.603395889186774, "grad_norm": 0.019223466515541077, "learning_rate": 1.880882860966615e-06, "loss": 0.0015, "step": 6439 }, { "epoch": 4.604110813226095, "grad_norm": 0.028766313567757607, "learning_rate": 1.8741048293576423e-06, "loss": 0.0048, "step": 6440 }, { "epoch": 4.604110813226095, "eval_loss": 0.007160710636526346, "eval_runtime": 4.6195, "eval_samples_per_second": 10.824, "eval_steps_per_second": 2.814, "step": 6440 }, { "epoch": 4.604825737265416, "grad_norm": 0.03717562183737755, "learning_rate": 1.8673387993497381e-06, "loss": 0.0042, "step": 6441 }, { "epoch": 4.605540661304737, "grad_norm": 0.030573174357414246, "learning_rate": 1.8605847726302084e-06, "loss": 0.0043, "step": 6442 }, { "epoch": 4.606255585344057, "grad_norm": 0.025150520727038383, "learning_rate": 1.8538427508833611e-06, "loss": 0.0033, "step": 6443 }, { "epoch": 4.606970509383378, "grad_norm": 0.037909287959337234, "learning_rate": 1.847112735790535e-06, "loss": 0.0023, "step": 6444 }, { "epoch": 4.607685433422699, "grad_norm": 0.025622354820370674, "learning_rate": 1.8403947290300316e-06, "loss": 0.0035, "step": 6445 }, { "epoch": 4.607685433422699, "eval_loss": 0.0071610137820243835, "eval_runtime": 4.6084, "eval_samples_per_second": 10.85, "eval_steps_per_second": 2.821, "step": 6445 }, { "epoch": 4.60840035746202, "grad_norm": 0.027374543249607086, "learning_rate": 1.8336887322772001e-06, "loss": 0.0019, "step": 6446 }, { "epoch": 4.6091152815013405, "grad_norm": 0.030268659815192223, "learning_rate": 1.8269947472043803e-06, "loss": 0.003, "step": 6447 }, { "epoch": 4.6098302055406615, "grad_norm": 0.037886932492256165, "learning_rate": 1.8203127754808924e-06, "loss": 0.0056, "step": 6448 }, { "epoch": 4.610545129579982, "grad_norm": 0.032749198377132416, "learning_rate": 1.8136428187731037e-06, "loss": 0.0058, "step": 6449 }, { "epoch": 4.6112600536193025, "grad_norm": 0.0351053848862648, "learning_rate": 1.8069848787443554e-06, "loss": 0.004, "step": 6450 }, { "epoch": 4.6112600536193025, "eval_loss": 0.007187933661043644, "eval_runtime": 4.6088, "eval_samples_per_second": 10.849, "eval_steps_per_second": 2.821, "step": 6450 }, { "epoch": 4.611974977658623, "grad_norm": 0.023633727803826332, "learning_rate": 1.8003389570549978e-06, "loss": 0.0028, "step": 6451 }, { "epoch": 4.612689901697944, "grad_norm": 0.0364188626408577, "learning_rate": 1.793705055362388e-06, "loss": 0.0031, "step": 6452 }, { "epoch": 
4.613404825737265, "grad_norm": 0.03031056374311447, "learning_rate": 1.787083175320875e-06, "loss": 0.0032, "step": 6453 }, { "epoch": 4.614119749776586, "grad_norm": 0.027567459270358086, "learning_rate": 1.7804733185818379e-06, "loss": 0.002, "step": 6454 }, { "epoch": 4.614834673815907, "grad_norm": 0.026410238817334175, "learning_rate": 1.7738754867936191e-06, "loss": 0.0022, "step": 6455 }, { "epoch": 4.614834673815907, "eval_loss": 0.007160539738833904, "eval_runtime": 4.5989, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 6455 }, { "epoch": 4.615549597855228, "grad_norm": 0.031042492017149925, "learning_rate": 1.767289681601586e-06, "loss": 0.0055, "step": 6456 }, { "epoch": 4.616264521894549, "grad_norm": 0.022748924791812897, "learning_rate": 1.7607159046481136e-06, "loss": 0.0021, "step": 6457 }, { "epoch": 4.61697944593387, "grad_norm": 0.023225808516144753, "learning_rate": 1.7541541575725461e-06, "loss": 0.0026, "step": 6458 }, { "epoch": 4.61769436997319, "grad_norm": 0.0259630735963583, "learning_rate": 1.7476044420112637e-06, "loss": 0.0025, "step": 6459 }, { "epoch": 4.618409294012511, "grad_norm": 0.018589984625577927, "learning_rate": 1.741066759597626e-06, "loss": 0.0014, "step": 6460 }, { "epoch": 4.618409294012511, "eval_loss": 0.007190290372818708, "eval_runtime": 4.5885, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 6460 }, { "epoch": 4.619124218051832, "grad_norm": 0.029250672087073326, "learning_rate": 1.7345411119619847e-06, "loss": 0.003, "step": 6461 }, { "epoch": 4.619839142091153, "grad_norm": 0.029382523149251938, "learning_rate": 1.728027500731716e-06, "loss": 0.005, "step": 6462 }, { "epoch": 4.620554066130474, "grad_norm": 0.030913271009922028, "learning_rate": 1.7215259275311702e-06, "loss": 0.0044, "step": 6463 }, { "epoch": 4.621268990169795, "grad_norm": 0.027418101206421852, "learning_rate": 1.7150363939817115e-06, "loss": 0.0021, "step": 6464 }, { "epoch": 4.621983914209116, "grad_norm": 0.024828830733895302, "learning_rate": 1.70855890170169e-06, "loss": 0.002, "step": 6465 }, { "epoch": 4.621983914209116, "eval_loss": 0.007230397779494524, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6465 }, { "epoch": 4.622698838248436, "grad_norm": 0.028463805094361305, "learning_rate": 1.702093452306458e-06, "loss": 0.0043, "step": 6466 }, { "epoch": 4.623413762287757, "grad_norm": 0.03590044379234314, "learning_rate": 1.6956400474083646e-06, "loss": 0.0048, "step": 6467 }, { "epoch": 4.624128686327078, "grad_norm": 0.03580552712082863, "learning_rate": 1.689198688616761e-06, "loss": 0.004, "step": 6468 }, { "epoch": 4.624843610366399, "grad_norm": 0.024758225306868553, "learning_rate": 1.6827693775379794e-06, "loss": 0.0035, "step": 6469 }, { "epoch": 4.6255585344057195, "grad_norm": 0.04009470343589783, "learning_rate": 1.6763521157753648e-06, "loss": 0.0086, "step": 6470 }, { "epoch": 4.6255585344057195, "eval_loss": 0.007181990426033735, "eval_runtime": 4.5934, "eval_samples_per_second": 10.885, "eval_steps_per_second": 2.83, "step": 6470 }, { "epoch": 4.6262734584450405, "grad_norm": 0.025751303881406784, "learning_rate": 1.6699469049292426e-06, "loss": 0.0029, "step": 6471 }, { "epoch": 4.626988382484361, "grad_norm": 0.030904000625014305, "learning_rate": 1.6635537465969464e-06, "loss": 0.0031, "step": 6472 }, { "epoch": 4.6277033065236814, "grad_norm": 0.022708240896463394, "learning_rate": 1.6571726423727951e-06, "loss": 0.0023, "step": 6473 
}, { "epoch": 4.628418230563002, "grad_norm": 0.024019742384552956, "learning_rate": 1.6508035938481048e-06, "loss": 0.0023, "step": 6474 }, { "epoch": 4.629133154602323, "grad_norm": 0.024133436381816864, "learning_rate": 1.6444466026111827e-06, "loss": 0.0035, "step": 6475 }, { "epoch": 4.629133154602323, "eval_loss": 0.007194759324193001, "eval_runtime": 4.6149, "eval_samples_per_second": 10.835, "eval_steps_per_second": 2.817, "step": 6475 }, { "epoch": 4.629848078641644, "grad_norm": 0.03927747160196304, "learning_rate": 1.6381016702473272e-06, "loss": 0.0071, "step": 6476 }, { "epoch": 4.630563002680965, "grad_norm": 0.027128025889396667, "learning_rate": 1.631768798338834e-06, "loss": 0.0025, "step": 6477 }, { "epoch": 4.631277926720286, "grad_norm": 0.026137636974453926, "learning_rate": 1.625447988465001e-06, "loss": 0.0029, "step": 6478 }, { "epoch": 4.631992850759607, "grad_norm": 0.02399667352437973, "learning_rate": 1.619139242202089e-06, "loss": 0.0026, "step": 6479 }, { "epoch": 4.632707774798927, "grad_norm": 0.028760990127921104, "learning_rate": 1.6128425611233845e-06, "loss": 0.0036, "step": 6480 }, { "epoch": 4.632707774798927, "eval_loss": 0.007169350050389767, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 6480 }, { "epoch": 4.633422698838248, "grad_norm": 0.02289280667901039, "learning_rate": 1.606557946799142e-06, "loss": 0.0028, "step": 6481 }, { "epoch": 4.634137622877569, "grad_norm": 0.030246540904045105, "learning_rate": 1.6002854007966139e-06, "loss": 0.0022, "step": 6482 }, { "epoch": 4.63485254691689, "grad_norm": 0.024439767003059387, "learning_rate": 1.5940249246800433e-06, "loss": 0.0022, "step": 6483 }, { "epoch": 4.635567470956211, "grad_norm": 0.02906089834868908, "learning_rate": 1.5877765200106699e-06, "loss": 0.0034, "step": 6484 }, { "epoch": 4.636282394995532, "grad_norm": 0.029940536245703697, "learning_rate": 1.5815401883467085e-06, "loss": 0.0051, "step": 6485 }, { "epoch": 4.636282394995532, "eval_loss": 0.00715586869046092, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 6485 }, { "epoch": 4.636997319034853, "grad_norm": 0.0287533737719059, "learning_rate": 1.5753159312433763e-06, "loss": 0.0026, "step": 6486 }, { "epoch": 4.637712243074174, "grad_norm": 0.029958533123135567, "learning_rate": 1.569103750252865e-06, "loss": 0.0024, "step": 6487 }, { "epoch": 4.638427167113495, "grad_norm": 0.029039444401860237, "learning_rate": 1.5629036469243685e-06, "loss": 0.0038, "step": 6488 }, { "epoch": 4.639142091152815, "grad_norm": 0.024245936423540115, "learning_rate": 1.5567156228040725e-06, "loss": 0.0022, "step": 6489 }, { "epoch": 4.639857015192136, "grad_norm": 0.025271136313676834, "learning_rate": 1.5505396794351313e-06, "loss": 0.0034, "step": 6490 }, { "epoch": 4.639857015192136, "eval_loss": 0.007147275377064943, "eval_runtime": 4.5878, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 6490 }, { "epoch": 4.640571939231457, "grad_norm": 0.026094505563378334, "learning_rate": 1.5443758183576962e-06, "loss": 0.0019, "step": 6491 }, { "epoch": 4.6412868632707776, "grad_norm": 0.026618247851729393, "learning_rate": 1.5382240411089155e-06, "loss": 0.0039, "step": 6492 }, { "epoch": 4.6420017873100985, "grad_norm": 0.028140613809227943, "learning_rate": 1.532084349222912e-06, "loss": 0.0035, "step": 6493 }, { "epoch": 4.642716711349419, "grad_norm": 0.028978997841477394, "learning_rate": 1.5259567442307998e-06, "loss": 
0.0026, "step": 6494 }, { "epoch": 4.64343163538874, "grad_norm": 0.027116935700178146, "learning_rate": 1.5198412276606622e-06, "loss": 0.0025, "step": 6495 }, { "epoch": 4.64343163538874, "eval_loss": 0.007166438270360231, "eval_runtime": 4.6137, "eval_samples_per_second": 10.837, "eval_steps_per_second": 2.818, "step": 6495 }, { "epoch": 4.64414655942806, "grad_norm": 0.03329271450638771, "learning_rate": 1.5137378010376013e-06, "loss": 0.0027, "step": 6496 }, { "epoch": 4.644861483467381, "grad_norm": 0.023617034777998924, "learning_rate": 1.5076464658836776e-06, "loss": 0.0025, "step": 6497 }, { "epoch": 4.645576407506702, "grad_norm": 0.026999710127711296, "learning_rate": 1.5015672237179423e-06, "loss": 0.0019, "step": 6498 }, { "epoch": 4.646291331546023, "grad_norm": 0.016593681648373604, "learning_rate": 1.4955000760564331e-06, "loss": 0.0016, "step": 6499 }, { "epoch": 4.647006255585344, "grad_norm": 0.02598855271935463, "learning_rate": 1.4894450244121727e-06, "loss": 0.002, "step": 6500 }, { "epoch": 4.647006255585344, "eval_loss": 0.007172842510044575, "eval_runtime": 4.5876, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 6500 }, { "epoch": 4.647721179624665, "grad_norm": 0.023931341245770454, "learning_rate": 1.483402070295159e-06, "loss": 0.0038, "step": 6501 }, { "epoch": 4.648436103663986, "grad_norm": 0.02738863229751587, "learning_rate": 1.477371215212392e-06, "loss": 0.004, "step": 6502 }, { "epoch": 4.649151027703306, "grad_norm": 0.021915633231401443, "learning_rate": 1.4713524606678297e-06, "loss": 0.003, "step": 6503 }, { "epoch": 4.649865951742627, "grad_norm": 0.02620886079967022, "learning_rate": 1.4653458081624272e-06, "loss": 0.003, "step": 6504 }, { "epoch": 4.650580875781948, "grad_norm": 0.03160696476697922, "learning_rate": 1.4593512591941305e-06, "loss": 0.0037, "step": 6505 }, { "epoch": 4.650580875781948, "eval_loss": 0.0071655032224953175, "eval_runtime": 4.5898, "eval_samples_per_second": 10.894, "eval_steps_per_second": 2.832, "step": 6505 }, { "epoch": 4.651295799821269, "grad_norm": 0.02968425117433071, "learning_rate": 1.4533688152578385e-06, "loss": 0.003, "step": 6506 }, { "epoch": 4.65201072386059, "grad_norm": 0.025172989815473557, "learning_rate": 1.447398477845463e-06, "loss": 0.0025, "step": 6507 }, { "epoch": 4.652725647899911, "grad_norm": 0.030295055359601974, "learning_rate": 1.4414402484458745e-06, "loss": 0.003, "step": 6508 }, { "epoch": 4.653440571939232, "grad_norm": 0.02401295118033886, "learning_rate": 1.4354941285449341e-06, "loss": 0.0028, "step": 6509 }, { "epoch": 4.654155495978552, "grad_norm": 0.024936780333518982, "learning_rate": 1.4295601196254838e-06, "loss": 0.0019, "step": 6510 }, { "epoch": 4.654155495978552, "eval_loss": 0.007171579170972109, "eval_runtime": 4.5824, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6510 }, { "epoch": 4.654870420017873, "grad_norm": 0.02685166709125042, "learning_rate": 1.4236382231673395e-06, "loss": 0.0022, "step": 6511 }, { "epoch": 4.655585344057194, "grad_norm": 0.025950413197278976, "learning_rate": 1.417728440647298e-06, "loss": 0.0032, "step": 6512 }, { "epoch": 4.656300268096515, "grad_norm": 0.02952929027378559, "learning_rate": 1.4118307735391412e-06, "loss": 0.0022, "step": 6513 }, { "epoch": 4.657015192135836, "grad_norm": 0.032075900584459305, "learning_rate": 1.4059452233136095e-06, "loss": 0.0056, "step": 6514 }, { "epoch": 4.6577301161751565, "grad_norm": 0.03309348598122597, "learning_rate": 
1.4000717914384677e-06, "loss": 0.0066, "step": 6515 }, { "epoch": 4.6577301161751565, "eval_loss": 0.007172180339694023, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6515 }, { "epoch": 4.6584450402144775, "grad_norm": 0.03050239570438862, "learning_rate": 1.3942104793783994e-06, "loss": 0.0043, "step": 6516 }, { "epoch": 4.659159964253798, "grad_norm": 0.024148957803845406, "learning_rate": 1.388361288595108e-06, "loss": 0.0021, "step": 6517 }, { "epoch": 4.659874888293119, "grad_norm": 0.02692456915974617, "learning_rate": 1.38252422054726e-06, "loss": 0.0024, "step": 6518 }, { "epoch": 4.660589812332439, "grad_norm": 0.02469075471162796, "learning_rate": 1.3766992766904907e-06, "loss": 0.0021, "step": 6519 }, { "epoch": 4.66130473637176, "grad_norm": 0.02623854950070381, "learning_rate": 1.3708864584774327e-06, "loss": 0.0031, "step": 6520 }, { "epoch": 4.66130473637176, "eval_loss": 0.0071687037125229836, "eval_runtime": 4.5883, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 6520 }, { "epoch": 4.662019660411081, "grad_norm": 0.028454609215259552, "learning_rate": 1.3650857673576766e-06, "loss": 0.0018, "step": 6521 }, { "epoch": 4.662734584450402, "grad_norm": 0.028867362067103386, "learning_rate": 1.3592972047777874e-06, "loss": 0.0026, "step": 6522 }, { "epoch": 4.663449508489723, "grad_norm": 0.027951283380389214, "learning_rate": 1.3535207721813326e-06, "loss": 0.0026, "step": 6523 }, { "epoch": 4.664164432529044, "grad_norm": 0.029530981555581093, "learning_rate": 1.3477564710088098e-06, "loss": 0.0036, "step": 6524 }, { "epoch": 4.664879356568365, "grad_norm": 0.029424874112010002, "learning_rate": 1.3420043026977302e-06, "loss": 0.0036, "step": 6525 }, { "epoch": 4.664879356568365, "eval_loss": 0.007160272914916277, "eval_runtime": 4.5972, "eval_samples_per_second": 10.876, "eval_steps_per_second": 2.828, "step": 6525 }, { "epoch": 4.665594280607685, "grad_norm": 0.020516512915492058, "learning_rate": 1.3362642686825688e-06, "loss": 0.0019, "step": 6526 }, { "epoch": 4.666309204647006, "grad_norm": 0.025744661688804626, "learning_rate": 1.3305363703947582e-06, "loss": 0.0023, "step": 6527 }, { "epoch": 4.667024128686327, "grad_norm": 0.023794429376721382, "learning_rate": 1.324820609262728e-06, "loss": 0.0022, "step": 6528 }, { "epoch": 4.667739052725648, "grad_norm": 0.027279339730739594, "learning_rate": 1.319116986711877e-06, "loss": 0.0028, "step": 6529 }, { "epoch": 4.668453976764969, "grad_norm": 0.02664095349609852, "learning_rate": 1.3134255041645505e-06, "loss": 0.0046, "step": 6530 }, { "epoch": 4.668453976764969, "eval_loss": 0.007184359710663557, "eval_runtime": 4.5834, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6530 }, { "epoch": 4.66916890080429, "grad_norm": 0.030609460547566414, "learning_rate": 1.3077461630400967e-06, "loss": 0.0037, "step": 6531 }, { "epoch": 4.669883824843611, "grad_norm": 0.02857133559882641, "learning_rate": 1.3020789647548327e-06, "loss": 0.0035, "step": 6532 }, { "epoch": 4.670598748882931, "grad_norm": 0.027679208666086197, "learning_rate": 1.2964239107220277e-06, "loss": 0.0036, "step": 6533 }, { "epoch": 4.671313672922252, "grad_norm": 0.02554364502429962, "learning_rate": 1.2907810023519484e-06, "loss": 0.0019, "step": 6534 }, { "epoch": 4.672028596961573, "grad_norm": 0.034068915992975235, "learning_rate": 1.2851502410518024e-06, "loss": 0.0045, "step": 6535 }, { "epoch": 4.672028596961573, "eval_loss": 
0.007172644138336182, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 6535 }, { "epoch": 4.672743521000894, "grad_norm": 0.020526332780718803, "learning_rate": 1.2795316282257942e-06, "loss": 0.0029, "step": 6536 }, { "epoch": 4.673458445040215, "grad_norm": 0.02902650274336338, "learning_rate": 1.2739251652750916e-06, "loss": 0.0045, "step": 6537 }, { "epoch": 4.6741733690795355, "grad_norm": 0.029941515997052193, "learning_rate": 1.2683308535978323e-06, "loss": 0.0027, "step": 6538 }, { "epoch": 4.6748882931188565, "grad_norm": 0.028253361582756042, "learning_rate": 1.2627486945891165e-06, "loss": 0.0034, "step": 6539 }, { "epoch": 4.6756032171581765, "grad_norm": 0.0343565009534359, "learning_rate": 1.2571786896410143e-06, "loss": 0.0023, "step": 6540 }, { "epoch": 4.6756032171581765, "eval_loss": 0.007124978583306074, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6540 }, { "epoch": 4.6763181411974974, "grad_norm": 0.0313531793653965, "learning_rate": 1.251620840142581e-06, "loss": 0.0032, "step": 6541 }, { "epoch": 4.677033065236818, "grad_norm": 0.025678245350718498, "learning_rate": 1.2460751474798249e-06, "loss": 0.0026, "step": 6542 }, { "epoch": 4.677747989276139, "grad_norm": 0.021283891052007675, "learning_rate": 1.2405416130357174e-06, "loss": 0.0023, "step": 6543 }, { "epoch": 4.67846291331546, "grad_norm": 0.02503301203250885, "learning_rate": 1.235020238190221e-06, "loss": 0.0022, "step": 6544 }, { "epoch": 4.679177837354781, "grad_norm": 0.02959742210805416, "learning_rate": 1.2295110243202456e-06, "loss": 0.0037, "step": 6545 }, { "epoch": 4.679177837354781, "eval_loss": 0.007179084233939648, "eval_runtime": 4.5891, "eval_samples_per_second": 10.895, "eval_steps_per_second": 2.833, "step": 6545 }, { "epoch": 4.679892761394102, "grad_norm": 0.03238166868686676, "learning_rate": 1.2240139727996757e-06, "loss": 0.0048, "step": 6546 }, { "epoch": 4.680607685433422, "grad_norm": 0.0212397500872612, "learning_rate": 1.2185290849993647e-06, "loss": 0.0014, "step": 6547 }, { "epoch": 4.681322609472743, "grad_norm": 0.029869498685002327, "learning_rate": 1.2130563622871238e-06, "loss": 0.0062, "step": 6548 }, { "epoch": 4.682037533512064, "grad_norm": 0.026853511109948158, "learning_rate": 1.2075958060277392e-06, "loss": 0.0024, "step": 6549 }, { "epoch": 4.682752457551385, "grad_norm": 0.02346525713801384, "learning_rate": 1.202147417582966e-06, "loss": 0.003, "step": 6550 }, { "epoch": 4.682752457551385, "eval_loss": 0.007155198138207197, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6550 }, { "epoch": 4.683467381590706, "grad_norm": 0.02623908780515194, "learning_rate": 1.1967111983115121e-06, "loss": 0.0029, "step": 6551 }, { "epoch": 4.684182305630027, "grad_norm": 0.020804034546017647, "learning_rate": 1.1912871495690592e-06, "loss": 0.0022, "step": 6552 }, { "epoch": 4.684897229669348, "grad_norm": 0.02864690124988556, "learning_rate": 1.185875272708259e-06, "loss": 0.003, "step": 6553 }, { "epoch": 4.685612153708669, "grad_norm": 0.029360460117459297, "learning_rate": 1.1804755690787094e-06, "loss": 0.0024, "step": 6554 }, { "epoch": 4.68632707774799, "grad_norm": 0.023348456248641014, "learning_rate": 1.1750880400269947e-06, "loss": 0.004, "step": 6555 }, { "epoch": 4.68632707774799, "eval_loss": 0.0071290708146989346, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 
6555 }, { "epoch": 4.68704200178731, "grad_norm": 0.025989869609475136, "learning_rate": 1.1697126868966569e-06, "loss": 0.0042, "step": 6556 }, { "epoch": 4.687756925826631, "grad_norm": 0.029705513268709183, "learning_rate": 1.1643495110281843e-06, "loss": 0.0042, "step": 6557 }, { "epoch": 4.688471849865952, "grad_norm": 0.029658373445272446, "learning_rate": 1.158998513759052e-06, "loss": 0.003, "step": 6558 }, { "epoch": 4.689186773905273, "grad_norm": 0.03890750929713249, "learning_rate": 1.1536596964236757e-06, "loss": 0.0063, "step": 6559 }, { "epoch": 4.6899016979445936, "grad_norm": 0.04208455979824066, "learning_rate": 1.1483330603534626e-06, "loss": 0.0045, "step": 6560 }, { "epoch": 4.6899016979445936, "eval_loss": 0.00714219780638814, "eval_runtime": 4.5868, "eval_samples_per_second": 10.901, "eval_steps_per_second": 2.834, "step": 6560 }, { "epoch": 4.6906166219839145, "grad_norm": 0.02740565314888954, "learning_rate": 1.1430186068767556e-06, "loss": 0.0037, "step": 6561 }, { "epoch": 4.691331546023235, "grad_norm": 0.02989351376891136, "learning_rate": 1.137716337318867e-06, "loss": 0.0051, "step": 6562 }, { "epoch": 4.6920464700625555, "grad_norm": 0.028548382222652435, "learning_rate": 1.1324262530020835e-06, "loss": 0.0048, "step": 6563 }, { "epoch": 4.692761394101876, "grad_norm": 0.027979690581560135, "learning_rate": 1.127148355245633e-06, "loss": 0.0034, "step": 6564 }, { "epoch": 4.693476318141197, "grad_norm": 0.02877870388329029, "learning_rate": 1.1218826453657127e-06, "loss": 0.0028, "step": 6565 }, { "epoch": 4.693476318141197, "eval_loss": 0.007119557820260525, "eval_runtime": 4.5913, "eval_samples_per_second": 10.89, "eval_steps_per_second": 2.831, "step": 6565 }, { "epoch": 4.694191242180518, "grad_norm": 0.0250654648989439, "learning_rate": 1.1166291246754946e-06, "loss": 0.003, "step": 6566 }, { "epoch": 4.694906166219839, "grad_norm": 0.02940577268600464, "learning_rate": 1.1113877944850804e-06, "loss": 0.0038, "step": 6567 }, { "epoch": 4.69562109025916, "grad_norm": 0.03265167400240898, "learning_rate": 1.1061586561015635e-06, "loss": 0.0045, "step": 6568 }, { "epoch": 4.696336014298481, "grad_norm": 0.025606054812669754, "learning_rate": 1.1009417108289732e-06, "loss": 0.0033, "step": 6569 }, { "epoch": 4.697050938337801, "grad_norm": 0.0308375246822834, "learning_rate": 1.0957369599683131e-06, "loss": 0.0038, "step": 6570 }, { "epoch": 4.697050938337801, "eval_loss": 0.007152142468839884, "eval_runtime": 4.5862, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 6570 }, { "epoch": 4.697765862377122, "grad_norm": 0.03801445662975311, "learning_rate": 1.0905444048175396e-06, "loss": 0.0053, "step": 6571 }, { "epoch": 4.698480786416443, "grad_norm": 0.030300460755825043, "learning_rate": 1.0853640466715665e-06, "loss": 0.0039, "step": 6572 }, { "epoch": 4.699195710455764, "grad_norm": 0.02237621136009693, "learning_rate": 1.080195886822266e-06, "loss": 0.0022, "step": 6573 }, { "epoch": 4.699910634495085, "grad_norm": 0.022952018305659294, "learning_rate": 1.0750399265584743e-06, "loss": 0.0025, "step": 6574 }, { "epoch": 4.700625558534406, "grad_norm": 0.018780365586280823, "learning_rate": 1.0698961671659792e-06, "loss": 0.002, "step": 6575 }, { "epoch": 4.700625558534406, "eval_loss": 0.007098826114088297, "eval_runtime": 4.5877, "eval_samples_per_second": 10.899, "eval_steps_per_second": 2.834, "step": 6575 }, { "epoch": 4.701340482573727, "grad_norm": 0.03282143175601959, "learning_rate": 1.0647646099275265e-06, "loss": 
0.0031, "step": 6576 }, { "epoch": 4.702055406613047, "grad_norm": 0.03172535449266434, "learning_rate": 1.0596452561228209e-06, "loss": 0.0057, "step": 6577 }, { "epoch": 4.702770330652368, "grad_norm": 0.02934158220887184, "learning_rate": 1.0545381070285243e-06, "loss": 0.0038, "step": 6578 }, { "epoch": 4.703485254691689, "grad_norm": 0.02606596238911152, "learning_rate": 1.0494431639182566e-06, "loss": 0.0042, "step": 6579 }, { "epoch": 4.70420017873101, "grad_norm": 0.029728440567851067, "learning_rate": 1.044360428062585e-06, "loss": 0.004, "step": 6580 }, { "epoch": 4.70420017873101, "eval_loss": 0.007095865439623594, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6580 }, { "epoch": 4.704915102770331, "grad_norm": 0.023898210376501083, "learning_rate": 1.0392899007290346e-06, "loss": 0.0022, "step": 6581 }, { "epoch": 4.705630026809652, "grad_norm": 0.03317264840006828, "learning_rate": 1.0342315831821103e-06, "loss": 0.0042, "step": 6582 }, { "epoch": 4.7063449508489725, "grad_norm": 0.02536209300160408, "learning_rate": 1.0291854766832253e-06, "loss": 0.0035, "step": 6583 }, { "epoch": 4.7070598748882935, "grad_norm": 0.022656934335827827, "learning_rate": 1.0241515824907954e-06, "loss": 0.0022, "step": 6584 }, { "epoch": 4.707774798927614, "grad_norm": 0.02924300730228424, "learning_rate": 1.0191299018601607e-06, "loss": 0.0026, "step": 6585 }, { "epoch": 4.707774798927614, "eval_loss": 0.007086243946105242, "eval_runtime": 4.584, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6585 }, { "epoch": 4.7084897229669345, "grad_norm": 0.02631356753408909, "learning_rate": 1.0141204360436196e-06, "loss": 0.0038, "step": 6586 }, { "epoch": 4.709204647006255, "grad_norm": 0.023711754009127617, "learning_rate": 1.0091231862904394e-06, "loss": 0.002, "step": 6587 }, { "epoch": 4.709919571045576, "grad_norm": 0.021500304341316223, "learning_rate": 1.0041381538468174e-06, "loss": 0.0025, "step": 6588 }, { "epoch": 4.710634495084897, "grad_norm": 0.0286595169454813, "learning_rate": 9.991653399559265e-07, "loss": 0.0039, "step": 6589 }, { "epoch": 4.711349419124218, "grad_norm": 0.028327904641628265, "learning_rate": 9.942047458578852e-07, "loss": 0.0021, "step": 6590 }, { "epoch": 4.711349419124218, "eval_loss": 0.0071116844192147255, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6590 }, { "epoch": 4.712064343163539, "grad_norm": 0.0312329288572073, "learning_rate": 9.892563727897597e-07, "loss": 0.0051, "step": 6591 }, { "epoch": 4.71277926720286, "grad_norm": 0.031132075935602188, "learning_rate": 9.843202219855686e-07, "loss": 0.0052, "step": 6592 }, { "epoch": 4.71349419124218, "grad_norm": 0.024085914716124535, "learning_rate": 9.793962946762935e-07, "loss": 0.0021, "step": 6593 }, { "epoch": 4.714209115281501, "grad_norm": 0.02116541750729084, "learning_rate": 9.744845920898526e-07, "loss": 0.0022, "step": 6594 }, { "epoch": 4.714924039320822, "grad_norm": 0.025282692164182663, "learning_rate": 9.69585115451127e-07, "loss": 0.0028, "step": 6595 }, { "epoch": 4.714924039320822, "eval_loss": 0.007118948269635439, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6595 }, { "epoch": 4.715638963360143, "grad_norm": 0.03297603130340576, "learning_rate": 9.646978659819395e-07, "loss": 0.0046, "step": 6596 }, { "epoch": 4.716353887399464, "grad_norm": 0.020644232630729675, "learning_rate": 9.598228449010704e-07, 
"loss": 0.0017, "step": 6597 }, { "epoch": 4.717068811438785, "grad_norm": 0.031437087804079056, "learning_rate": 9.549600534242587e-07, "loss": 0.0043, "step": 6598 }, { "epoch": 4.717783735478106, "grad_norm": 0.02339310199022293, "learning_rate": 9.501094927641729e-07, "loss": 0.0019, "step": 6599 }, { "epoch": 4.718498659517426, "grad_norm": 0.019068237394094467, "learning_rate": 9.452711641304402e-07, "loss": 0.0018, "step": 6600 }, { "epoch": 4.718498659517426, "eval_loss": 0.007115141488611698, "eval_runtime": 4.6024, "eval_samples_per_second": 10.864, "eval_steps_per_second": 2.825, "step": 6600 }, { "epoch": 4.719213583556747, "grad_norm": 0.03011026233434677, "learning_rate": 9.404450687296506e-07, "loss": 0.0034, "step": 6601 }, { "epoch": 4.719928507596068, "grad_norm": 0.026020288467407227, "learning_rate": 9.356312077653196e-07, "loss": 0.0022, "step": 6602 }, { "epoch": 4.720643431635389, "grad_norm": 0.031137842684984207, "learning_rate": 9.308295824379365e-07, "loss": 0.0033, "step": 6603 }, { "epoch": 4.72135835567471, "grad_norm": 0.030883274972438812, "learning_rate": 9.260401939449215e-07, "loss": 0.0054, "step": 6604 }, { "epoch": 4.722073279714031, "grad_norm": 0.020842401310801506, "learning_rate": 9.212630434806413e-07, "loss": 0.0018, "step": 6605 }, { "epoch": 4.722073279714031, "eval_loss": 0.0071335709653794765, "eval_runtime": 4.6384, "eval_samples_per_second": 10.78, "eval_steps_per_second": 2.803, "step": 6605 }, { "epoch": 4.7227882037533515, "grad_norm": 0.035345423966646194, "learning_rate": 9.164981322364374e-07, "loss": 0.0053, "step": 6606 }, { "epoch": 4.723503127792672, "grad_norm": 0.027912240475416183, "learning_rate": 9.117454614005649e-07, "loss": 0.0026, "step": 6607 }, { "epoch": 4.7242180518319925, "grad_norm": 0.03205835446715355, "learning_rate": 9.07005032158248e-07, "loss": 0.005, "step": 6608 }, { "epoch": 4.7249329758713134, "grad_norm": 0.03070421703159809, "learning_rate": 9.022768456916408e-07, "loss": 0.0047, "step": 6609 }, { "epoch": 4.725647899910634, "grad_norm": 0.030314629897475243, "learning_rate": 8.975609031798671e-07, "loss": 0.0051, "step": 6610 }, { "epoch": 4.725647899910634, "eval_loss": 0.007142652757465839, "eval_runtime": 4.5949, "eval_samples_per_second": 10.882, "eval_steps_per_second": 2.829, "step": 6610 }, { "epoch": 4.726362823949955, "grad_norm": 0.02397419884800911, "learning_rate": 8.928572057989804e-07, "loss": 0.0027, "step": 6611 }, { "epoch": 4.727077747989276, "grad_norm": 0.03374544531106949, "learning_rate": 8.881657547219868e-07, "loss": 0.0044, "step": 6612 }, { "epoch": 4.727792672028597, "grad_norm": 0.021426817402243614, "learning_rate": 8.83486551118834e-07, "loss": 0.0021, "step": 6613 }, { "epoch": 4.728507596067918, "grad_norm": 0.025368159636855125, "learning_rate": 8.788195961564272e-07, "loss": 0.0021, "step": 6614 }, { "epoch": 4.729222520107239, "grad_norm": 0.03164085000753403, "learning_rate": 8.741648909985966e-07, "loss": 0.0043, "step": 6615 }, { "epoch": 4.729222520107239, "eval_loss": 0.007143315859138966, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6615 }, { "epoch": 4.729937444146559, "grad_norm": 0.02446589060127735, "learning_rate": 8.695224368061305e-07, "loss": 0.0026, "step": 6616 }, { "epoch": 4.73065236818588, "grad_norm": 0.023680442944169044, "learning_rate": 8.64892234736775e-07, "loss": 0.0021, "step": 6617 }, { "epoch": 4.731367292225201, "grad_norm": 0.031230922788381577, "learning_rate": 8.60274285945184e-07, 
"loss": 0.0028, "step": 6618 }, { "epoch": 4.732082216264522, "grad_norm": 0.03894119709730148, "learning_rate": 8.556685915830032e-07, "loss": 0.0034, "step": 6619 }, { "epoch": 4.732797140303843, "grad_norm": 0.02927486039698124, "learning_rate": 8.510751527987748e-07, "loss": 0.0045, "step": 6620 }, { "epoch": 4.732797140303843, "eval_loss": 0.007100449874997139, "eval_runtime": 4.6178, "eval_samples_per_second": 10.828, "eval_steps_per_second": 2.815, "step": 6620 }, { "epoch": 4.733512064343164, "grad_norm": 0.025595052167773247, "learning_rate": 8.46493970738016e-07, "loss": 0.0022, "step": 6621 }, { "epoch": 4.734226988382485, "grad_norm": 0.029823429882526398, "learning_rate": 8.419250465431905e-07, "loss": 0.0047, "step": 6622 }, { "epoch": 4.734941912421805, "grad_norm": 0.033410828560590744, "learning_rate": 8.373683813536703e-07, "loss": 0.0053, "step": 6623 }, { "epoch": 4.735656836461126, "grad_norm": 0.03324374184012413, "learning_rate": 8.328239763058076e-07, "loss": 0.0047, "step": 6624 }, { "epoch": 4.736371760500447, "grad_norm": 0.022796565666794777, "learning_rate": 8.282918325328848e-07, "loss": 0.0029, "step": 6625 }, { "epoch": 4.736371760500447, "eval_loss": 0.007110486272722483, "eval_runtime": 4.5904, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 6625 }, { "epoch": 4.737086684539768, "grad_norm": 0.03074878267943859, "learning_rate": 8.237719511651198e-07, "loss": 0.0061, "step": 6626 }, { "epoch": 4.737801608579089, "grad_norm": 0.03542165830731392, "learning_rate": 8.192643333296779e-07, "loss": 0.0046, "step": 6627 }, { "epoch": 4.7385165326184095, "grad_norm": 0.02672431990504265, "learning_rate": 8.147689801506653e-07, "loss": 0.0022, "step": 6628 }, { "epoch": 4.7392314566577305, "grad_norm": 0.028853638097643852, "learning_rate": 8.102858927491297e-07, "loss": 0.0023, "step": 6629 }, { "epoch": 4.7399463806970505, "grad_norm": 0.024563897401094437, "learning_rate": 8.058150722430658e-07, "loss": 0.0038, "step": 6630 }, { "epoch": 4.7399463806970505, "eval_loss": 0.007105275988578796, "eval_runtime": 4.5805, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 6630 }, { "epoch": 4.7406613047363715, "grad_norm": 0.0359027236700058, "learning_rate": 8.013565197473926e-07, "loss": 0.0062, "step": 6631 }, { "epoch": 4.741376228775692, "grad_norm": 0.024545889347791672, "learning_rate": 7.969102363739933e-07, "loss": 0.0037, "step": 6632 }, { "epoch": 4.742091152815013, "grad_norm": 0.025592241436243057, "learning_rate": 7.92476223231664e-07, "loss": 0.0025, "step": 6633 }, { "epoch": 4.742806076854334, "grad_norm": 0.030520331114530563, "learning_rate": 7.880544814261703e-07, "loss": 0.0033, "step": 6634 }, { "epoch": 4.743521000893655, "grad_norm": 0.023926271125674248, "learning_rate": 7.836450120601968e-07, "loss": 0.0035, "step": 6635 }, { "epoch": 4.743521000893655, "eval_loss": 0.007093570660799742, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 6635 }, { "epoch": 4.744235924932976, "grad_norm": 0.03550037369132042, "learning_rate": 7.792478162333694e-07, "loss": 0.0015, "step": 6636 }, { "epoch": 4.744950848972296, "grad_norm": 0.027653951197862625, "learning_rate": 7.748628950422665e-07, "loss": 0.0034, "step": 6637 }, { "epoch": 4.745665773011617, "grad_norm": 0.030055740848183632, "learning_rate": 7.704902495803912e-07, "loss": 0.0031, "step": 6638 }, { "epoch": 4.746380697050938, "grad_norm": 0.023051686584949493, "learning_rate": 
7.661298809381878e-07, "loss": 0.0026, "step": 6639 }, { "epoch": 4.747095621090259, "grad_norm": 0.024096911773085594, "learning_rate": 7.617817902030477e-07, "loss": 0.0019, "step": 6640 }, { "epoch": 4.747095621090259, "eval_loss": 0.007089088670909405, "eval_runtime": 4.6066, "eval_samples_per_second": 10.854, "eval_steps_per_second": 2.822, "step": 6640 }, { "epoch": 4.74781054512958, "grad_norm": 0.021696599200367928, "learning_rate": 7.57445978459298e-07, "loss": 0.002, "step": 6641 }, { "epoch": 4.748525469168901, "grad_norm": 0.027765410020947456, "learning_rate": 7.531224467881847e-07, "loss": 0.0037, "step": 6642 }, { "epoch": 4.749240393208222, "grad_norm": 0.031207958236336708, "learning_rate": 7.488111962679289e-07, "loss": 0.0048, "step": 6643 }, { "epoch": 4.749955317247543, "grad_norm": 0.01950366422533989, "learning_rate": 7.445122279736483e-07, "loss": 0.0016, "step": 6644 }, { "epoch": 4.750670241286863, "grad_norm": 0.026036739349365234, "learning_rate": 7.402255429774241e-07, "loss": 0.0028, "step": 6645 }, { "epoch": 4.750670241286863, "eval_loss": 0.007096413057297468, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6645 }, { "epoch": 4.751385165326184, "grad_norm": 0.028039997443556786, "learning_rate": 7.359511423482679e-07, "loss": 0.0024, "step": 6646 }, { "epoch": 4.752100089365505, "grad_norm": 0.029536662623286247, "learning_rate": 7.316890271521215e-07, "loss": 0.0025, "step": 6647 }, { "epoch": 4.752815013404826, "grad_norm": 0.024265680462121964, "learning_rate": 7.274391984518736e-07, "loss": 0.0025, "step": 6648 }, { "epoch": 4.753529937444147, "grad_norm": 0.02623424492776394, "learning_rate": 7.23201657307343e-07, "loss": 0.0023, "step": 6649 }, { "epoch": 4.754244861483468, "grad_norm": 0.02063179947435856, "learning_rate": 7.18976404775279e-07, "loss": 0.0017, "step": 6650 }, { "epoch": 4.754244861483468, "eval_loss": 0.007114018779247999, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6650 }, { "epoch": 4.7549597855227885, "grad_norm": 0.029740503057837486, "learning_rate": 7.147634419093829e-07, "loss": 0.0046, "step": 6651 }, { "epoch": 4.7556747095621095, "grad_norm": 0.02703375555574894, "learning_rate": 7.105627697602702e-07, "loss": 0.0043, "step": 6652 }, { "epoch": 4.7563896336014295, "grad_norm": 0.02926578000187874, "learning_rate": 7.063743893755026e-07, "loss": 0.0053, "step": 6653 }, { "epoch": 4.7571045576407505, "grad_norm": 0.022061152383685112, "learning_rate": 7.021983017995836e-07, "loss": 0.002, "step": 6654 }, { "epoch": 4.757819481680071, "grad_norm": 0.030056843534111977, "learning_rate": 6.980345080739304e-07, "loss": 0.0047, "step": 6655 }, { "epoch": 4.757819481680071, "eval_loss": 0.00713162450119853, "eval_runtime": 4.6051, "eval_samples_per_second": 10.857, "eval_steps_per_second": 2.823, "step": 6655 }, { "epoch": 4.758534405719392, "grad_norm": 0.02772348001599312, "learning_rate": 6.938830092369175e-07, "loss": 0.0037, "step": 6656 }, { "epoch": 4.759249329758713, "grad_norm": 0.0388210229575634, "learning_rate": 6.897438063238392e-07, "loss": 0.0059, "step": 6657 }, { "epoch": 4.759964253798034, "grad_norm": 0.022850893437862396, "learning_rate": 6.856169003669255e-07, "loss": 0.0023, "step": 6658 }, { "epoch": 4.760679177837355, "grad_norm": 0.024864057078957558, "learning_rate": 6.815022923953418e-07, "loss": 0.0023, "step": 6659 }, { "epoch": 4.761394101876675, "grad_norm": 0.026501666754484177, "learning_rate": 
6.77399983435184e-07, "loss": 0.0031, "step": 6660 }, { "epoch": 4.761394101876675, "eval_loss": 0.007143176160752773, "eval_runtime": 4.5768, "eval_samples_per_second": 10.925, "eval_steps_per_second": 2.84, "step": 6660 }, { "epoch": 4.762109025915996, "grad_norm": 0.026839865371584892, "learning_rate": 6.733099745094895e-07, "loss": 0.004, "step": 6661 }, { "epoch": 4.762823949955317, "grad_norm": 0.03241660073399544, "learning_rate": 6.6923226663822e-07, "loss": 0.0024, "step": 6662 }, { "epoch": 4.763538873994638, "grad_norm": 0.022167647257447243, "learning_rate": 6.651668608382622e-07, "loss": 0.0033, "step": 6663 }, { "epoch": 4.764253798033959, "grad_norm": 0.02529478631913662, "learning_rate": 6.611137581234495e-07, "loss": 0.002, "step": 6664 }, { "epoch": 4.76496872207328, "grad_norm": 0.023652423173189163, "learning_rate": 6.57072959504551e-07, "loss": 0.0024, "step": 6665 }, { "epoch": 4.76496872207328, "eval_loss": 0.007120548747479916, "eval_runtime": 4.5801, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6665 }, { "epoch": 4.765683646112601, "grad_norm": 0.02235310710966587, "learning_rate": 6.530444659892442e-07, "loss": 0.0022, "step": 6666 }, { "epoch": 4.766398570151921, "grad_norm": 0.031247902661561966, "learning_rate": 6.490282785821644e-07, "loss": 0.0023, "step": 6667 }, { "epoch": 4.767113494191242, "grad_norm": 0.019029613584280014, "learning_rate": 6.450243982848547e-07, "loss": 0.0016, "step": 6668 }, { "epoch": 4.767828418230563, "grad_norm": 0.03383617103099823, "learning_rate": 6.410328260957998e-07, "loss": 0.004, "step": 6669 }, { "epoch": 4.768543342269884, "grad_norm": 0.02947256900370121, "learning_rate": 6.370535630104257e-07, "loss": 0.0041, "step": 6670 }, { "epoch": 4.768543342269884, "eval_loss": 0.007106573320925236, "eval_runtime": 4.6304, "eval_samples_per_second": 10.798, "eval_steps_per_second": 2.808, "step": 6670 }, { "epoch": 4.769258266309205, "grad_norm": 0.027953842654824257, "learning_rate": 6.330866100210719e-07, "loss": 0.0028, "step": 6671 }, { "epoch": 4.769973190348526, "grad_norm": 0.029814837500452995, "learning_rate": 6.291319681170138e-07, "loss": 0.0045, "step": 6672 }, { "epoch": 4.770688114387847, "grad_norm": 0.03387304022908211, "learning_rate": 6.251896382844569e-07, "loss": 0.0021, "step": 6673 }, { "epoch": 4.771403038427167, "grad_norm": 0.021038930863142014, "learning_rate": 6.21259621506537e-07, "loss": 0.0023, "step": 6674 }, { "epoch": 4.772117962466488, "grad_norm": 0.030948041006922722, "learning_rate": 6.173419187633201e-07, "loss": 0.0036, "step": 6675 }, { "epoch": 4.772117962466488, "eval_loss": 0.007136998698115349, "eval_runtime": 4.6144, "eval_samples_per_second": 10.836, "eval_steps_per_second": 2.817, "step": 6675 }, { "epoch": 4.7728328865058085, "grad_norm": 0.023482955992221832, "learning_rate": 6.134365310317969e-07, "loss": 0.0021, "step": 6676 }, { "epoch": 4.773547810545129, "grad_norm": 0.02526218071579933, "learning_rate": 6.09543459285894e-07, "loss": 0.0022, "step": 6677 }, { "epoch": 4.77426273458445, "grad_norm": 0.029875855892896652, "learning_rate": 6.05662704496468e-07, "loss": 0.0058, "step": 6678 }, { "epoch": 4.774977658623771, "grad_norm": 0.023400260135531425, "learning_rate": 6.017942676312838e-07, "loss": 0.0018, "step": 6679 }, { "epoch": 4.775692582663092, "grad_norm": 0.03477632626891136, "learning_rate": 5.97938149655064e-07, "loss": 0.0046, "step": 6680 }, { "epoch": 4.775692582663092, "eval_loss": 0.007088434416800737, "eval_runtime": 4.6123, 
"eval_samples_per_second": 10.841, "eval_steps_per_second": 2.819, "step": 6680 }, { "epoch": 4.776407506702413, "grad_norm": 0.02425844594836235, "learning_rate": 5.940943515294339e-07, "loss": 0.004, "step": 6681 }, { "epoch": 4.777122430741734, "grad_norm": 0.02273043803870678, "learning_rate": 5.902628742129602e-07, "loss": 0.0019, "step": 6682 }, { "epoch": 4.777837354781054, "grad_norm": 0.02398836240172386, "learning_rate": 5.864437186611394e-07, "loss": 0.0033, "step": 6683 }, { "epoch": 4.778552278820375, "grad_norm": 0.026028282940387726, "learning_rate": 5.82636885826382e-07, "loss": 0.002, "step": 6684 }, { "epoch": 4.779267202859696, "grad_norm": 0.027962813153862953, "learning_rate": 5.788423766580342e-07, "loss": 0.0027, "step": 6685 }, { "epoch": 4.779267202859696, "eval_loss": 0.007096272427588701, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6685 }, { "epoch": 4.779982126899017, "grad_norm": 0.02640692889690399, "learning_rate": 5.750601921023779e-07, "loss": 0.0045, "step": 6686 }, { "epoch": 4.780697050938338, "grad_norm": 0.03556950017809868, "learning_rate": 5.712903331026031e-07, "loss": 0.0024, "step": 6687 }, { "epoch": 4.781411974977659, "grad_norm": 0.03431599214673042, "learning_rate": 5.675328005988301e-07, "loss": 0.0024, "step": 6688 }, { "epoch": 4.78212689901698, "grad_norm": 0.02978217601776123, "learning_rate": 5.637875955281202e-07, "loss": 0.0021, "step": 6689 }, { "epoch": 4.7828418230563, "grad_norm": 0.035720884799957275, "learning_rate": 5.600547188244431e-07, "loss": 0.003, "step": 6690 }, { "epoch": 4.7828418230563, "eval_loss": 0.007122116629034281, "eval_runtime": 4.6041, "eval_samples_per_second": 10.86, "eval_steps_per_second": 2.824, "step": 6690 }, { "epoch": 4.783556747095621, "grad_norm": 0.029976513236761093, "learning_rate": 5.563341714186987e-07, "loss": 0.0034, "step": 6691 }, { "epoch": 4.784271671134942, "grad_norm": 0.027760734781622887, "learning_rate": 5.526259542387224e-07, "loss": 0.0031, "step": 6692 }, { "epoch": 4.784986595174263, "grad_norm": 0.0263595562428236, "learning_rate": 5.489300682092635e-07, "loss": 0.003, "step": 6693 }, { "epoch": 4.785701519213584, "grad_norm": 0.028622053563594818, "learning_rate": 5.452465142520014e-07, "loss": 0.0034, "step": 6694 }, { "epoch": 4.786416443252905, "grad_norm": 0.03090755082666874, "learning_rate": 5.41575293285529e-07, "loss": 0.0044, "step": 6695 }, { "epoch": 4.786416443252905, "eval_loss": 0.00710194231942296, "eval_runtime": 4.5838, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6695 }, { "epoch": 4.7871313672922255, "grad_norm": 0.03171369433403015, "learning_rate": 5.379164062253861e-07, "loss": 0.0034, "step": 6696 }, { "epoch": 4.787846291331546, "grad_norm": 0.02650466375052929, "learning_rate": 5.342698539840096e-07, "loss": 0.0026, "step": 6697 }, { "epoch": 4.7885612153708665, "grad_norm": 0.032882336527109146, "learning_rate": 5.306356374707833e-07, "loss": 0.0039, "step": 6698 }, { "epoch": 4.7892761394101875, "grad_norm": 0.021379251033067703, "learning_rate": 5.270137575920098e-07, "loss": 0.0022, "step": 6699 }, { "epoch": 4.789991063449508, "grad_norm": 0.029099859297275543, "learning_rate": 5.234042152508944e-07, "loss": 0.0031, "step": 6700 }, { "epoch": 4.789991063449508, "eval_loss": 0.007104076910763979, "eval_runtime": 4.5848, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.835, "step": 6700 }, { "epoch": 4.790705987488829, "grad_norm": 0.019480817019939423, 
"learning_rate": 5.19807011347595e-07, "loss": 0.0018, "step": 6701 }, { "epoch": 4.79142091152815, "grad_norm": 0.030488522723317146, "learning_rate": 5.162221467791772e-07, "loss": 0.0046, "step": 6702 }, { "epoch": 4.792135835567471, "grad_norm": 0.02966499701142311, "learning_rate": 5.126496224396316e-07, "loss": 0.0044, "step": 6703 }, { "epoch": 4.792850759606791, "grad_norm": 0.0285301823168993, "learning_rate": 5.090894392198731e-07, "loss": 0.0032, "step": 6704 }, { "epoch": 4.793565683646112, "grad_norm": 0.024953804910182953, "learning_rate": 5.05541598007736e-07, "loss": 0.0031, "step": 6705 }, { "epoch": 4.793565683646112, "eval_loss": 0.007126981392502785, "eval_runtime": 4.6116, "eval_samples_per_second": 10.842, "eval_steps_per_second": 2.819, "step": 6705 }, { "epoch": 4.794280607685433, "grad_norm": 0.022712228819727898, "learning_rate": 5.020060996879738e-07, "loss": 0.0022, "step": 6706 }, { "epoch": 4.794995531724754, "grad_norm": 0.024286173284053802, "learning_rate": 4.984829451422756e-07, "loss": 0.0031, "step": 6707 }, { "epoch": 4.795710455764075, "grad_norm": 0.02719913050532341, "learning_rate": 4.949721352492387e-07, "loss": 0.0025, "step": 6708 }, { "epoch": 4.796425379803396, "grad_norm": 0.022794518619775772, "learning_rate": 4.914736708843847e-07, "loss": 0.002, "step": 6709 }, { "epoch": 4.797140303842717, "grad_norm": 0.025777217000722885, "learning_rate": 4.879875529201606e-07, "loss": 0.0036, "step": 6710 }, { "epoch": 4.797140303842717, "eval_loss": 0.007094335276633501, "eval_runtime": 4.5859, "eval_samples_per_second": 10.903, "eval_steps_per_second": 2.835, "step": 6710 }, { "epoch": 4.797855227882038, "grad_norm": 0.023906420916318893, "learning_rate": 4.84513782225926e-07, "loss": 0.002, "step": 6711 }, { "epoch": 4.798570151921359, "grad_norm": 0.02724265493452549, "learning_rate": 4.810523596679772e-07, "loss": 0.0025, "step": 6712 }, { "epoch": 4.799285075960679, "grad_norm": 0.02728438563644886, "learning_rate": 4.776032861095181e-07, "loss": 0.004, "step": 6713 }, { "epoch": 4.8, "grad_norm": 0.02614464983344078, "learning_rate": 4.741665624106717e-07, "loss": 0.003, "step": 6714 }, { "epoch": 4.800714924039321, "grad_norm": 0.017296049743890762, "learning_rate": 4.7074218942849135e-07, "loss": 0.0013, "step": 6715 }, { "epoch": 4.800714924039321, "eval_loss": 0.007130227517336607, "eval_runtime": 4.5849, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 6715 }, { "epoch": 4.801429848078642, "grad_norm": 0.0311876330524683, "learning_rate": 4.673301680169384e-07, "loss": 0.0026, "step": 6716 }, { "epoch": 4.802144772117963, "grad_norm": 0.030413787811994553, "learning_rate": 4.639304990269044e-07, "loss": 0.0044, "step": 6717 }, { "epoch": 4.802859696157284, "grad_norm": 0.02814820408821106, "learning_rate": 4.6054318330620016e-07, "loss": 0.0028, "step": 6718 }, { "epoch": 4.8035746201966045, "grad_norm": 0.025243472307920456, "learning_rate": 4.571682216995443e-07, "loss": 0.003, "step": 6719 }, { "epoch": 4.804289544235925, "grad_norm": 0.02441585622727871, "learning_rate": 4.5380561504858585e-07, "loss": 0.0039, "step": 6720 }, { "epoch": 4.804289544235925, "eval_loss": 0.00712159089744091, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 6720 }, { "epoch": 4.8050044682752455, "grad_norm": 0.028056811541318893, "learning_rate": 4.5045536419188736e-07, "loss": 0.0038, "step": 6721 }, { "epoch": 4.8057193923145665, "grad_norm": 0.026951603591442108, "learning_rate": 
4.471174699649361e-07, "loss": 0.0045, "step": 6722 }, { "epoch": 4.806434316353887, "grad_norm": 0.027767403051257133, "learning_rate": 4.437919332001328e-07, "loss": 0.0018, "step": 6723 }, { "epoch": 4.807149240393208, "grad_norm": 0.03765980899333954, "learning_rate": 4.4047875472679187e-07, "loss": 0.0038, "step": 6724 }, { "epoch": 4.807864164432529, "grad_norm": 0.026144132018089294, "learning_rate": 4.37177935371158e-07, "loss": 0.0041, "step": 6725 }, { "epoch": 4.807864164432529, "eval_loss": 0.007101630792021751, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6725 }, { "epoch": 4.80857908847185, "grad_norm": 0.032040566205978394, "learning_rate": 4.338894759563894e-07, "loss": 0.0024, "step": 6726 }, { "epoch": 4.80929401251117, "grad_norm": 0.024019889533519745, "learning_rate": 4.306133773025467e-07, "loss": 0.0025, "step": 6727 }, { "epoch": 4.810008936550491, "grad_norm": 0.02578655630350113, "learning_rate": 4.273496402266319e-07, "loss": 0.0033, "step": 6728 }, { "epoch": 4.810723860589812, "grad_norm": 0.02416910044848919, "learning_rate": 4.2409826554255515e-07, "loss": 0.0023, "step": 6729 }, { "epoch": 4.811438784629133, "grad_norm": 0.023016389459371567, "learning_rate": 4.208592540611289e-07, "loss": 0.0025, "step": 6730 }, { "epoch": 4.811438784629133, "eval_loss": 0.007083797827363014, "eval_runtime": 4.5789, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 6730 }, { "epoch": 4.812153708668454, "grad_norm": 0.03109103813767433, "learning_rate": 4.176326065901126e-07, "loss": 0.0051, "step": 6731 }, { "epoch": 4.812868632707775, "grad_norm": 0.029437914490699768, "learning_rate": 4.144183239341515e-07, "loss": 0.0037, "step": 6732 }, { "epoch": 4.813583556747096, "grad_norm": 0.023493047803640366, "learning_rate": 4.1121640689482655e-07, "loss": 0.0033, "step": 6733 }, { "epoch": 4.814298480786416, "grad_norm": 0.02720562368631363, "learning_rate": 4.0802685627063244e-07, "loss": 0.0045, "step": 6734 }, { "epoch": 4.815013404825737, "grad_norm": 0.025364330038428307, "learning_rate": 4.048496728569717e-07, "loss": 0.0034, "step": 6735 }, { "epoch": 4.815013404825737, "eval_loss": 0.007114733569324017, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6735 }, { "epoch": 4.815728328865058, "grad_norm": 0.032858964055776596, "learning_rate": 4.0168485744617177e-07, "loss": 0.0041, "step": 6736 }, { "epoch": 4.816443252904379, "grad_norm": 0.030025754123926163, "learning_rate": 3.9853241082746793e-07, "loss": 0.004, "step": 6737 }, { "epoch": 4.8171581769437, "grad_norm": 0.03781416267156601, "learning_rate": 3.9539233378701467e-07, "loss": 0.0063, "step": 6738 }, { "epoch": 4.817873100983021, "grad_norm": 0.02569807693362236, "learning_rate": 3.9226462710789114e-07, "loss": 0.004, "step": 6739 }, { "epoch": 4.818588025022342, "grad_norm": 0.02872972935438156, "learning_rate": 3.8914929157007894e-07, "loss": 0.0046, "step": 6740 }, { "epoch": 4.818588025022342, "eval_loss": 0.0071137454360723495, "eval_runtime": 4.5923, "eval_samples_per_second": 10.888, "eval_steps_per_second": 2.831, "step": 6740 }, { "epoch": 4.819302949061663, "grad_norm": 0.02224288508296013, "learning_rate": 3.860463279504678e-07, "loss": 0.0037, "step": 6741 }, { "epoch": 4.8200178731009835, "grad_norm": 0.028444349765777588, "learning_rate": 3.8295573702288866e-07, "loss": 0.0043, "step": 6742 }, { "epoch": 4.820732797140304, "grad_norm": 0.030448470264673233, 
"learning_rate": 3.798775195580584e-07, "loss": 0.0039, "step": 6743 }, { "epoch": 4.8214477211796245, "grad_norm": 0.027595970779657364, "learning_rate": 3.76811676323624e-07, "loss": 0.0038, "step": 6744 }, { "epoch": 4.822162645218945, "grad_norm": 0.028610622510313988, "learning_rate": 3.737582080841462e-07, "loss": 0.0036, "step": 6745 }, { "epoch": 4.822162645218945, "eval_loss": 0.007148386910557747, "eval_runtime": 4.582, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6745 }, { "epoch": 4.822877569258266, "grad_norm": 0.022199368104338646, "learning_rate": 3.707171156010936e-07, "loss": 0.0021, "step": 6746 }, { "epoch": 4.823592493297587, "grad_norm": 0.03254622966051102, "learning_rate": 3.6768839963285394e-07, "loss": 0.0051, "step": 6747 }, { "epoch": 4.824307417336908, "grad_norm": 0.022835036739706993, "learning_rate": 3.6467206093472315e-07, "loss": 0.0023, "step": 6748 }, { "epoch": 4.825022341376229, "grad_norm": 0.02893936075270176, "learning_rate": 3.616681002589162e-07, "loss": 0.0021, "step": 6749 }, { "epoch": 4.825737265415549, "grad_norm": 0.03469804301857948, "learning_rate": 3.5867651835456153e-07, "loss": 0.0034, "step": 6750 }, { "epoch": 4.825737265415549, "eval_loss": 0.007112746126949787, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6750 }, { "epoch": 4.82645218945487, "grad_norm": 0.024782026186585426, "learning_rate": 3.556973159676902e-07, "loss": 0.0034, "step": 6751 }, { "epoch": 4.827167113494191, "grad_norm": 0.02829751931130886, "learning_rate": 3.5273049384126343e-07, "loss": 0.0041, "step": 6752 }, { "epoch": 4.827882037533512, "grad_norm": 0.03237837553024292, "learning_rate": 3.4977605271512834e-07, "loss": 0.0031, "step": 6753 }, { "epoch": 4.828596961572833, "grad_norm": 0.034434858709573746, "learning_rate": 3.468339933260789e-07, "loss": 0.0026, "step": 6754 }, { "epoch": 4.829311885612154, "grad_norm": 0.027199918404221535, "learning_rate": 3.439043164078004e-07, "loss": 0.0034, "step": 6755 }, { "epoch": 4.829311885612154, "eval_loss": 0.007139893714338541, "eval_runtime": 4.5845, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 6755 }, { "epoch": 4.830026809651475, "grad_norm": 0.026004401966929436, "learning_rate": 3.409870226908862e-07, "loss": 0.0025, "step": 6756 }, { "epoch": 4.830741733690795, "grad_norm": 0.02950894832611084, "learning_rate": 3.380821129028489e-07, "loss": 0.0031, "step": 6757 }, { "epoch": 4.831456657730116, "grad_norm": 0.026813827455043793, "learning_rate": 3.351895877681255e-07, "loss": 0.002, "step": 6758 }, { "epoch": 4.832171581769437, "grad_norm": 0.026318350806832314, "learning_rate": 3.323094480080335e-07, "loss": 0.0027, "step": 6759 }, { "epoch": 4.832886505808758, "grad_norm": 0.023023098707199097, "learning_rate": 3.294416943408374e-07, "loss": 0.0023, "step": 6760 }, { "epoch": 4.832886505808758, "eval_loss": 0.007140443194657564, "eval_runtime": 4.594, "eval_samples_per_second": 10.884, "eval_steps_per_second": 2.83, "step": 6760 }, { "epoch": 4.833601429848079, "grad_norm": 0.029360996559262276, "learning_rate": 3.265863274816872e-07, "loss": 0.0042, "step": 6761 }, { "epoch": 4.8343163538874, "grad_norm": 0.0268258024007082, "learning_rate": 3.237433481426522e-07, "loss": 0.0025, "step": 6762 }, { "epoch": 4.835031277926721, "grad_norm": 0.03158620744943619, "learning_rate": 3.209127570327153e-07, "loss": 0.0046, "step": 6763 }, { "epoch": 4.835746201966041, "grad_norm": 0.022775698453187943, 
"learning_rate": 3.1809455485776185e-07, "loss": 0.0021, "step": 6764 }, { "epoch": 4.836461126005362, "grad_norm": 0.022351492196321487, "learning_rate": 3.1528874232059636e-07, "loss": 0.0017, "step": 6765 }, { "epoch": 4.836461126005362, "eval_loss": 0.007111313287168741, "eval_runtime": 4.583, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6765 }, { "epoch": 4.8371760500446825, "grad_norm": 0.02236177958548069, "learning_rate": 3.124953201209313e-07, "loss": 0.0024, "step": 6766 }, { "epoch": 4.8378909740840035, "grad_norm": 0.03420475497841835, "learning_rate": 3.0971428895538723e-07, "loss": 0.0042, "step": 6767 }, { "epoch": 4.838605898123324, "grad_norm": 0.028488466516137123, "learning_rate": 3.0694564951749825e-07, "loss": 0.0038, "step": 6768 }, { "epoch": 4.839320822162645, "grad_norm": 0.029118159785866737, "learning_rate": 3.04189402497701e-07, "loss": 0.0048, "step": 6769 }, { "epoch": 4.840035746201966, "grad_norm": 0.024659236893057823, "learning_rate": 3.014455485833456e-07, "loss": 0.0025, "step": 6770 }, { "epoch": 4.840035746201966, "eval_loss": 0.0071272458881139755, "eval_runtime": 4.585, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 6770 }, { "epoch": 4.840750670241287, "grad_norm": 0.025803036987781525, "learning_rate": 2.987140884587014e-07, "loss": 0.0029, "step": 6771 }, { "epoch": 4.841465594280607, "grad_norm": 0.02873227186501026, "learning_rate": 2.9599502280492906e-07, "loss": 0.0036, "step": 6772 }, { "epoch": 4.842180518319928, "grad_norm": 0.02976594679057598, "learning_rate": 2.93288352300114e-07, "loss": 0.0037, "step": 6773 }, { "epoch": 4.842895442359249, "grad_norm": 0.025964578613638878, "learning_rate": 2.905940776192384e-07, "loss": 0.0024, "step": 6774 }, { "epoch": 4.84361036639857, "grad_norm": 0.02816200628876686, "learning_rate": 2.879121994341982e-07, "loss": 0.0028, "step": 6775 }, { "epoch": 4.84361036639857, "eval_loss": 0.0071220481768250465, "eval_runtime": 4.5815, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 6775 }, { "epoch": 4.844325290437891, "grad_norm": 0.03246309608221054, "learning_rate": 2.852427184138029e-07, "loss": 0.0038, "step": 6776 }, { "epoch": 4.845040214477212, "grad_norm": 0.027654917910695076, "learning_rate": 2.8258563522375883e-07, "loss": 0.0035, "step": 6777 }, { "epoch": 4.845755138516533, "grad_norm": 0.028543680906295776, "learning_rate": 2.7994095052669145e-07, "loss": 0.0025, "step": 6778 }, { "epoch": 4.846470062555854, "grad_norm": 0.027074092999100685, "learning_rate": 2.773086649821344e-07, "loss": 0.0025, "step": 6779 }, { "epoch": 4.847184986595174, "grad_norm": 0.02477843128144741, "learning_rate": 2.7468877924651803e-07, "loss": 0.0021, "step": 6780 }, { "epoch": 4.847184986595174, "eval_loss": 0.0071130176074802876, "eval_runtime": 4.6077, "eval_samples_per_second": 10.851, "eval_steps_per_second": 2.821, "step": 6780 }, { "epoch": 4.847899910634495, "grad_norm": 0.02335168421268463, "learning_rate": 2.7208129397319206e-07, "loss": 0.0019, "step": 6781 }, { "epoch": 4.848614834673816, "grad_norm": 0.03250279277563095, "learning_rate": 2.694862098124085e-07, "loss": 0.0058, "step": 6782 }, { "epoch": 4.849329758713137, "grad_norm": 0.025291085243225098, "learning_rate": 2.6690352741132743e-07, "loss": 0.0021, "step": 6783 }, { "epoch": 4.850044682752458, "grad_norm": 0.021854551509022713, "learning_rate": 2.64333247414017e-07, "loss": 0.0026, "step": 6784 }, { "epoch": 4.850759606791779, "grad_norm": 
0.026782630011439323, "learning_rate": 2.617753704614478e-07, "loss": 0.0025, "step": 6785 }, { "epoch": 4.850759606791779, "eval_loss": 0.007126114796847105, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6785 }, { "epoch": 4.8514745308311, "grad_norm": 0.028495093807578087, "learning_rate": 2.5922989719150417e-07, "loss": 0.0028, "step": 6786 }, { "epoch": 4.85218945487042, "grad_norm": 0.022889984771609306, "learning_rate": 2.566968282389781e-07, "loss": 0.0025, "step": 6787 }, { "epoch": 4.852904378909741, "grad_norm": 0.03162172809243202, "learning_rate": 2.541761642355589e-07, "loss": 0.0058, "step": 6788 }, { "epoch": 4.8536193029490615, "grad_norm": 0.03286117687821388, "learning_rate": 2.516679058098492e-07, "loss": 0.0058, "step": 6789 }, { "epoch": 4.8543342269883825, "grad_norm": 0.0230341088026762, "learning_rate": 2.4917205358735984e-07, "loss": 0.0019, "step": 6790 }, { "epoch": 4.8543342269883825, "eval_loss": 0.0071070995181798935, "eval_runtime": 4.5803, "eval_samples_per_second": 10.916, "eval_steps_per_second": 2.838, "step": 6790 }, { "epoch": 4.855049151027703, "grad_norm": 0.02279876358807087, "learning_rate": 2.46688608190504e-07, "loss": 0.0021, "step": 6791 }, { "epoch": 4.855764075067024, "grad_norm": 0.030162794515490532, "learning_rate": 2.4421757023859735e-07, "loss": 0.0036, "step": 6792 }, { "epoch": 4.856478999106345, "grad_norm": 0.02902781218290329, "learning_rate": 2.4175894034786927e-07, "loss": 0.0027, "step": 6793 }, { "epoch": 4.857193923145665, "grad_norm": 0.028916487470269203, "learning_rate": 2.393127191314459e-07, "loss": 0.0019, "step": 6794 }, { "epoch": 4.857908847184986, "grad_norm": 0.026290234178304672, "learning_rate": 2.3687890719937268e-07, "loss": 0.0049, "step": 6795 }, { "epoch": 4.857908847184986, "eval_loss": 0.0071039521135389805, "eval_runtime": 4.5823, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6795 }, { "epoch": 4.858623771224307, "grad_norm": 0.02660653553903103, "learning_rate": 2.3445750515858066e-07, "loss": 0.003, "step": 6796 }, { "epoch": 4.859338695263628, "grad_norm": 0.0260777585208416, "learning_rate": 2.3204851361293133e-07, "loss": 0.003, "step": 6797 }, { "epoch": 4.860053619302949, "grad_norm": 0.02550615184009075, "learning_rate": 2.2965193316316636e-07, "loss": 0.0033, "step": 6798 }, { "epoch": 4.86076854334227, "grad_norm": 0.04171382635831833, "learning_rate": 2.2726776440694654e-07, "loss": 0.0044, "step": 6799 }, { "epoch": 4.861483467381591, "grad_norm": 0.02669014036655426, "learning_rate": 2.2489600793883515e-07, "loss": 0.0033, "step": 6800 }, { "epoch": 4.861483467381591, "eval_loss": 0.00712747173383832, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6800 }, { "epoch": 4.862198391420911, "grad_norm": 0.03636538237333298, "learning_rate": 2.2253666435029797e-07, "loss": 0.0075, "step": 6801 }, { "epoch": 4.862913315460232, "grad_norm": 0.03036675602197647, "learning_rate": 2.201897342297088e-07, "loss": 0.003, "step": 6802 }, { "epoch": 4.863628239499553, "grad_norm": 0.022058192640542984, "learning_rate": 2.1785521816233835e-07, "loss": 0.002, "step": 6803 }, { "epoch": 4.864343163538874, "grad_norm": 0.03180858492851257, "learning_rate": 2.155331167303709e-07, "loss": 0.0062, "step": 6804 }, { "epoch": 4.865058087578195, "grad_norm": 0.02261153608560562, "learning_rate": 2.1322343051289327e-07, "loss": 0.0027, "step": 6805 }, { "epoch": 4.865058087578195, 
"eval_loss": 0.007083738688379526, "eval_runtime": 4.591, "eval_samples_per_second": 10.891, "eval_steps_per_second": 2.832, "step": 6805 }, { "epoch": 4.865773011617516, "grad_norm": 0.023570584133267403, "learning_rate": 2.1092616008588363e-07, "loss": 0.0019, "step": 6806 }, { "epoch": 4.866487935656837, "grad_norm": 0.027486974373459816, "learning_rate": 2.086413060222392e-07, "loss": 0.0035, "step": 6807 }, { "epoch": 4.867202859696158, "grad_norm": 0.028589727357029915, "learning_rate": 2.0636886889175977e-07, "loss": 0.0035, "step": 6808 }, { "epoch": 4.867917783735479, "grad_norm": 0.022381730377674103, "learning_rate": 2.0410884926113094e-07, "loss": 0.002, "step": 6809 }, { "epoch": 4.868632707774799, "grad_norm": 0.026767365634441376, "learning_rate": 2.0186124769396851e-07, "loss": 0.0023, "step": 6810 }, { "epoch": 4.868632707774799, "eval_loss": 0.007128124590963125, "eval_runtime": 4.5826, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6810 }, { "epoch": 4.86934763181412, "grad_norm": 0.03145160153508186, "learning_rate": 1.996260647507797e-07, "loss": 0.0028, "step": 6811 }, { "epoch": 4.8700625558534405, "grad_norm": 0.03480617329478264, "learning_rate": 1.9740330098895754e-07, "loss": 0.0051, "step": 6812 }, { "epoch": 4.870777479892761, "grad_norm": 0.035965315997600555, "learning_rate": 1.951929569628197e-07, "loss": 0.007, "step": 6813 }, { "epoch": 4.871492403932082, "grad_norm": 0.021068181842565536, "learning_rate": 1.9299503322358637e-07, "loss": 0.0025, "step": 6814 }, { "epoch": 4.872207327971403, "grad_norm": 0.0288289412856102, "learning_rate": 1.908095303193691e-07, "loss": 0.0039, "step": 6815 }, { "epoch": 4.872207327971403, "eval_loss": 0.007130607962608337, "eval_runtime": 4.5817, "eval_samples_per_second": 10.913, "eval_steps_per_second": 2.837, "step": 6815 }, { "epoch": 4.872922252010724, "grad_norm": 0.022619254887104034, "learning_rate": 1.886364487951875e-07, "loss": 0.0023, "step": 6816 }, { "epoch": 4.873637176050044, "grad_norm": 0.024408120661973953, "learning_rate": 1.864757891929636e-07, "loss": 0.0019, "step": 6817 }, { "epoch": 4.874352100089365, "grad_norm": 0.023991411551833153, "learning_rate": 1.8432755205152197e-07, "loss": 0.0021, "step": 6818 }, { "epoch": 4.875067024128686, "grad_norm": 0.028100362047553062, "learning_rate": 1.8219173790658407e-07, "loss": 0.0033, "step": 6819 }, { "epoch": 4.875781948168007, "grad_norm": 0.029975248500704765, "learning_rate": 1.8006834729077938e-07, "loss": 0.0034, "step": 6820 }, { "epoch": 4.875781948168007, "eval_loss": 0.007115339860320091, "eval_runtime": 4.5992, "eval_samples_per_second": 10.872, "eval_steps_per_second": 2.827, "step": 6820 }, { "epoch": 4.876496872207328, "grad_norm": 0.030811741948127747, "learning_rate": 1.7795738073364542e-07, "loss": 0.0023, "step": 6821 }, { "epoch": 4.877211796246649, "grad_norm": 0.028544917702674866, "learning_rate": 1.7585883876160002e-07, "loss": 0.0038, "step": 6822 }, { "epoch": 4.87792672028597, "grad_norm": 0.022802967578172684, "learning_rate": 1.7377272189799121e-07, "loss": 0.0019, "step": 6823 }, { "epoch": 4.87864164432529, "grad_norm": 0.031619857996702194, "learning_rate": 1.7169903066303617e-07, "loss": 0.0067, "step": 6824 }, { "epoch": 4.879356568364611, "grad_norm": 0.02724311128258705, "learning_rate": 1.6963776557388234e-07, "loss": 0.0025, "step": 6825 }, { "epoch": 4.879356568364611, "eval_loss": 0.007098337635397911, "eval_runtime": 4.6146, "eval_samples_per_second": 10.835, "eval_steps_per_second": 
2.817, "step": 6825 }, { "epoch": 4.880071492403932, "grad_norm": 0.02966611087322235, "learning_rate": 1.6758892714456852e-07, "loss": 0.0038, "step": 6826 }, { "epoch": 4.880786416443253, "grad_norm": 0.023262901231646538, "learning_rate": 1.6555251588602493e-07, "loss": 0.0019, "step": 6827 }, { "epoch": 4.881501340482574, "grad_norm": 0.024673065170645714, "learning_rate": 1.6352853230609532e-07, "loss": 0.002, "step": 6828 }, { "epoch": 4.882216264521895, "grad_norm": 0.02950267866253853, "learning_rate": 1.6151697690951483e-07, "loss": 0.0037, "step": 6829 }, { "epoch": 4.882931188561216, "grad_norm": 0.024512523785233498, "learning_rate": 1.5951785019792664e-07, "loss": 0.0043, "step": 6830 }, { "epoch": 4.882931188561216, "eval_loss": 0.007126966957002878, "eval_runtime": 4.5799, "eval_samples_per_second": 10.917, "eval_steps_per_second": 2.838, "step": 6830 }, { "epoch": 4.883646112600536, "grad_norm": 0.02194334752857685, "learning_rate": 1.575311526698653e-07, "loss": 0.0024, "step": 6831 }, { "epoch": 4.884361036639857, "grad_norm": 0.03059895522892475, "learning_rate": 1.5555688482078445e-07, "loss": 0.0057, "step": 6832 }, { "epoch": 4.885075960679178, "grad_norm": 0.033055659383535385, "learning_rate": 1.5359504714301808e-07, "loss": 0.0043, "step": 6833 }, { "epoch": 4.8857908847184985, "grad_norm": 0.029091227799654007, "learning_rate": 1.516456401258082e-07, "loss": 0.0023, "step": 6834 }, { "epoch": 4.8865058087578195, "grad_norm": 0.021625056862831116, "learning_rate": 1.4970866425529918e-07, "loss": 0.0024, "step": 6835 }, { "epoch": 4.8865058087578195, "eval_loss": 0.007114688400179148, "eval_runtime": 4.6005, "eval_samples_per_second": 10.868, "eval_steps_per_second": 2.826, "step": 6835 }, { "epoch": 4.88722073279714, "grad_norm": 0.02330034039914608, "learning_rate": 1.477841200145269e-07, "loss": 0.0025, "step": 6836 }, { "epoch": 4.887935656836461, "grad_norm": 0.023897996172308922, "learning_rate": 1.4587200788343524e-07, "loss": 0.002, "step": 6837 }, { "epoch": 4.888650580875782, "grad_norm": 0.02808534912765026, "learning_rate": 1.4397232833887052e-07, "loss": 0.003, "step": 6838 }, { "epoch": 4.889365504915103, "grad_norm": 0.034650273621082306, "learning_rate": 1.4208508185456492e-07, "loss": 0.005, "step": 6839 }, { "epoch": 4.890080428954423, "grad_norm": 0.01973002776503563, "learning_rate": 1.4021026890116418e-07, "loss": 0.0025, "step": 6840 }, { "epoch": 4.890080428954423, "eval_loss": 0.007107625715434551, "eval_runtime": 4.5844, "eval_samples_per_second": 10.906, "eval_steps_per_second": 2.836, "step": 6840 }, { "epoch": 4.890795352993744, "grad_norm": 0.03723856061697006, "learning_rate": 1.383478899462054e-07, "loss": 0.011, "step": 6841 }, { "epoch": 4.891510277033065, "grad_norm": 0.02507670223712921, "learning_rate": 1.3649794545413374e-07, "loss": 0.0024, "step": 6842 }, { "epoch": 4.892225201072386, "grad_norm": 0.02865346148610115, "learning_rate": 1.346604358862802e-07, "loss": 0.0025, "step": 6843 }, { "epoch": 4.892940125111707, "grad_norm": 0.02406882867217064, "learning_rate": 1.3283536170087818e-07, "loss": 0.003, "step": 6844 }, { "epoch": 4.893655049151028, "grad_norm": 0.029625670984387398, "learning_rate": 1.3102272335307475e-07, "loss": 0.0035, "step": 6845 }, { "epoch": 4.893655049151028, "eval_loss": 0.007107242941856384, "eval_runtime": 4.5901, "eval_samples_per_second": 10.893, "eval_steps_per_second": 2.832, "step": 6845 }, { "epoch": 4.894369973190349, "grad_norm": 0.03281213715672493, "learning_rate": 
1.2922252129489166e-07, "loss": 0.0064, "step": 6846 }, { "epoch": 4.895084897229669, "grad_norm": 0.019756978377699852, "learning_rate": 1.2743475597526978e-07, "loss": 0.0017, "step": 6847 }, { "epoch": 4.89579982126899, "grad_norm": 0.023294880986213684, "learning_rate": 1.2565942784004692e-07, "loss": 0.0029, "step": 6848 }, { "epoch": 4.896514745308311, "grad_norm": 0.02597140520811081, "learning_rate": 1.2389653733193562e-07, "loss": 0.0031, "step": 6849 }, { "epoch": 4.897229669347632, "grad_norm": 0.024746332317590714, "learning_rate": 1.2214608489057865e-07, "loss": 0.0036, "step": 6850 }, { "epoch": 4.897229669347632, "eval_loss": 0.007126267533749342, "eval_runtime": 4.5863, "eval_samples_per_second": 10.902, "eval_steps_per_second": 2.835, "step": 6850 }, { "epoch": 4.897944593386953, "grad_norm": 0.04341399297118187, "learning_rate": 1.2040807095249908e-07, "loss": 0.0059, "step": 6851 }, { "epoch": 4.898659517426274, "grad_norm": 0.028967810794711113, "learning_rate": 1.186824959511168e-07, "loss": 0.0025, "step": 6852 }, { "epoch": 4.899374441465595, "grad_norm": 0.02804664894938469, "learning_rate": 1.1696936031676542e-07, "loss": 0.0033, "step": 6853 }, { "epoch": 4.900089365504915, "grad_norm": 0.028013844043016434, "learning_rate": 1.1526866447665319e-07, "loss": 0.003, "step": 6854 }, { "epoch": 4.900804289544236, "grad_norm": 0.03297724574804306, "learning_rate": 1.1358040885490195e-07, "loss": 0.006, "step": 6855 }, { "epoch": 4.900804289544236, "eval_loss": 0.0071110655553638935, "eval_runtime": 4.5848, "eval_samples_per_second": 10.905, "eval_steps_per_second": 2.835, "step": 6855 }, { "epoch": 4.901519213583557, "grad_norm": 0.028112847357988358, "learning_rate": 1.1190459387253049e-07, "loss": 0.0027, "step": 6856 }, { "epoch": 4.9022341376228775, "grad_norm": 0.028199411928653717, "learning_rate": 1.1024121994745451e-07, "loss": 0.0035, "step": 6857 }, { "epoch": 4.9029490616621985, "grad_norm": 0.02595345489680767, "learning_rate": 1.0859028749447552e-07, "loss": 0.0042, "step": 6858 }, { "epoch": 4.903663985701519, "grad_norm": 0.02451811358332634, "learning_rate": 1.069517969253142e-07, "loss": 0.0021, "step": 6859 }, { "epoch": 4.90437890974084, "grad_norm": 0.02465469017624855, "learning_rate": 1.0532574864856593e-07, "loss": 0.0033, "step": 6860 }, { "epoch": 4.90437890974084, "eval_loss": 0.00709068076685071, "eval_runtime": 4.5931, "eval_samples_per_second": 10.886, "eval_steps_per_second": 2.83, "step": 6860 }, { "epoch": 4.90509383378016, "grad_norm": 0.025117335841059685, "learning_rate": 1.0371214306973965e-07, "loss": 0.0021, "step": 6861 }, { "epoch": 4.905808757819481, "grad_norm": 0.03511350601911545, "learning_rate": 1.0211098059123015e-07, "loss": 0.0038, "step": 6862 }, { "epoch": 4.906523681858802, "grad_norm": 0.026646040380001068, "learning_rate": 1.0052226161234025e-07, "loss": 0.0022, "step": 6863 }, { "epoch": 4.907238605898123, "grad_norm": 0.023409776389598846, "learning_rate": 9.894598652925858e-08, "loss": 0.0028, "step": 6864 }, { "epoch": 4.907953529937444, "grad_norm": 0.026857469230890274, "learning_rate": 9.738215573507625e-08, "loss": 0.0037, "step": 6865 }, { "epoch": 4.907953529937444, "eval_loss": 0.007127192337065935, "eval_runtime": 4.5957, "eval_samples_per_second": 10.88, "eval_steps_per_second": 2.829, "step": 6865 }, { "epoch": 4.908668453976765, "grad_norm": 0.0277948509901762, "learning_rate": 9.583076961978133e-08, "loss": 0.0033, "step": 6866 }, { "epoch": 4.909383378016086, "grad_norm": 0.025462443009018898, 
"learning_rate": 9.429182857025876e-08, "loss": 0.0045, "step": 6867 }, { "epoch": 4.910098302055407, "grad_norm": 0.032636720687150955, "learning_rate": 9.27653329702849e-08, "loss": 0.0024, "step": 6868 }, { "epoch": 4.910813226094728, "grad_norm": 0.029532428830862045, "learning_rate": 9.125128320053855e-08, "loss": 0.005, "step": 6869 }, { "epoch": 4.911528150134048, "grad_norm": 0.03504151105880737, "learning_rate": 8.974967963858994e-08, "loss": 0.0043, "step": 6870 }, { "epoch": 4.911528150134048, "eval_loss": 0.007110781501978636, "eval_runtime": 4.589, "eval_samples_per_second": 10.896, "eval_steps_per_second": 2.833, "step": 6870 }, { "epoch": 4.912243074173369, "grad_norm": 0.025449581444263458, "learning_rate": 8.826052265891172e-08, "loss": 0.0019, "step": 6871 }, { "epoch": 4.91295799821269, "grad_norm": 0.030254308134317398, "learning_rate": 8.67838126328735e-08, "loss": 0.005, "step": 6872 }, { "epoch": 4.913672922252011, "grad_norm": 0.034500282257795334, "learning_rate": 8.53195499287196e-08, "loss": 0.0041, "step": 6873 }, { "epoch": 4.914387846291332, "grad_norm": 0.027447450906038284, "learning_rate": 8.386773491162459e-08, "loss": 0.0016, "step": 6874 }, { "epoch": 4.915102770330653, "grad_norm": 0.024216625839471817, "learning_rate": 8.242836794362662e-08, "loss": 0.003, "step": 6875 }, { "epoch": 4.915102770330653, "eval_loss": 0.0071187857538461685, "eval_runtime": 4.5796, "eval_samples_per_second": 10.918, "eval_steps_per_second": 2.839, "step": 6875 }, { "epoch": 4.915817694369974, "grad_norm": 0.03147822618484497, "learning_rate": 8.100144938368304e-08, "loss": 0.0054, "step": 6876 }, { "epoch": 4.916532618409294, "grad_norm": 0.02892131544649601, "learning_rate": 7.958697958763694e-08, "loss": 0.0033, "step": 6877 }, { "epoch": 4.917247542448615, "grad_norm": 0.03215862810611725, "learning_rate": 7.81849589082284e-08, "loss": 0.0035, "step": 6878 }, { "epoch": 4.917962466487936, "grad_norm": 0.026057209819555283, "learning_rate": 7.679538769508888e-08, "loss": 0.0024, "step": 6879 }, { "epoch": 4.9186773905272565, "grad_norm": 0.028184980154037476, "learning_rate": 7.541826629474669e-08, "loss": 0.0023, "step": 6880 }, { "epoch": 4.9186773905272565, "eval_loss": 0.007115709595382214, "eval_runtime": 4.5821, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6880 }, { "epoch": 4.919392314566577, "grad_norm": 0.025123916566371918, "learning_rate": 7.40535950506327e-08, "loss": 0.0043, "step": 6881 }, { "epoch": 4.920107238605898, "grad_norm": 0.020858226343989372, "learning_rate": 7.270137430306356e-08, "loss": 0.0019, "step": 6882 }, { "epoch": 4.920822162645219, "grad_norm": 0.032553158700466156, "learning_rate": 7.136160438925843e-08, "loss": 0.0032, "step": 6883 }, { "epoch": 4.921537086684539, "grad_norm": 0.026626765727996826, "learning_rate": 7.003428564332782e-08, "loss": 0.0044, "step": 6884 }, { "epoch": 4.92225201072386, "grad_norm": 0.03459125757217407, "learning_rate": 6.871941839627915e-08, "loss": 0.0047, "step": 6885 }, { "epoch": 4.92225201072386, "eval_loss": 0.007115733344107866, "eval_runtime": 4.5839, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6885 }, { "epoch": 4.922966934763181, "grad_norm": 0.026028065010905266, "learning_rate": 6.741700297600573e-08, "loss": 0.0039, "step": 6886 }, { "epoch": 4.923681858802502, "grad_norm": 0.031552284955978394, "learning_rate": 6.612703970730883e-08, "loss": 0.0051, "step": 6887 }, { "epoch": 4.924396782841823, "grad_norm": 0.03280484676361084, 
"learning_rate": 6.484952891188112e-08, "loss": 0.0045, "step": 6888 }, { "epoch": 4.925111706881144, "grad_norm": 0.023373156785964966, "learning_rate": 6.358447090829555e-08, "loss": 0.0026, "step": 6889 }, { "epoch": 4.925826630920465, "grad_norm": 0.02992599457502365, "learning_rate": 6.233186601204422e-08, "loss": 0.003, "step": 6890 }, { "epoch": 4.925826630920465, "eval_loss": 0.0071330140344798565, "eval_runtime": 4.5832, "eval_samples_per_second": 10.909, "eval_steps_per_second": 2.836, "step": 6890 }, { "epoch": 4.926541554959785, "grad_norm": 0.029038701206445694, "learning_rate": 6.109171453549944e-08, "loss": 0.0034, "step": 6891 }, { "epoch": 4.927256478999106, "grad_norm": 0.02778642624616623, "learning_rate": 5.986401678791942e-08, "loss": 0.0032, "step": 6892 }, { "epoch": 4.927971403038427, "grad_norm": 0.027730818837881088, "learning_rate": 5.864877307547589e-08, "loss": 0.0036, "step": 6893 }, { "epoch": 4.928686327077748, "grad_norm": 0.03465179353952408, "learning_rate": 5.7445983701226444e-08, "loss": 0.0045, "step": 6894 }, { "epoch": 4.929401251117069, "grad_norm": 0.02824784256517887, "learning_rate": 5.625564896511448e-08, "loss": 0.0043, "step": 6895 }, { "epoch": 4.929401251117069, "eval_loss": 0.007121429778635502, "eval_runtime": 4.5787, "eval_samples_per_second": 10.92, "eval_steps_per_second": 2.839, "step": 6895 }, { "epoch": 4.93011617515639, "grad_norm": 0.027069436386227608, "learning_rate": 5.5077769163985884e-08, "loss": 0.0024, "step": 6896 }, { "epoch": 4.930831099195711, "grad_norm": 0.029291998594999313, "learning_rate": 5.3912344591589e-08, "loss": 0.0032, "step": 6897 }, { "epoch": 4.931546023235032, "grad_norm": 0.029365582391619682, "learning_rate": 5.275937553854138e-08, "loss": 0.0038, "step": 6898 }, { "epoch": 4.932260947274352, "grad_norm": 0.02605256251990795, "learning_rate": 5.1618862292385215e-08, "loss": 0.0039, "step": 6899 }, { "epoch": 4.932975871313673, "grad_norm": 0.027855854481458664, "learning_rate": 5.049080513752635e-08, "loss": 0.0036, "step": 6900 }, { "epoch": 4.932975871313673, "eval_loss": 0.007097162771970034, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6900 }, { "epoch": 4.933690795352994, "grad_norm": 0.028165288269519806, "learning_rate": 4.937520435528975e-08, "loss": 0.0042, "step": 6901 }, { "epoch": 4.9344057193923145, "grad_norm": 0.029743719846010208, "learning_rate": 4.8272060223880644e-08, "loss": 0.0033, "step": 6902 }, { "epoch": 4.9351206434316355, "grad_norm": 0.023088036105036736, "learning_rate": 4.718137301839565e-08, "loss": 0.0017, "step": 6903 }, { "epoch": 4.935835567470956, "grad_norm": 0.022970370948314667, "learning_rate": 4.610314301083385e-08, "loss": 0.0022, "step": 6904 }, { "epoch": 4.936550491510277, "grad_norm": 0.03767351806163788, "learning_rate": 4.503737047008572e-08, "loss": 0.0035, "step": 6905 }, { "epoch": 4.936550491510277, "eval_loss": 0.007099395617842674, "eval_runtime": 4.5764, "eval_samples_per_second": 10.926, "eval_steps_per_second": 2.841, "step": 6905 }, { "epoch": 4.937265415549598, "grad_norm": 0.026601923629641533, "learning_rate": 4.3984055661927535e-08, "loss": 0.0037, "step": 6906 }, { "epoch": 4.937980339588918, "grad_norm": 0.02925628237426281, "learning_rate": 4.294319884903253e-08, "loss": 0.0018, "step": 6907 }, { "epoch": 4.938695263628239, "grad_norm": 0.026591870933771133, "learning_rate": 4.191480029097639e-08, "loss": 0.0038, "step": 6908 }, { "epoch": 4.93941018766756, "grad_norm": 
0.025339486077427864, "learning_rate": 4.089886024421507e-08, "loss": 0.0033, "step": 6909 }, { "epoch": 4.940125111706881, "grad_norm": 0.026112472638487816, "learning_rate": 3.989537896210704e-08, "loss": 0.0032, "step": 6910 }, { "epoch": 4.940125111706881, "eval_loss": 0.00709906779229641, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6910 }, { "epoch": 4.940840035746202, "grad_norm": 0.028559379279613495, "learning_rate": 3.890435669489656e-08, "loss": 0.0028, "step": 6911 }, { "epoch": 4.941554959785523, "grad_norm": 0.028018604964017868, "learning_rate": 3.792579368972482e-08, "loss": 0.0023, "step": 6912 }, { "epoch": 4.942269883824844, "grad_norm": 0.024188095703721046, "learning_rate": 3.6959690190618844e-08, "loss": 0.0019, "step": 6913 }, { "epoch": 4.942984807864164, "grad_norm": 0.029612544924020767, "learning_rate": 3.6006046438519233e-08, "loss": 0.0039, "step": 6914 }, { "epoch": 4.943699731903485, "grad_norm": 0.027709292247891426, "learning_rate": 3.506486267123021e-08, "loss": 0.0038, "step": 6915 }, { "epoch": 4.943699731903485, "eval_loss": 0.007089150603860617, "eval_runtime": 4.5828, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6915 }, { "epoch": 4.944414655942806, "grad_norm": 0.028348198160529137, "learning_rate": 3.413613912347513e-08, "loss": 0.0035, "step": 6916 }, { "epoch": 4.945129579982127, "grad_norm": 0.023531721904873848, "learning_rate": 3.321987602685206e-08, "loss": 0.0025, "step": 6917 }, { "epoch": 4.945844504021448, "grad_norm": 0.028659110888838768, "learning_rate": 3.2316073609856e-08, "loss": 0.0032, "step": 6918 }, { "epoch": 4.946559428060769, "grad_norm": 0.021325990557670593, "learning_rate": 3.142473209788444e-08, "loss": 0.0022, "step": 6919 }, { "epoch": 4.94727435210009, "grad_norm": 0.027343060821294785, "learning_rate": 3.054585171321511e-08, "loss": 0.0043, "step": 6920 }, { "epoch": 4.94727435210009, "eval_loss": 0.0070894681848585606, "eval_runtime": 4.5882, "eval_samples_per_second": 10.897, "eval_steps_per_second": 2.833, "step": 6920 }, { "epoch": 4.94798927613941, "grad_norm": 0.023016566410660744, "learning_rate": 2.967943267501716e-08, "loss": 0.0021, "step": 6921 }, { "epoch": 4.948704200178731, "grad_norm": 0.035754863172769547, "learning_rate": 2.8825475199367734e-08, "loss": 0.0041, "step": 6922 }, { "epoch": 4.949419124218052, "grad_norm": 0.027060922235250473, "learning_rate": 2.7983979499218714e-08, "loss": 0.0022, "step": 6923 }, { "epoch": 4.950134048257373, "grad_norm": 0.035346198827028275, "learning_rate": 2.7154945784424457e-08, "loss": 0.0074, "step": 6924 }, { "epoch": 4.9508489722966935, "grad_norm": 0.028971204534173012, "learning_rate": 2.6338374261730693e-08, "loss": 0.0036, "step": 6925 }, { "epoch": 4.9508489722966935, "eval_loss": 0.0071130734868347645, "eval_runtime": 4.5836, "eval_samples_per_second": 10.908, "eval_steps_per_second": 2.836, "step": 6925 }, { "epoch": 4.9515638963360145, "grad_norm": 0.023855721578001976, "learning_rate": 2.5534265134768974e-08, "loss": 0.0022, "step": 6926 }, { "epoch": 4.952278820375335, "grad_norm": 0.02922724187374115, "learning_rate": 2.4742618604067792e-08, "loss": 0.0051, "step": 6927 }, { "epoch": 4.9529937444146555, "grad_norm": 0.028909360989928246, "learning_rate": 2.3963434867052548e-08, "loss": 0.0033, "step": 6928 }, { "epoch": 4.953708668453976, "grad_norm": 0.02382749505341053, "learning_rate": 2.3196714118028927e-08, "loss": 0.0022, "step": 6929 }, { "epoch": 
4.954423592493297, "grad_norm": 0.02324669435620308, "learning_rate": 2.2442456548205093e-08, "loss": 0.0022, "step": 6930 }, { "epoch": 4.954423592493297, "eval_loss": 0.007095497567206621, "eval_runtime": 4.5811, "eval_samples_per_second": 10.914, "eval_steps_per_second": 2.838, "step": 6930 }, { "epoch": 4.955138516532618, "grad_norm": 0.02864489145576954, "learning_rate": 2.170066234568058e-08, "loss": 0.0028, "step": 6931 }, { "epoch": 4.955853440571939, "grad_norm": 0.03499123454093933, "learning_rate": 2.0971331695435192e-08, "loss": 0.0072, "step": 6932 }, { "epoch": 4.95656836461126, "grad_norm": 0.028479240834712982, "learning_rate": 2.0254464779356775e-08, "loss": 0.0038, "step": 6933 }, { "epoch": 4.957283288650581, "grad_norm": 0.033204637467861176, "learning_rate": 1.9550061776213435e-08, "loss": 0.0063, "step": 6934 }, { "epoch": 4.957998212689902, "grad_norm": 0.029745787382125854, "learning_rate": 1.8858122861664662e-08, "loss": 0.0022, "step": 6935 }, { "epoch": 4.957998212689902, "eval_loss": 0.0071086459793150425, "eval_runtime": 4.5857, "eval_samples_per_second": 10.904, "eval_steps_per_second": 2.835, "step": 6935 }, { "epoch": 4.958713136729223, "grad_norm": 0.023237310349941254, "learning_rate": 1.817864820827242e-08, "loss": 0.0019, "step": 6936 }, { "epoch": 4.959428060768543, "grad_norm": 0.03075658157467842, "learning_rate": 1.7511637985478946e-08, "loss": 0.0036, "step": 6937 }, { "epoch": 4.960142984807864, "grad_norm": 0.027601070702075958, "learning_rate": 1.6857092359628957e-08, "loss": 0.0035, "step": 6938 }, { "epoch": 4.960857908847185, "grad_norm": 0.02997698076069355, "learning_rate": 1.6215011493947442e-08, "loss": 0.0033, "step": 6939 }, { "epoch": 4.961572832886506, "grad_norm": 0.027881095185875893, "learning_rate": 1.5585395548556315e-08, "loss": 0.0022, "step": 6940 }, { "epoch": 4.961572832886506, "eval_loss": 0.007086644880473614, "eval_runtime": 4.6072, "eval_samples_per_second": 10.852, "eval_steps_per_second": 2.822, "step": 6940 }, { "epoch": 4.962287756925827, "grad_norm": 0.026538528501987457, "learning_rate": 1.496824468046887e-08, "loss": 0.0023, "step": 6941 }, { "epoch": 4.963002680965148, "grad_norm": 0.023035269230604172, "learning_rate": 1.436355904358977e-08, "loss": 0.0016, "step": 6942 }, { "epoch": 4.963717605004469, "grad_norm": 0.025692889466881752, "learning_rate": 1.377133878871506e-08, "loss": 0.0036, "step": 6943 }, { "epoch": 4.964432529043789, "grad_norm": 0.022707730531692505, "learning_rate": 1.319158406353771e-08, "loss": 0.0028, "step": 6944 }, { "epoch": 4.96514745308311, "grad_norm": 0.029130222275853157, "learning_rate": 1.2624295012625409e-08, "loss": 0.0035, "step": 6945 }, { "epoch": 4.96514745308311, "eval_loss": 0.00711869029328227, "eval_runtime": 4.5904, "eval_samples_per_second": 10.892, "eval_steps_per_second": 2.832, "step": 6945 }, { "epoch": 4.965862377122431, "grad_norm": 0.03428163751959801, "learning_rate": 1.206947177745943e-08, "loss": 0.0065, "step": 6946 }, { "epoch": 4.966577301161752, "grad_norm": 0.026639772579073906, "learning_rate": 1.1527114496395764e-08, "loss": 0.0039, "step": 6947 }, { "epoch": 4.9672922252010725, "grad_norm": 0.026329418644309044, "learning_rate": 1.0997223304687332e-08, "loss": 0.0037, "step": 6948 }, { "epoch": 4.968007149240393, "grad_norm": 0.027522867545485497, "learning_rate": 1.047979833447843e-08, "loss": 0.0035, "step": 6949 }, { "epoch": 4.968722073279714, "grad_norm": 0.029827246442437172, "learning_rate": 9.974839714799178e-09, "loss": 0.0035, "step": 6950 
}, { "epoch": 4.968722073279714, "eval_loss": 0.007095913402736187, "eval_runtime": 4.5822, "eval_samples_per_second": 10.912, "eval_steps_per_second": 2.837, "step": 6950 }, { "epoch": 4.969436997319034, "grad_norm": 0.021008918061852455, "learning_rate": 9.482347571587724e-09, "loss": 0.002, "step": 6951 }, { "epoch": 4.970151921358355, "grad_norm": 0.028417320922017097, "learning_rate": 9.002322027651389e-09, "loss": 0.0027, "step": 6952 }, { "epoch": 4.970866845397676, "grad_norm": 0.02401852421462536, "learning_rate": 8.53476320269997e-09, "loss": 0.0019, "step": 6953 }, { "epoch": 4.971581769436997, "grad_norm": 0.02519780397415161, "learning_rate": 8.079671213334639e-09, "loss": 0.0025, "step": 6954 }, { "epoch": 4.972296693476318, "grad_norm": 0.028194215148687363, "learning_rate": 7.637046173047947e-09, "loss": 0.0046, "step": 6955 }, { "epoch": 4.972296693476318, "eval_loss": 0.007106785196810961, "eval_runtime": 4.5842, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6955 }, { "epoch": 4.973011617515639, "grad_norm": 0.029546743258833885, "learning_rate": 7.206888192218264e-09, "loss": 0.0031, "step": 6956 }, { "epoch": 4.97372654155496, "grad_norm": 0.03474979102611542, "learning_rate": 6.789197378115342e-09, "loss": 0.0049, "step": 6957 }, { "epoch": 4.97444146559428, "grad_norm": 0.025365298613905907, "learning_rate": 6.383973834911405e-09, "loss": 0.0022, "step": 6958 }, { "epoch": 4.975156389633601, "grad_norm": 0.03561582416296005, "learning_rate": 5.991217663653403e-09, "loss": 0.0066, "step": 6959 }, { "epoch": 4.975871313672922, "grad_norm": 0.024500034749507904, "learning_rate": 5.610928962290763e-09, "loss": 0.0025, "step": 6960 }, { "epoch": 4.975871313672922, "eval_loss": 0.0071172467432916164, "eval_runtime": 4.5827, "eval_samples_per_second": 10.911, "eval_steps_per_second": 2.837, "step": 6960 }, { "epoch": 4.976586237712243, "grad_norm": 0.02404576539993286, "learning_rate": 5.243107825653182e-09, "loss": 0.003, "step": 6961 }, { "epoch": 4.977301161751564, "grad_norm": 0.03470585495233536, "learning_rate": 4.88775434547839e-09, "loss": 0.0063, "step": 6962 }, { "epoch": 4.978016085790885, "grad_norm": 0.02028028666973114, "learning_rate": 4.544868610373287e-09, "loss": 0.0019, "step": 6963 }, { "epoch": 4.978731009830206, "grad_norm": 0.031516559422016144, "learning_rate": 4.214450705852802e-09, "loss": 0.0025, "step": 6964 }, { "epoch": 4.979445933869527, "grad_norm": 0.03490598499774933, "learning_rate": 3.8965007143176855e-09, "loss": 0.0038, "step": 6965 }, { "epoch": 4.979445933869527, "eval_loss": 0.0070967902429401875, "eval_runtime": 4.5807, "eval_samples_per_second": 10.915, "eval_steps_per_second": 2.838, "step": 6965 }, { "epoch": 4.980160857908848, "grad_norm": 0.023972950875759125, "learning_rate": 3.591018715054517e-09, "loss": 0.003, "step": 6966 }, { "epoch": 4.980875781948168, "grad_norm": 0.029931675642728806, "learning_rate": 3.298004784241249e-09, "loss": 0.0027, "step": 6967 }, { "epoch": 4.981590705987489, "grad_norm": 0.02358938194811344, "learning_rate": 3.01745899495276e-09, "loss": 0.0022, "step": 6968 }, { "epoch": 4.98230563002681, "grad_norm": 0.0328964926302433, "learning_rate": 2.7493814171553056e-09, "loss": 0.0052, "step": 6969 }, { "epoch": 4.9830205540661305, "grad_norm": 0.02654399164021015, "learning_rate": 2.493772117695414e-09, "loss": 0.0031, "step": 6970 }, { "epoch": 4.9830205540661305, "eval_loss": 0.007083152420818806, "eval_runtime": 4.5853, "eval_samples_per_second": 10.904, 
"eval_steps_per_second": 2.835, "step": 6970 }, { "epoch": 4.9837354781054515, "grad_norm": 0.03446346893906593, "learning_rate": 2.250631160316541e-09, "loss": 0.0048, "step": 6971 }, { "epoch": 4.984450402144772, "grad_norm": 0.01944124698638916, "learning_rate": 2.019958605659067e-09, "loss": 0.0022, "step": 6972 }, { "epoch": 4.985165326184093, "grad_norm": 0.06779291480779648, "learning_rate": 1.8017545112491984e-09, "loss": 0.0016, "step": 6973 }, { "epoch": 4.985880250223413, "grad_norm": 0.02826177515089512, "learning_rate": 1.5960189314934149e-09, "loss": 0.0031, "step": 6974 }, { "epoch": 4.986595174262734, "grad_norm": 0.027076642960309982, "learning_rate": 1.4027519177006732e-09, "loss": 0.0036, "step": 6975 }, { "epoch": 4.986595174262734, "eval_loss": 0.007081215735524893, "eval_runtime": 4.6047, "eval_samples_per_second": 10.859, "eval_steps_per_second": 2.823, "step": 6975 }, { "epoch": 4.987310098302055, "grad_norm": 0.029124077409505844, "learning_rate": 1.221953518071306e-09, "loss": 0.0036, "step": 6976 }, { "epoch": 4.988025022341376, "grad_norm": 0.018980959430336952, "learning_rate": 1.0536237776970214e-09, "loss": 0.0029, "step": 6977 }, { "epoch": 4.988739946380697, "grad_norm": 0.03630708530545235, "learning_rate": 8.977627385442499e-10, "loss": 0.0038, "step": 6978 }, { "epoch": 4.989454870420018, "grad_norm": 0.02806711010634899, "learning_rate": 7.543704394874507e-10, "loss": 0.0029, "step": 6979 }, { "epoch": 4.990169794459339, "grad_norm": 0.03278542309999466, "learning_rate": 6.234469162813561e-10, "loss": 0.0051, "step": 6980 }, { "epoch": 4.990169794459339, "eval_loss": 0.007081491872668266, "eval_runtime": 4.5829, "eval_samples_per_second": 10.91, "eval_steps_per_second": 2.837, "step": 6980 }, { "epoch": 4.990884718498659, "grad_norm": 0.02965676225721836, "learning_rate": 5.049922015887277e-10, "loss": 0.003, "step": 6981 }, { "epoch": 4.99159964253798, "grad_norm": 0.025937175378203392, "learning_rate": 3.9900632493594656e-10, "loss": 0.0023, "step": 6982 }, { "epoch": 4.992314566577301, "grad_norm": 0.02144385129213333, "learning_rate": 3.054893127574232e-10, "loss": 0.0019, "step": 6983 }, { "epoch": 4.993029490616622, "grad_norm": 0.023220963776111603, "learning_rate": 2.244411883733921e-10, "loss": 0.0022, "step": 6984 }, { "epoch": 4.993744414655943, "grad_norm": 0.023863382637500763, "learning_rate": 1.558619720010146e-10, "loss": 0.0025, "step": 6985 }, { "epoch": 4.993744414655943, "eval_loss": 0.007108084391802549, "eval_runtime": 4.587, "eval_samples_per_second": 10.9, "eval_steps_per_second": 2.834, "step": 6985 }, { "epoch": 4.994459338695264, "grad_norm": 0.03068946860730648, "learning_rate": 9.975168073772522e-11, "loss": 0.0019, "step": 6986 }, { "epoch": 4.995174262734585, "grad_norm": 0.030799949541687965, "learning_rate": 5.611032857788523e-11, "loss": 0.003, "step": 6987 }, { "epoch": 4.995889186773905, "grad_norm": 0.027426043525338173, "learning_rate": 2.4937926401680245e-11, "loss": 0.003, "step": 6988 }, { "epoch": 4.996604110813226, "grad_norm": 0.02836833894252777, "learning_rate": 6.234481991773677e-12, "loss": 0.0037, "step": 6989 }, { "epoch": 4.997319034852547, "grad_norm": 0.02619370073080063, "learning_rate": 0.0, "loss": 0.0031, "step": 6990 }, { "epoch": 4.997319034852547, "eval_loss": 0.007108742371201515, "eval_runtime": 4.5841, "eval_samples_per_second": 10.907, "eval_steps_per_second": 2.836, "step": 6990 }, { "epoch": 4.997319034852547, "step": 6990, "total_flos": 9.055925074306204e+18, "train_loss": 
0.012000293021579306, "train_runtime": 73664.6386, "train_samples_per_second": 3.038, "train_steps_per_second": 0.095 } ], "logging_steps": 1, "max_steps": 6990, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.055925074306204e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }
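
The state above is raw log data rather than explanatory text, so the only clarification that seems useful is a short sketch of how such a file can be consumed. Below is a minimal, hedged Python example that loads a Trainer state file and summarizes the logged metrics; the filename "trainer_state.json" is an assumption for illustration, and the "log_history" / "eval_loss" / "loss" keys are the ones visible in the log records above.

```python
# Minimal sketch (assumption: the JSON above is saved as "trainer_state.json").
# It loads the state file and summarizes the train/eval losses recorded in
# "log_history"; nothing here is part of the original training run.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Records carrying "eval_loss" are evaluation logs; records carrying "loss"
# (and no "eval_loss") are per-step training logs. The very last record is the
# end-of-training summary with "train_loss", "train_runtime", etc.
eval_records = [r for r in history if "eval_loss" in r]
train_records = [r for r in history if "loss" in r and "eval_loss" not in r]

best = min(eval_records, key=lambda r: r["eval_loss"])
print(f"training steps logged: {len(train_records)}")
print(f"evaluations logged: {len(eval_records)}")
print(f"lowest eval_loss: {best['eval_loss']:.6f} at step {best['step']}")
print(f"final averaged train_loss: {history[-1].get('train_loss')}")
print(f"total steps: {state['max_steps']}, epochs: {state['num_train_epochs']}")
```

Under these assumptions the script would report the minimum eval_loss over all logged evaluations (around 7.0e-3 near step 5450 in this run) alongside the final averaged training loss from the summary record.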