{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 50, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 9.064603805541992, "learning_rate": 1.0000000000000002e-06, "loss": 1.735, "step": 50 }, { "epoch": 0.05, "eval_loss": 0.9427366256713867, "eval_runtime": 1.9961, "eval_samples_per_second": 78.151, "eval_steps_per_second": 4.008, "step": 50 }, { "epoch": 0.1, "grad_norm": 8.72606372833252, "learning_rate": 2.0000000000000003e-06, "loss": 0.7505, "step": 100 }, { "epoch": 0.1, "eval_loss": 0.758994460105896, "eval_runtime": 2.0068, "eval_samples_per_second": 77.736, "eval_steps_per_second": 3.986, "step": 100 }, { "epoch": 0.15, "grad_norm": 9.077518463134766, "learning_rate": 3e-06, "loss": 0.6926, "step": 150 }, { "epoch": 0.15, "eval_loss": 0.6940224766731262, "eval_runtime": 1.9878, "eval_samples_per_second": 78.477, "eval_steps_per_second": 4.024, "step": 150 }, { "epoch": 0.2, "grad_norm": 8.854167938232422, "learning_rate": 4.000000000000001e-06, "loss": 0.6325, "step": 200 }, { "epoch": 0.2, "eval_loss": 0.6877058148384094, "eval_runtime": 1.9646, "eval_samples_per_second": 79.406, "eval_steps_per_second": 4.072, "step": 200 }, { "epoch": 0.25, "grad_norm": 5.465810298919678, "learning_rate": 5e-06, "loss": 0.6475, "step": 250 }, { "epoch": 0.25, "eval_loss": 0.679232120513916, "eval_runtime": 1.9834, "eval_samples_per_second": 78.653, "eval_steps_per_second": 4.033, "step": 250 }, { "epoch": 0.3, "grad_norm": 5.30919075012207, "learning_rate": 6e-06, "loss": 0.6392, "step": 300 }, { "epoch": 0.3, "eval_loss": 0.6842172145843506, "eval_runtime": 1.9525, "eval_samples_per_second": 79.896, "eval_steps_per_second": 4.097, "step": 300 }, { "epoch": 0.35, "grad_norm": 4.993810176849365, "learning_rate": 7e-06, "loss": 0.6626, "step": 350 }, { "epoch": 0.35, "eval_loss": 0.685288667678833, "eval_runtime": 1.9494, "eval_samples_per_second": 80.026, "eval_steps_per_second": 4.104, "step": 350 }, { "epoch": 0.4, "grad_norm": 3.977246046066284, "learning_rate": 8.000000000000001e-06, "loss": 0.6438, "step": 400 }, { "epoch": 0.4, "eval_loss": 0.6853578686714172, "eval_runtime": 1.9636, "eval_samples_per_second": 79.447, "eval_steps_per_second": 4.074, "step": 400 }, { "epoch": 0.45, "grad_norm": 2.963512659072876, "learning_rate": 9e-06, "loss": 0.6257, "step": 450 }, { "epoch": 0.45, "eval_loss": 0.6963115334510803, "eval_runtime": 1.9625, "eval_samples_per_second": 79.489, "eval_steps_per_second": 4.076, "step": 450 }, { "epoch": 0.5, "grad_norm": 3.4475905895233154, "learning_rate": 1e-05, "loss": 0.655, "step": 500 }, { "epoch": 0.5, "eval_loss": 0.7059319615364075, "eval_runtime": 1.9549, "eval_samples_per_second": 79.798, "eval_steps_per_second": 4.092, "step": 500 }, { "epoch": 0.55, "grad_norm": 3.927042245864868, "learning_rate": 1.1000000000000001e-05, "loss": 0.6604, "step": 550 }, { "epoch": 0.55, "eval_loss": 0.70585036277771, "eval_runtime": 1.9622, "eval_samples_per_second": 79.503, "eval_steps_per_second": 4.077, "step": 550 }, { "epoch": 0.6, "grad_norm": 3.3213369846343994, "learning_rate": 1.2e-05, "loss": 0.6906, "step": 600 }, { "epoch": 0.6, "eval_loss": 0.7131460905075073, "eval_runtime": 1.9604, "eval_samples_per_second": 79.577, "eval_steps_per_second": 4.081, "step": 600 }, { "epoch": 0.65, "grad_norm": 3.0374979972839355, "learning_rate": 1.3000000000000001e-05, "loss": 0.6968, "step": 650 }, { "epoch": 0.65, "eval_loss": 0.7101390957832336, "eval_runtime": 1.9553, "eval_samples_per_second": 79.784, "eval_steps_per_second": 4.091, "step": 650 }, { "epoch": 0.7, "grad_norm": 3.219902276992798, "learning_rate": 1.4e-05, "loss": 0.6704, "step": 700 }, { "epoch": 0.7, "eval_loss": 0.7224947810173035, "eval_runtime": 1.9875, "eval_samples_per_second": 78.491, "eval_steps_per_second": 4.025, "step": 700 }, { "epoch": 0.75, "grad_norm": 3.4985125064849854, "learning_rate": 1.5000000000000002e-05, "loss": 0.6637, "step": 750 }, { "epoch": 0.75, "eval_loss": 0.7254340052604675, "eval_runtime": 1.9941, "eval_samples_per_second": 78.232, "eval_steps_per_second": 4.012, "step": 750 }, { "epoch": 0.8, "grad_norm": 3.596057653427124, "learning_rate": 1.6000000000000003e-05, "loss": 0.6954, "step": 800 }, { "epoch": 0.8, "eval_loss": 0.7311561703681946, "eval_runtime": 1.9553, "eval_samples_per_second": 79.784, "eval_steps_per_second": 4.091, "step": 800 }, { "epoch": 0.85, "grad_norm": 3.449831962585449, "learning_rate": 1.7e-05, "loss": 0.697, "step": 850 }, { "epoch": 0.85, "eval_loss": 0.7341279983520508, "eval_runtime": 1.9555, "eval_samples_per_second": 79.773, "eval_steps_per_second": 4.091, "step": 850 }, { "epoch": 0.9, "grad_norm": 2.8547210693359375, "learning_rate": 1.8e-05, "loss": 0.7077, "step": 900 }, { "epoch": 0.9, "eval_loss": 0.7329990863800049, "eval_runtime": 1.9739, "eval_samples_per_second": 79.033, "eval_steps_per_second": 4.053, "step": 900 }, { "epoch": 0.95, "grad_norm": 3.30784010887146, "learning_rate": 1.9e-05, "loss": 0.7119, "step": 950 }, { "epoch": 0.95, "eval_loss": 0.7406164407730103, "eval_runtime": 1.953, "eval_samples_per_second": 79.877, "eval_steps_per_second": 4.096, "step": 950 }, { "epoch": 1.0, "grad_norm": 2.941829204559326, "learning_rate": 2e-05, "loss": 0.7267, "step": 1000 }, { "epoch": 1.0, "eval_loss": 0.745068371295929, "eval_runtime": 1.9569, "eval_samples_per_second": 79.718, "eval_steps_per_second": 4.088, "step": 1000 }, { "epoch": 1.05, "grad_norm": 2.627633810043335, "learning_rate": 1.9998476951563914e-05, "loss": 0.4425, "step": 1050 }, { "epoch": 1.05, "eval_loss": 0.7665248513221741, "eval_runtime": 1.9561, "eval_samples_per_second": 79.752, "eval_steps_per_second": 4.09, "step": 1050 }, { "epoch": 1.1, "grad_norm": 2.317258596420288, "learning_rate": 1.999390827019096e-05, "loss": 0.4223, "step": 1100 }, { "epoch": 1.1, "eval_loss": 0.776759922504425, "eval_runtime": 1.9592, "eval_samples_per_second": 79.625, "eval_steps_per_second": 4.083, "step": 1100 }, { "epoch": 1.15, "grad_norm": 2.567178964614868, "learning_rate": 1.9986295347545738e-05, "loss": 0.4482, "step": 1150 }, { "epoch": 1.15, "eval_loss": 0.7822524905204773, "eval_runtime": 1.9683, "eval_samples_per_second": 79.254, "eval_steps_per_second": 4.064, "step": 1150 }, { "epoch": 1.2, "grad_norm": 2.5893921852111816, "learning_rate": 1.9975640502598243e-05, "loss": 0.4239, "step": 1200 }, { "epoch": 1.2, "eval_loss": 0.7750845551490784, "eval_runtime": 1.9554, "eval_samples_per_second": 79.78, "eval_steps_per_second": 4.091, "step": 1200 }, { "epoch": 1.25, "grad_norm": 2.4784305095672607, "learning_rate": 1.9961946980917457e-05, "loss": 0.4317, "step": 1250 }, { "epoch": 1.25, "eval_loss": 0.7848005294799805, "eval_runtime": 1.9866, "eval_samples_per_second": 78.527, "eval_steps_per_second": 4.027, "step": 1250 }, { "epoch": 1.3, "grad_norm": 2.32456636428833, "learning_rate": 1.9945218953682736e-05, "loss": 0.4417, "step": 1300 }, { "epoch": 1.3, "eval_loss": 0.7774109244346619, "eval_runtime": 1.9508, "eval_samples_per_second": 79.967, "eval_steps_per_second": 4.101, "step": 1300 }, { "epoch": 1.35, "grad_norm": 3.0050048828125, "learning_rate": 1.9925461516413224e-05, "loss": 0.4602, "step": 1350 }, { "epoch": 1.35, "eval_loss": 0.7812933921813965, "eval_runtime": 1.9842, "eval_samples_per_second": 78.621, "eval_steps_per_second": 4.032, "step": 1350 }, { "epoch": 1.4, "grad_norm": 3.0922508239746094, "learning_rate": 1.9902680687415704e-05, "loss": 0.442, "step": 1400 }, { "epoch": 1.4, "eval_loss": 0.7885680794715881, "eval_runtime": 1.98, "eval_samples_per_second": 78.788, "eval_steps_per_second": 4.04, "step": 1400 }, { "epoch": 1.45, "grad_norm": 2.266848087310791, "learning_rate": 1.9876883405951378e-05, "loss": 0.457, "step": 1450 }, { "epoch": 1.45, "eval_loss": 0.7839911580085754, "eval_runtime": 1.9792, "eval_samples_per_second": 78.819, "eval_steps_per_second": 4.042, "step": 1450 }, { "epoch": 1.5, "grad_norm": 2.5083045959472656, "learning_rate": 1.9848077530122083e-05, "loss": 0.4472, "step": 1500 }, { "epoch": 1.5, "eval_loss": 0.7787847518920898, "eval_runtime": 1.9643, "eval_samples_per_second": 79.418, "eval_steps_per_second": 4.073, "step": 1500 }, { "epoch": 1.55, "grad_norm": 1.6221176385879517, "learning_rate": 1.9816271834476642e-05, "loss": 0.4595, "step": 1550 }, { "epoch": 1.55, "eval_loss": 0.7851470112800598, "eval_runtime": 1.9558, "eval_samples_per_second": 79.764, "eval_steps_per_second": 4.09, "step": 1550 }, { "epoch": 1.6, "grad_norm": 2.7922744750976562, "learning_rate": 1.9781476007338058e-05, "loss": 0.4611, "step": 1600 }, { "epoch": 1.6, "eval_loss": 0.7840977907180786, "eval_runtime": 1.9814, "eval_samples_per_second": 78.732, "eval_steps_per_second": 4.038, "step": 1600 }, { "epoch": 1.65, "grad_norm": 2.0918426513671875, "learning_rate": 1.9743700647852356e-05, "loss": 0.4727, "step": 1650 }, { "epoch": 1.65, "eval_loss": 0.7842524647712708, "eval_runtime": 1.9794, "eval_samples_per_second": 78.81, "eval_steps_per_second": 4.042, "step": 1650 }, { "epoch": 1.7, "grad_norm": 3.00199031829834, "learning_rate": 1.9702957262759964e-05, "loss": 0.4963, "step": 1700 }, { "epoch": 1.7, "eval_loss": 0.7786985039710999, "eval_runtime": 1.9547, "eval_samples_per_second": 79.807, "eval_steps_per_second": 4.093, "step": 1700 }, { "epoch": 1.75, "grad_norm": 3.255531072616577, "learning_rate": 1.9659258262890683e-05, "loss": 0.4718, "step": 1750 }, { "epoch": 1.75, "eval_loss": 0.7729825973510742, "eval_runtime": 1.9499, "eval_samples_per_second": 80.002, "eval_steps_per_second": 4.103, "step": 1750 }, { "epoch": 1.8, "grad_norm": 3.8440444469451904, "learning_rate": 1.961261695938319e-05, "loss": 0.458, "step": 1800 }, { "epoch": 1.8, "eval_loss": 0.7783200144767761, "eval_runtime": 1.9521, "eval_samples_per_second": 79.913, "eval_steps_per_second": 4.098, "step": 1800 }, { "epoch": 1.85, "grad_norm": 2.689103364944458, "learning_rate": 1.9563047559630356e-05, "loss": 0.4705, "step": 1850 }, { "epoch": 1.85, "eval_loss": 0.7806310653686523, "eval_runtime": 1.9952, "eval_samples_per_second": 78.188, "eval_steps_per_second": 4.01, "step": 1850 }, { "epoch": 1.9, "grad_norm": 2.5303993225097656, "learning_rate": 1.9510565162951538e-05, "loss": 0.4883, "step": 1900 }, { "epoch": 1.9, "eval_loss": 0.7753411531448364, "eval_runtime": 1.9583, "eval_samples_per_second": 79.66, "eval_steps_per_second": 4.085, "step": 1900 }, { "epoch": 1.95, "grad_norm": 2.224400281906128, "learning_rate": 1.945518575599317e-05, "loss": 0.4582, "step": 1950 }, { "epoch": 1.95, "eval_loss": 0.7831416726112366, "eval_runtime": 1.9552, "eval_samples_per_second": 79.787, "eval_steps_per_second": 4.092, "step": 1950 }, { "epoch": 2.0, "grad_norm": 2.6229469776153564, "learning_rate": 1.9396926207859085e-05, "loss": 0.4655, "step": 2000 }, { "epoch": 2.0, "eval_loss": 0.7799313068389893, "eval_runtime": 1.9774, "eval_samples_per_second": 78.893, "eval_steps_per_second": 4.046, "step": 2000 }, { "epoch": 2.05, "grad_norm": 1.596127986907959, "learning_rate": 1.9335804264972018e-05, "loss": 0.2482, "step": 2050 }, { "epoch": 2.05, "eval_loss": 0.8467806577682495, "eval_runtime": 1.9655, "eval_samples_per_second": 79.368, "eval_steps_per_second": 4.07, "step": 2050 }, { "epoch": 2.1, "grad_norm": 1.846142053604126, "learning_rate": 1.9271838545667876e-05, "loss": 0.2419, "step": 2100 }, { "epoch": 2.1, "eval_loss": 0.8468015789985657, "eval_runtime": 1.9946, "eval_samples_per_second": 78.212, "eval_steps_per_second": 4.011, "step": 2100 }, { "epoch": 2.15, "grad_norm": 1.7468857765197754, "learning_rate": 1.9205048534524405e-05, "loss": 0.2425, "step": 2150 }, { "epoch": 2.15, "eval_loss": 0.8560521006584167, "eval_runtime": 1.9854, "eval_samples_per_second": 78.572, "eval_steps_per_second": 4.029, "step": 2150 }, { "epoch": 2.2, "grad_norm": 1.469488263130188, "learning_rate": 1.913545457642601e-05, "loss": 0.2464, "step": 2200 }, { "epoch": 2.2, "eval_loss": 0.8571642637252808, "eval_runtime": 1.9629, "eval_samples_per_second": 79.474, "eval_steps_per_second": 4.076, "step": 2200 }, { "epoch": 2.25, "grad_norm": 1.8838245868682861, "learning_rate": 1.9063077870366504e-05, "loss": 0.2375, "step": 2250 }, { "epoch": 2.25, "eval_loss": 0.8749194145202637, "eval_runtime": 1.9507, "eval_samples_per_second": 79.973, "eval_steps_per_second": 4.101, "step": 2250 }, { "epoch": 2.3, "grad_norm": 1.6521848440170288, "learning_rate": 1.8987940462991673e-05, "loss": 0.2549, "step": 2300 }, { "epoch": 2.3, "eval_loss": 0.8745154142379761, "eval_runtime": 1.9946, "eval_samples_per_second": 78.211, "eval_steps_per_second": 4.011, "step": 2300 }, { "epoch": 2.35, "grad_norm": 2.443060874938965, "learning_rate": 1.891006524188368e-05, "loss": 0.2517, "step": 2350 }, { "epoch": 2.35, "eval_loss": 0.8473518490791321, "eval_runtime": 1.9854, "eval_samples_per_second": 78.574, "eval_steps_per_second": 4.029, "step": 2350 }, { "epoch": 2.4, "grad_norm": 2.0200119018554688, "learning_rate": 1.8829475928589272e-05, "loss": 0.2499, "step": 2400 }, { "epoch": 2.4, "eval_loss": 0.8664969205856323, "eval_runtime": 2.0066, "eval_samples_per_second": 77.743, "eval_steps_per_second": 3.987, "step": 2400 }, { "epoch": 2.45, "grad_norm": 1.9448421001434326, "learning_rate": 1.874619707139396e-05, "loss": 0.2579, "step": 2450 }, { "epoch": 2.45, "eval_loss": 0.8644601702690125, "eval_runtime": 1.9824, "eval_samples_per_second": 78.694, "eval_steps_per_second": 4.036, "step": 2450 }, { "epoch": 2.5, "grad_norm": 1.611676573753357, "learning_rate": 1.866025403784439e-05, "loss": 0.2637, "step": 2500 }, { "epoch": 2.5, "eval_loss": 0.8647167682647705, "eval_runtime": 1.9579, "eval_samples_per_second": 79.676, "eval_steps_per_second": 4.086, "step": 2500 }, { "epoch": 2.55, "grad_norm": 4.220365047454834, "learning_rate": 1.8571673007021124e-05, "loss": 0.2651, "step": 2550 }, { "epoch": 2.55, "eval_loss": 0.8691959977149963, "eval_runtime": 1.9886, "eval_samples_per_second": 78.445, "eval_steps_per_second": 4.023, "step": 2550 }, { "epoch": 2.6, "grad_norm": 1.417657732963562, "learning_rate": 1.848048096156426e-05, "loss": 0.2642, "step": 2600 }, { "epoch": 2.6, "eval_loss": 0.8810213804244995, "eval_runtime": 1.9541, "eval_samples_per_second": 79.831, "eval_steps_per_second": 4.094, "step": 2600 }, { "epoch": 2.65, "grad_norm": 1.611427664756775, "learning_rate": 1.8386705679454243e-05, "loss": 0.2678, "step": 2650 }, { "epoch": 2.65, "eval_loss": 0.8694055080413818, "eval_runtime": 1.9556, "eval_samples_per_second": 79.769, "eval_steps_per_second": 4.091, "step": 2650 }, { "epoch": 2.7, "grad_norm": 1.7564852237701416, "learning_rate": 1.8290375725550417e-05, "loss": 0.2664, "step": 2700 }, { "epoch": 2.7, "eval_loss": 0.8709028363227844, "eval_runtime": 1.9486, "eval_samples_per_second": 80.056, "eval_steps_per_second": 4.105, "step": 2700 }, { "epoch": 2.75, "grad_norm": 2.1291887760162354, "learning_rate": 1.819152044288992e-05, "loss": 0.2556, "step": 2750 }, { "epoch": 2.75, "eval_loss": 0.8776626586914062, "eval_runtime": 1.9861, "eval_samples_per_second": 78.546, "eval_steps_per_second": 4.028, "step": 2750 }, { "epoch": 2.8, "grad_norm": 2.107320785522461, "learning_rate": 1.8090169943749477e-05, "loss": 0.2657, "step": 2800 }, { "epoch": 2.8, "eval_loss": 0.8849774599075317, "eval_runtime": 1.954, "eval_samples_per_second": 79.834, "eval_steps_per_second": 4.094, "step": 2800 }, { "epoch": 2.85, "grad_norm": 1.935482144355774, "learning_rate": 1.798635510047293e-05, "loss": 0.2735, "step": 2850 }, { "epoch": 2.85, "eval_loss": 0.868646502494812, "eval_runtime": 1.9767, "eval_samples_per_second": 78.921, "eval_steps_per_second": 4.047, "step": 2850 }, { "epoch": 2.9, "grad_norm": 1.7236666679382324, "learning_rate": 1.788010753606722e-05, "loss": 0.2539, "step": 2900 }, { "epoch": 2.9, "eval_loss": 0.8697803020477295, "eval_runtime": 1.9531, "eval_samples_per_second": 79.872, "eval_steps_per_second": 4.096, "step": 2900 }, { "epoch": 2.95, "grad_norm": 2.3540799617767334, "learning_rate": 1.777145961456971e-05, "loss": 0.2661, "step": 2950 }, { "epoch": 2.95, "eval_loss": 0.8683090209960938, "eval_runtime": 1.9501, "eval_samples_per_second": 79.998, "eval_steps_per_second": 4.102, "step": 2950 }, { "epoch": 3.0, "grad_norm": 1.8433347940444946, "learning_rate": 1.766044443118978e-05, "loss": 0.2632, "step": 3000 }, { "epoch": 3.0, "eval_loss": 0.8736282587051392, "eval_runtime": 1.9898, "eval_samples_per_second": 78.401, "eval_steps_per_second": 4.021, "step": 3000 }, { "epoch": 3.05, "grad_norm": 1.7454739809036255, "learning_rate": 1.7547095802227723e-05, "loss": 0.1605, "step": 3050 }, { "epoch": 3.05, "eval_loss": 0.9220167398452759, "eval_runtime": 1.9546, "eval_samples_per_second": 79.811, "eval_steps_per_second": 4.093, "step": 3050 }, { "epoch": 3.1, "grad_norm": 1.8734194040298462, "learning_rate": 1.7431448254773943e-05, "loss": 0.1714, "step": 3100 }, { "epoch": 3.1, "eval_loss": 0.9173617362976074, "eval_runtime": 1.9816, "eval_samples_per_second": 78.726, "eval_steps_per_second": 4.037, "step": 3100 }, { "epoch": 3.15, "grad_norm": 1.4522944688796997, "learning_rate": 1.7313537016191706e-05, "loss": 0.1621, "step": 3150 }, { "epoch": 3.15, "eval_loss": 0.9355226159095764, "eval_runtime": 1.9542, "eval_samples_per_second": 79.83, "eval_steps_per_second": 4.094, "step": 3150 }, { "epoch": 3.2, "grad_norm": 2.084001064300537, "learning_rate": 1.7193398003386514e-05, "loss": 0.1639, "step": 3200 }, { "epoch": 3.2, "eval_loss": 0.9425297379493713, "eval_runtime": 1.9547, "eval_samples_per_second": 79.806, "eval_steps_per_second": 4.093, "step": 3200 }, { "epoch": 3.25, "grad_norm": 1.1925337314605713, "learning_rate": 1.7071067811865477e-05, "loss": 0.169, "step": 3250 }, { "epoch": 3.25, "eval_loss": 0.9308127164840698, "eval_runtime": 1.9563, "eval_samples_per_second": 79.743, "eval_steps_per_second": 4.089, "step": 3250 }, { "epoch": 3.3, "grad_norm": 1.694814682006836, "learning_rate": 1.6946583704589973e-05, "loss": 0.1759, "step": 3300 }, { "epoch": 3.3, "eval_loss": 0.9162003993988037, "eval_runtime": 1.9535, "eval_samples_per_second": 79.856, "eval_steps_per_second": 4.095, "step": 3300 }, { "epoch": 3.35, "grad_norm": 1.417009711265564, "learning_rate": 1.6819983600624986e-05, "loss": 0.1754, "step": 3350 }, { "epoch": 3.35, "eval_loss": 0.9497876167297363, "eval_runtime": 1.9576, "eval_samples_per_second": 79.688, "eval_steps_per_second": 4.087, "step": 3350 }, { "epoch": 3.4, "grad_norm": 1.1292682886123657, "learning_rate": 1.6691306063588583e-05, "loss": 0.1676, "step": 3400 }, { "epoch": 3.4, "eval_loss": 0.931199312210083, "eval_runtime": 1.9577, "eval_samples_per_second": 79.684, "eval_steps_per_second": 4.086, "step": 3400 }, { "epoch": 3.45, "grad_norm": 1.327938199043274, "learning_rate": 1.6560590289905074e-05, "loss": 0.1768, "step": 3450 }, { "epoch": 3.45, "eval_loss": 0.9207851886749268, "eval_runtime": 1.9844, "eval_samples_per_second": 78.613, "eval_steps_per_second": 4.031, "step": 3450 }, { "epoch": 3.5, "grad_norm": 1.5228685140609741, "learning_rate": 1.6427876096865394e-05, "loss": 0.1705, "step": 3500 }, { "epoch": 3.5, "eval_loss": 0.9314968585968018, "eval_runtime": 1.9503, "eval_samples_per_second": 79.987, "eval_steps_per_second": 4.102, "step": 3500 }, { "epoch": 3.55, "grad_norm": 1.0945371389389038, "learning_rate": 1.6293203910498375e-05, "loss": 0.1714, "step": 3550 }, { "epoch": 3.55, "eval_loss": 0.9226655960083008, "eval_runtime": 1.9806, "eval_samples_per_second": 78.762, "eval_steps_per_second": 4.039, "step": 3550 }, { "epoch": 3.6, "grad_norm": 2.246633291244507, "learning_rate": 1.6156614753256583e-05, "loss": 0.1728, "step": 3600 }, { "epoch": 3.6, "eval_loss": 0.9342146515846252, "eval_runtime": 1.9812, "eval_samples_per_second": 78.741, "eval_steps_per_second": 4.038, "step": 3600 }, { "epoch": 3.65, "grad_norm": 2.521110773086548, "learning_rate": 1.6018150231520486e-05, "loss": 0.1793, "step": 3650 }, { "epoch": 3.65, "eval_loss": 0.938757061958313, "eval_runtime": 1.9562, "eval_samples_per_second": 79.746, "eval_steps_per_second": 4.09, "step": 3650 }, { "epoch": 3.7, "grad_norm": 1.523911714553833, "learning_rate": 1.5877852522924733e-05, "loss": 0.1789, "step": 3700 }, { "epoch": 3.7, "eval_loss": 0.9147320985794067, "eval_runtime": 1.9805, "eval_samples_per_second": 78.77, "eval_steps_per_second": 4.039, "step": 3700 }, { "epoch": 3.75, "grad_norm": 1.7088408470153809, "learning_rate": 1.573576436351046e-05, "loss": 0.1735, "step": 3750 }, { "epoch": 3.75, "eval_loss": 0.9470258951187134, "eval_runtime": 1.9739, "eval_samples_per_second": 79.031, "eval_steps_per_second": 4.053, "step": 3750 }, { "epoch": 3.8, "grad_norm": 1.4315546751022339, "learning_rate": 1.5591929034707468e-05, "loss": 0.1769, "step": 3800 }, { "epoch": 3.8, "eval_loss": 0.9245337247848511, "eval_runtime": 1.9553, "eval_samples_per_second": 79.782, "eval_steps_per_second": 4.091, "step": 3800 }, { "epoch": 3.85, "grad_norm": 1.5162451267242432, "learning_rate": 1.5446390350150272e-05, "loss": 0.1757, "step": 3850 }, { "epoch": 3.85, "eval_loss": 0.9159222841262817, "eval_runtime": 1.9548, "eval_samples_per_second": 79.803, "eval_steps_per_second": 4.092, "step": 3850 }, { "epoch": 3.9, "grad_norm": 2.092200517654419, "learning_rate": 1.529919264233205e-05, "loss": 0.1763, "step": 3900 }, { "epoch": 3.9, "eval_loss": 0.9359647035598755, "eval_runtime": 1.9782, "eval_samples_per_second": 78.859, "eval_steps_per_second": 4.044, "step": 3900 }, { "epoch": 3.95, "grad_norm": 1.4863076210021973, "learning_rate": 1.5150380749100545e-05, "loss": 0.1745, "step": 3950 }, { "epoch": 3.95, "eval_loss": 0.9099722504615784, "eval_runtime": 1.9536, "eval_samples_per_second": 79.852, "eval_steps_per_second": 4.095, "step": 3950 }, { "epoch": 4.0, "grad_norm": 1.6606465578079224, "learning_rate": 1.5000000000000002e-05, "loss": 0.1788, "step": 4000 }, { "epoch": 4.0, "eval_loss": 0.9263266324996948, "eval_runtime": 1.9595, "eval_samples_per_second": 79.612, "eval_steps_per_second": 4.083, "step": 4000 }, { "epoch": 4.05, "grad_norm": 1.041926622390747, "learning_rate": 1.4848096202463373e-05, "loss": 0.1102, "step": 4050 }, { "epoch": 4.05, "eval_loss": 0.9752283096313477, "eval_runtime": 1.9966, "eval_samples_per_second": 78.134, "eval_steps_per_second": 4.007, "step": 4050 }, { "epoch": 4.1, "grad_norm": 0.8900114893913269, "learning_rate": 1.469471562785891e-05, "loss": 0.1078, "step": 4100 }, { "epoch": 4.1, "eval_loss": 0.9762280583381653, "eval_runtime": 1.9569, "eval_samples_per_second": 79.718, "eval_steps_per_second": 4.088, "step": 4100 }, { "epoch": 4.15, "grad_norm": 1.457000970840454, "learning_rate": 1.4539904997395468e-05, "loss": 0.1104, "step": 4150 }, { "epoch": 4.15, "eval_loss": 0.9672176837921143, "eval_runtime": 1.9891, "eval_samples_per_second": 78.426, "eval_steps_per_second": 4.022, "step": 4150 }, { "epoch": 4.2, "grad_norm": 1.6533278226852417, "learning_rate": 1.4383711467890776e-05, "loss": 0.1133, "step": 4200 }, { "epoch": 4.2, "eval_loss": 0.9861645698547363, "eval_runtime": 1.9784, "eval_samples_per_second": 78.851, "eval_steps_per_second": 4.044, "step": 4200 }, { "epoch": 4.25, "grad_norm": 1.1720765829086304, "learning_rate": 1.4226182617406996e-05, "loss": 0.1078, "step": 4250 }, { "epoch": 4.25, "eval_loss": 0.9741531610488892, "eval_runtime": 1.9529, "eval_samples_per_second": 79.883, "eval_steps_per_second": 4.097, "step": 4250 }, { "epoch": 4.3, "grad_norm": 1.391183853149414, "learning_rate": 1.4067366430758004e-05, "loss": 0.1108, "step": 4300 }, { "epoch": 4.3, "eval_loss": 0.9842219352722168, "eval_runtime": 1.9859, "eval_samples_per_second": 78.555, "eval_steps_per_second": 4.028, "step": 4300 }, { "epoch": 4.35, "grad_norm": 1.5558375120162964, "learning_rate": 1.3907311284892737e-05, "loss": 0.113, "step": 4350 }, { "epoch": 4.35, "eval_loss": 0.9784641861915588, "eval_runtime": 1.9508, "eval_samples_per_second": 79.969, "eval_steps_per_second": 4.101, "step": 4350 }, { "epoch": 4.4, "grad_norm": 1.3257871866226196, "learning_rate": 1.3746065934159123e-05, "loss": 0.1127, "step": 4400 }, { "epoch": 4.4, "eval_loss": 0.9792104959487915, "eval_runtime": 1.9569, "eval_samples_per_second": 79.718, "eval_steps_per_second": 4.088, "step": 4400 }, { "epoch": 4.45, "grad_norm": 0.785510778427124, "learning_rate": 1.3583679495453e-05, "loss": 0.1062, "step": 4450 }, { "epoch": 4.45, "eval_loss": 0.9944807291030884, "eval_runtime": 1.9553, "eval_samples_per_second": 79.785, "eval_steps_per_second": 4.092, "step": 4450 }, { "epoch": 4.5, "grad_norm": 1.6052072048187256, "learning_rate": 1.342020143325669e-05, "loss": 0.1133, "step": 4500 }, { "epoch": 4.5, "eval_loss": 0.9830677509307861, "eval_runtime": 1.9561, "eval_samples_per_second": 79.751, "eval_steps_per_second": 4.09, "step": 4500 }, { "epoch": 4.55, "grad_norm": 2.2204127311706543, "learning_rate": 1.3255681544571568e-05, "loss": 0.1117, "step": 4550 }, { "epoch": 4.55, "eval_loss": 0.9811728000640869, "eval_runtime": 1.9578, "eval_samples_per_second": 79.683, "eval_steps_per_second": 4.086, "step": 4550 }, { "epoch": 4.6, "grad_norm": 0.9036445617675781, "learning_rate": 1.3090169943749475e-05, "loss": 0.111, "step": 4600 }, { "epoch": 4.6, "eval_loss": 0.9881971478462219, "eval_runtime": 1.9525, "eval_samples_per_second": 79.9, "eval_steps_per_second": 4.097, "step": 4600 }, { "epoch": 4.65, "grad_norm": 1.132278561592102, "learning_rate": 1.2923717047227368e-05, "loss": 0.1117, "step": 4650 }, { "epoch": 4.65, "eval_loss": 0.9802563190460205, "eval_runtime": 1.9498, "eval_samples_per_second": 80.009, "eval_steps_per_second": 4.103, "step": 4650 }, { "epoch": 4.7, "grad_norm": 1.7277779579162598, "learning_rate": 1.2756373558169992e-05, "loss": 0.1129, "step": 4700 }, { "epoch": 4.7, "eval_loss": 0.9768027067184448, "eval_runtime": 1.9497, "eval_samples_per_second": 80.012, "eval_steps_per_second": 4.103, "step": 4700 }, { "epoch": 4.75, "grad_norm": 1.4018073081970215, "learning_rate": 1.2588190451025209e-05, "loss": 0.1165, "step": 4750 }, { "epoch": 4.75, "eval_loss": 0.9888935685157776, "eval_runtime": 1.9546, "eval_samples_per_second": 79.813, "eval_steps_per_second": 4.093, "step": 4750 }, { "epoch": 4.8, "grad_norm": 0.7460929155349731, "learning_rate": 1.2419218955996677e-05, "loss": 0.1171, "step": 4800 }, { "epoch": 4.8, "eval_loss": 0.982837975025177, "eval_runtime": 1.9494, "eval_samples_per_second": 80.027, "eval_steps_per_second": 4.104, "step": 4800 }, { "epoch": 4.85, "grad_norm": 1.671038031578064, "learning_rate": 1.2249510543438652e-05, "loss": 0.1103, "step": 4850 }, { "epoch": 4.85, "eval_loss": 0.9951275587081909, "eval_runtime": 1.983, "eval_samples_per_second": 78.667, "eval_steps_per_second": 4.034, "step": 4850 }, { "epoch": 4.9, "grad_norm": 1.1324496269226074, "learning_rate": 1.2079116908177592e-05, "loss": 0.1145, "step": 4900 }, { "epoch": 4.9, "eval_loss": 0.9849076867103577, "eval_runtime": 1.952, "eval_samples_per_second": 79.916, "eval_steps_per_second": 4.098, "step": 4900 }, { "epoch": 4.95, "grad_norm": 1.1482257843017578, "learning_rate": 1.190808995376545e-05, "loss": 0.1086, "step": 4950 }, { "epoch": 4.95, "eval_loss": 0.9864331483840942, "eval_runtime": 1.9545, "eval_samples_per_second": 79.814, "eval_steps_per_second": 4.093, "step": 4950 }, { "epoch": 5.0, "grad_norm": 1.353508710861206, "learning_rate": 1.1736481776669307e-05, "loss": 0.1145, "step": 5000 }, { "epoch": 5.0, "eval_loss": 0.9888507127761841, "eval_runtime": 1.9585, "eval_samples_per_second": 79.653, "eval_steps_per_second": 4.085, "step": 5000 }, { "epoch": 5.05, "grad_norm": 0.7334951162338257, "learning_rate": 1.156434465040231e-05, "loss": 0.0695, "step": 5050 }, { "epoch": 5.05, "eval_loss": 1.0386019945144653, "eval_runtime": 1.9508, "eval_samples_per_second": 79.969, "eval_steps_per_second": 4.101, "step": 5050 }, { "epoch": 5.1, "grad_norm": 0.7137899398803711, "learning_rate": 1.1391731009600655e-05, "loss": 0.0681, "step": 5100 }, { "epoch": 5.1, "eval_loss": 1.045086145401001, "eval_runtime": 1.9564, "eval_samples_per_second": 79.739, "eval_steps_per_second": 4.089, "step": 5100 }, { "epoch": 5.15, "grad_norm": 0.7475718855857849, "learning_rate": 1.1218693434051475e-05, "loss": 0.0715, "step": 5150 }, { "epoch": 5.15, "eval_loss": 1.037683367729187, "eval_runtime": 1.9879, "eval_samples_per_second": 78.476, "eval_steps_per_second": 4.024, "step": 5150 }, { "epoch": 5.2, "grad_norm": 0.575427770614624, "learning_rate": 1.1045284632676535e-05, "loss": 0.0693, "step": 5200 }, { "epoch": 5.2, "eval_loss": 1.027417540550232, "eval_runtime": 2.0052, "eval_samples_per_second": 77.799, "eval_steps_per_second": 3.99, "step": 5200 }, { "epoch": 5.25, "grad_norm": 0.9635318517684937, "learning_rate": 1.0871557427476585e-05, "loss": 0.0705, "step": 5250 }, { "epoch": 5.25, "eval_loss": 1.033168911933899, "eval_runtime": 1.9887, "eval_samples_per_second": 78.444, "eval_steps_per_second": 4.023, "step": 5250 }, { "epoch": 5.3, "grad_norm": 0.6668184399604797, "learning_rate": 1.0697564737441254e-05, "loss": 0.0681, "step": 5300 }, { "epoch": 5.3, "eval_loss": 1.0387107133865356, "eval_runtime": 1.9473, "eval_samples_per_second": 80.111, "eval_steps_per_second": 4.108, "step": 5300 }, { "epoch": 5.35, "grad_norm": 0.648227334022522, "learning_rate": 1.0523359562429441e-05, "loss": 0.0718, "step": 5350 }, { "epoch": 5.35, "eval_loss": 1.0241611003875732, "eval_runtime": 1.9878, "eval_samples_per_second": 78.478, "eval_steps_per_second": 4.024, "step": 5350 }, { "epoch": 5.4, "grad_norm": 0.889001190662384, "learning_rate": 1.0348994967025012e-05, "loss": 0.0723, "step": 5400 }, { "epoch": 5.4, "eval_loss": 1.0176992416381836, "eval_runtime": 1.9841, "eval_samples_per_second": 78.624, "eval_steps_per_second": 4.032, "step": 5400 }, { "epoch": 5.45, "grad_norm": 0.5184072256088257, "learning_rate": 1.0174524064372837e-05, "loss": 0.0701, "step": 5450 }, { "epoch": 5.45, "eval_loss": 1.026439905166626, "eval_runtime": 1.9534, "eval_samples_per_second": 79.86, "eval_steps_per_second": 4.095, "step": 5450 }, { "epoch": 5.5, "grad_norm": 0.4163142442703247, "learning_rate": 1e-05, "loss": 0.0749, "step": 5500 }, { "epoch": 5.5, "eval_loss": 1.024969220161438, "eval_runtime": 1.9903, "eval_samples_per_second": 78.382, "eval_steps_per_second": 4.02, "step": 5500 }, { "epoch": 5.55, "grad_norm": 1.023705005645752, "learning_rate": 9.825475935627165e-06, "loss": 0.0702, "step": 5550 }, { "epoch": 5.55, "eval_loss": 1.0406672954559326, "eval_runtime": 2.0097, "eval_samples_per_second": 77.625, "eval_steps_per_second": 3.981, "step": 5550 }, { "epoch": 5.6, "grad_norm": 0.7389354109764099, "learning_rate": 9.651005032974994e-06, "loss": 0.073, "step": 5600 }, { "epoch": 5.6, "eval_loss": 1.024473786354065, "eval_runtime": 1.9944, "eval_samples_per_second": 78.219, "eval_steps_per_second": 4.011, "step": 5600 }, { "epoch": 5.65, "grad_norm": 0.9039676189422607, "learning_rate": 9.476640437570562e-06, "loss": 0.0683, "step": 5650 }, { "epoch": 5.65, "eval_loss": 1.0281826257705688, "eval_runtime": 1.9915, "eval_samples_per_second": 78.332, "eval_steps_per_second": 4.017, "step": 5650 }, { "epoch": 5.7, "grad_norm": 0.7177872061729431, "learning_rate": 9.302435262558748e-06, "loss": 0.0684, "step": 5700 }, { "epoch": 5.7, "eval_loss": 1.034155011177063, "eval_runtime": 1.9561, "eval_samples_per_second": 79.749, "eval_steps_per_second": 4.09, "step": 5700 }, { "epoch": 5.75, "grad_norm": 1.8074698448181152, "learning_rate": 9.128442572523418e-06, "loss": 0.0726, "step": 5750 }, { "epoch": 5.75, "eval_loss": 1.0173975229263306, "eval_runtime": 1.9946, "eval_samples_per_second": 78.212, "eval_steps_per_second": 4.011, "step": 5750 }, { "epoch": 5.8, "grad_norm": 0.4908640384674072, "learning_rate": 8.954715367323468e-06, "loss": 0.0736, "step": 5800 }, { "epoch": 5.8, "eval_loss": 1.0302704572677612, "eval_runtime": 1.9664, "eval_samples_per_second": 79.334, "eval_steps_per_second": 4.068, "step": 5800 }, { "epoch": 5.85, "grad_norm": 0.7088440656661987, "learning_rate": 8.781306565948528e-06, "loss": 0.0768, "step": 5850 }, { "epoch": 5.85, "eval_loss": 1.015944004058838, "eval_runtime": 2.0068, "eval_samples_per_second": 77.735, "eval_steps_per_second": 3.986, "step": 5850 }, { "epoch": 5.9, "grad_norm": 0.9419664740562439, "learning_rate": 8.60826899039935e-06, "loss": 0.072, "step": 5900 }, { "epoch": 5.9, "eval_loss": 1.01549232006073, "eval_runtime": 1.9678, "eval_samples_per_second": 79.277, "eval_steps_per_second": 4.065, "step": 5900 }, { "epoch": 5.95, "grad_norm": 0.6469582319259644, "learning_rate": 8.43565534959769e-06, "loss": 0.0708, "step": 5950 }, { "epoch": 5.95, "eval_loss": 1.0359619855880737, "eval_runtime": 1.9975, "eval_samples_per_second": 78.097, "eval_steps_per_second": 4.005, "step": 5950 }, { "epoch": 6.0, "grad_norm": 0.6396550536155701, "learning_rate": 8.263518223330698e-06, "loss": 0.0718, "step": 6000 }, { "epoch": 6.0, "eval_loss": 1.0418637990951538, "eval_runtime": 1.9897, "eval_samples_per_second": 78.406, "eval_steps_per_second": 4.021, "step": 6000 }, { "epoch": 6.05, "grad_norm": 0.5852078199386597, "learning_rate": 8.091910046234552e-06, "loss": 0.055, "step": 6050 }, { "epoch": 6.05, "eval_loss": 1.0743677616119385, "eval_runtime": 1.9914, "eval_samples_per_second": 78.338, "eval_steps_per_second": 4.017, "step": 6050 }, { "epoch": 6.1, "grad_norm": 0.3007534444332123, "learning_rate": 7.92088309182241e-06, "loss": 0.0535, "step": 6100 }, { "epoch": 6.1, "eval_loss": 1.079653024673462, "eval_runtime": 1.9956, "eval_samples_per_second": 78.173, "eval_steps_per_second": 4.009, "step": 6100 }, { "epoch": 6.15, "grad_norm": 0.6570334434509277, "learning_rate": 7.750489456561351e-06, "loss": 0.054, "step": 6150 }, { "epoch": 6.15, "eval_loss": 1.0885672569274902, "eval_runtime": 1.9628, "eval_samples_per_second": 79.48, "eval_steps_per_second": 4.076, "step": 6150 }, { "epoch": 6.2, "grad_norm": 0.2993961274623871, "learning_rate": 7.580781044003324e-06, "loss": 0.0529, "step": 6200 }, { "epoch": 6.2, "eval_loss": 1.0960559844970703, "eval_runtime": 1.9867, "eval_samples_per_second": 78.521, "eval_steps_per_second": 4.027, "step": 6200 }, { "epoch": 6.25, "grad_norm": 0.41366830468177795, "learning_rate": 7.411809548974792e-06, "loss": 0.0557, "step": 6250 }, { "epoch": 6.25, "eval_loss": 1.0834628343582153, "eval_runtime": 1.9985, "eval_samples_per_second": 78.058, "eval_steps_per_second": 4.003, "step": 6250 }, { "epoch": 6.3, "grad_norm": 0.34496158361434937, "learning_rate": 7.243626441830009e-06, "loss": 0.0559, "step": 6300 }, { "epoch": 6.3, "eval_loss": 1.0905544757843018, "eval_runtime": 1.963, "eval_samples_per_second": 79.47, "eval_steps_per_second": 4.075, "step": 6300 }, { "epoch": 6.35, "grad_norm": 0.744387686252594, "learning_rate": 7.076282952772634e-06, "loss": 0.057, "step": 6350 }, { "epoch": 6.35, "eval_loss": 1.0833388566970825, "eval_runtime": 1.954, "eval_samples_per_second": 79.836, "eval_steps_per_second": 4.094, "step": 6350 }, { "epoch": 6.4, "grad_norm": 0.32570087909698486, "learning_rate": 6.909830056250527e-06, "loss": 0.0547, "step": 6400 }, { "epoch": 6.4, "eval_loss": 1.0984578132629395, "eval_runtime": 1.9615, "eval_samples_per_second": 79.53, "eval_steps_per_second": 4.078, "step": 6400 }, { "epoch": 6.45, "grad_norm": 0.19425342977046967, "learning_rate": 6.744318455428436e-06, "loss": 0.0562, "step": 6450 }, { "epoch": 6.45, "eval_loss": 1.1050516366958618, "eval_runtime": 2.0169, "eval_samples_per_second": 77.347, "eval_steps_per_second": 3.967, "step": 6450 }, { "epoch": 6.5, "grad_norm": 0.7145141363143921, "learning_rate": 6.579798566743314e-06, "loss": 0.055, "step": 6500 }, { "epoch": 6.5, "eval_loss": 1.1162112951278687, "eval_runtime": 1.9628, "eval_samples_per_second": 79.477, "eval_steps_per_second": 4.076, "step": 6500 }, { "epoch": 6.55, "grad_norm": 1.1525160074234009, "learning_rate": 6.4163205045469975e-06, "loss": 0.0571, "step": 6550 }, { "epoch": 6.55, "eval_loss": 1.1009734869003296, "eval_runtime": 1.9623, "eval_samples_per_second": 79.499, "eval_steps_per_second": 4.077, "step": 6550 }, { "epoch": 6.6, "grad_norm": 0.5341395139694214, "learning_rate": 6.25393406584088e-06, "loss": 0.0545, "step": 6600 }, { "epoch": 6.6, "eval_loss": 1.097998023033142, "eval_runtime": 1.9676, "eval_samples_per_second": 79.286, "eval_steps_per_second": 4.066, "step": 6600 }, { "epoch": 6.65, "grad_norm": 0.3178306221961975, "learning_rate": 6.092688715107265e-06, "loss": 0.0562, "step": 6650 }, { "epoch": 6.65, "eval_loss": 1.0951284170150757, "eval_runtime": 1.9714, "eval_samples_per_second": 79.132, "eval_steps_per_second": 4.058, "step": 6650 }, { "epoch": 6.7, "grad_norm": 0.4612273573875427, "learning_rate": 5.932633569242e-06, "loss": 0.056, "step": 6700 }, { "epoch": 6.7, "eval_loss": 1.0997884273529053, "eval_runtime": 1.9664, "eval_samples_per_second": 79.332, "eval_steps_per_second": 4.068, "step": 6700 }, { "epoch": 6.75, "grad_norm": 0.9741678237915039, "learning_rate": 5.773817382593008e-06, "loss": 0.0556, "step": 6750 }, { "epoch": 6.75, "eval_loss": 1.090924620628357, "eval_runtime": 1.9718, "eval_samples_per_second": 79.115, "eval_steps_per_second": 4.057, "step": 6750 }, { "epoch": 6.8, "grad_norm": 0.5491740107536316, "learning_rate": 5.616288532109225e-06, "loss": 0.0551, "step": 6800 }, { "epoch": 6.8, "eval_loss": 1.0942950248718262, "eval_runtime": 1.9768, "eval_samples_per_second": 78.916, "eval_steps_per_second": 4.047, "step": 6800 }, { "epoch": 6.85, "grad_norm": 0.21083757281303406, "learning_rate": 5.460095002604533e-06, "loss": 0.0557, "step": 6850 }, { "epoch": 6.85, "eval_loss": 1.0904380083084106, "eval_runtime": 1.9989, "eval_samples_per_second": 78.044, "eval_steps_per_second": 4.002, "step": 6850 }, { "epoch": 6.9, "grad_norm": 0.4396616220474243, "learning_rate": 5.305284372141095e-06, "loss": 0.0574, "step": 6900 }, { "epoch": 6.9, "eval_loss": 1.0884004831314087, "eval_runtime": 1.9732, "eval_samples_per_second": 79.059, "eval_steps_per_second": 4.054, "step": 6900 }, { "epoch": 6.95, "grad_norm": 0.4765956997871399, "learning_rate": 5.151903797536631e-06, "loss": 0.0565, "step": 6950 }, { "epoch": 6.95, "eval_loss": 1.0859627723693848, "eval_runtime": 1.9559, "eval_samples_per_second": 79.758, "eval_steps_per_second": 4.09, "step": 6950 }, { "epoch": 7.0, "grad_norm": 0.6419967412948608, "learning_rate": 5.000000000000003e-06, "loss": 0.0561, "step": 7000 }, { "epoch": 7.0, "eval_loss": 1.0774165391921997, "eval_runtime": 1.9538, "eval_samples_per_second": 79.845, "eval_steps_per_second": 4.095, "step": 7000 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 2.7475274864420454e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }