{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.619987269255251, "eval_steps": 50, "global_step": 5200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06365372374283895, "grad_norm": 8.618297576904297, "learning_rate": 1.2738853503184715e-06, "loss": 1.5929, "step": 50 }, { "epoch": 0.06365372374283895, "eval_loss": 0.8718269467353821, "eval_runtime": 1.9665, "eval_samples_per_second": 79.328, "eval_steps_per_second": 4.068, "step": 50 }, { "epoch": 0.1273074474856779, "grad_norm": 13.18759822845459, "learning_rate": 2.547770700636943e-06, "loss": 0.7086, "step": 100 }, { "epoch": 0.1273074474856779, "eval_loss": 0.7185815572738647, "eval_runtime": 1.9537, "eval_samples_per_second": 79.847, "eval_steps_per_second": 4.095, "step": 100 }, { "epoch": 0.19096117122851686, "grad_norm": 7.225034236907959, "learning_rate": 3.821656050955415e-06, "loss": 0.6164, "step": 150 }, { "epoch": 0.19096117122851686, "eval_loss": 0.7371894717216492, "eval_runtime": 1.9509, "eval_samples_per_second": 79.963, "eval_steps_per_second": 4.101, "step": 150 }, { "epoch": 0.2546148949713558, "grad_norm": 3.914799213409424, "learning_rate": 5.095541401273886e-06, "loss": 0.6015, "step": 200 }, { "epoch": 0.2546148949713558, "eval_loss": 0.6896682381629944, "eval_runtime": 1.9812, "eval_samples_per_second": 78.74, "eval_steps_per_second": 4.038, "step": 200 }, { "epoch": 0.31826861871419476, "grad_norm": 4.877476215362549, "learning_rate": 6.369426751592357e-06, "loss": 0.6245, "step": 250 }, { "epoch": 0.31826861871419476, "eval_loss": 0.6881433725357056, "eval_runtime": 1.9896, "eval_samples_per_second": 78.407, "eval_steps_per_second": 4.021, "step": 250 }, { "epoch": 0.3819223424570337, "grad_norm": 4.47362756729126, "learning_rate": 7.64331210191083e-06, "loss": 0.5852, "step": 300 }, { "epoch": 0.3819223424570337, "eval_loss": 0.6920483708381653, "eval_runtime": 1.9486, "eval_samples_per_second": 80.059, "eval_steps_per_second": 4.106, "step": 300 }, { "epoch": 0.44557606619987267, "grad_norm": 3.484682559967041, "learning_rate": 8.9171974522293e-06, "loss": 0.6307, "step": 350 }, { "epoch": 0.44557606619987267, "eval_loss": 0.6990864276885986, "eval_runtime": 1.9831, "eval_samples_per_second": 78.663, "eval_steps_per_second": 4.034, "step": 350 }, { "epoch": 0.5092297899427116, "grad_norm": 3.779184103012085, "learning_rate": 1.0191082802547772e-05, "loss": 0.6639, "step": 400 }, { "epoch": 0.5092297899427116, "eval_loss": 0.7543805241584778, "eval_runtime": 1.952, "eval_samples_per_second": 79.916, "eval_steps_per_second": 4.098, "step": 400 }, { "epoch": 0.5728835136855506, "grad_norm": 2.8436331748962402, "learning_rate": 1.1464968152866242e-05, "loss": 0.6431, "step": 450 }, { "epoch": 0.5728835136855506, "eval_loss": 0.7171684503555298, "eval_runtime": 1.993, "eval_samples_per_second": 78.274, "eval_steps_per_second": 4.014, "step": 450 }, { "epoch": 0.6365372374283895, "grad_norm": 3.409532070159912, "learning_rate": 1.2738853503184714e-05, "loss": 0.6183, "step": 500 }, { "epoch": 0.6365372374283895, "eval_loss": 0.7295389175415039, "eval_runtime": 1.9547, "eval_samples_per_second": 79.809, "eval_steps_per_second": 4.093, "step": 500 }, { "epoch": 0.7001909611712285, "grad_norm": 2.2884674072265625, "learning_rate": 1.4012738853503186e-05, "loss": 0.6344, "step": 550 }, { "epoch": 0.7001909611712285, "eval_loss": 0.7278215885162354, "eval_runtime": 1.9597, "eval_samples_per_second": 79.605, "eval_steps_per_second": 4.082, "step": 550 }, { "epoch": 0.7638446849140674, "grad_norm": 2.662555694580078, "learning_rate": 1.528662420382166e-05, "loss": 0.6635, "step": 600 }, { "epoch": 0.7638446849140674, "eval_loss": 0.7314972281455994, "eval_runtime": 1.9527, "eval_samples_per_second": 79.889, "eval_steps_per_second": 4.097, "step": 600 }, { "epoch": 0.8274984086569064, "grad_norm": 2.6137986183166504, "learning_rate": 1.6560509554140128e-05, "loss": 0.7083, "step": 650 }, { "epoch": 0.8274984086569064, "eval_loss": 0.7786588668823242, "eval_runtime": 1.9552, "eval_samples_per_second": 79.787, "eval_steps_per_second": 4.092, "step": 650 }, { "epoch": 0.8911521323997453, "grad_norm": 3.339657783508301, "learning_rate": 1.78343949044586e-05, "loss": 0.6841, "step": 700 }, { "epoch": 0.8911521323997453, "eval_loss": 0.752537727355957, "eval_runtime": 1.9503, "eval_samples_per_second": 79.986, "eval_steps_per_second": 4.102, "step": 700 }, { "epoch": 0.9548058561425843, "grad_norm": 2.102302312850952, "learning_rate": 1.910828025477707e-05, "loss": 0.6622, "step": 750 }, { "epoch": 0.9548058561425843, "eval_loss": 0.7738191485404968, "eval_runtime": 1.9545, "eval_samples_per_second": 79.818, "eval_steps_per_second": 4.093, "step": 750 }, { "epoch": 1.0184595798854232, "grad_norm": 4.052014350891113, "learning_rate": 1.999977755311665e-05, "loss": 0.6277, "step": 800 }, { "epoch": 1.0184595798854232, "eval_loss": 0.7977485656738281, "eval_runtime": 1.9587, "eval_samples_per_second": 79.643, "eval_steps_per_second": 4.084, "step": 800 }, { "epoch": 1.0821133036282622, "grad_norm": 2.9770467281341553, "learning_rate": 1.99958232171617e-05, "loss": 0.4164, "step": 850 }, { "epoch": 1.0821133036282622, "eval_loss": 0.8168254494667053, "eval_runtime": 1.9783, "eval_samples_per_second": 78.855, "eval_steps_per_second": 4.044, "step": 850 }, { "epoch": 1.1457670273711011, "grad_norm": 2.4140615463256836, "learning_rate": 1.9986927867052226e-05, "loss": 0.3963, "step": 900 }, { "epoch": 1.1457670273711011, "eval_loss": 0.8188236355781555, "eval_runtime": 1.9525, "eval_samples_per_second": 79.898, "eval_steps_per_second": 4.097, "step": 900 }, { "epoch": 1.20942075111394, "grad_norm": 2.735466718673706, "learning_rate": 1.997309589982985e-05, "loss": 0.4199, "step": 950 }, { "epoch": 1.20942075111394, "eval_loss": 0.812458336353302, "eval_runtime": 1.9598, "eval_samples_per_second": 79.601, "eval_steps_per_second": 4.082, "step": 950 }, { "epoch": 1.273074474856779, "grad_norm": 2.7539219856262207, "learning_rate": 1.995433415274493e-05, "loss": 0.4159, "step": 1000 }, { "epoch": 1.273074474856779, "eval_loss": 0.80951327085495, "eval_runtime": 1.9753, "eval_samples_per_second": 78.975, "eval_steps_per_second": 4.05, "step": 1000 }, { "epoch": 1.336728198599618, "grad_norm": 2.275360584259033, "learning_rate": 1.993065189987684e-05, "loss": 0.4107, "step": 1050 }, { "epoch": 1.336728198599618, "eval_loss": 0.8287139534950256, "eval_runtime": 1.9574, "eval_samples_per_second": 79.696, "eval_steps_per_second": 4.087, "step": 1050 }, { "epoch": 1.400381922342457, "grad_norm": 2.8758232593536377, "learning_rate": 1.9902060847549716e-05, "loss": 0.434, "step": 1100 }, { "epoch": 1.400381922342457, "eval_loss": 0.8138644695281982, "eval_runtime": 1.9536, "eval_samples_per_second": 79.852, "eval_steps_per_second": 4.095, "step": 1100 }, { "epoch": 1.464035646085296, "grad_norm": 2.412508964538574, "learning_rate": 1.986857512854594e-05, "loss": 0.4443, "step": 1150 }, { "epoch": 1.464035646085296, "eval_loss": 0.8119045495986938, "eval_runtime": 1.9894, "eval_samples_per_second": 78.417, "eval_steps_per_second": 4.021, "step": 1150 }, { "epoch": 1.5276893698281349, "grad_norm": 1.9511016607284546, "learning_rate": 1.983021129512019e-05, "loss": 0.4203, "step": 1200 }, { "epoch": 1.5276893698281349, "eval_loss": 0.8141614198684692, "eval_runtime": 1.9602, "eval_samples_per_second": 79.584, "eval_steps_per_second": 4.081, "step": 1200 }, { "epoch": 1.5913430935709738, "grad_norm": 2.6505215167999268, "learning_rate": 1.9786988310817523e-05, "loss": 0.4385, "step": 1250 }, { "epoch": 1.5913430935709738, "eval_loss": 0.8097578883171082, "eval_runtime": 1.9831, "eval_samples_per_second": 78.665, "eval_steps_per_second": 4.034, "step": 1250 }, { "epoch": 1.6549968173138128, "grad_norm": 2.4782555103302, "learning_rate": 1.973892754109955e-05, "loss": 0.4625, "step": 1300 }, { "epoch": 1.6549968173138128, "eval_loss": 0.8060766458511353, "eval_runtime": 1.953, "eval_samples_per_second": 79.877, "eval_steps_per_second": 4.096, "step": 1300 }, { "epoch": 1.7186505410566517, "grad_norm": 2.4227771759033203, "learning_rate": 1.9686052742783324e-05, "loss": 0.4417, "step": 1350 }, { "epoch": 1.7186505410566517, "eval_loss": 0.8126944303512573, "eval_runtime": 2.1634, "eval_samples_per_second": 72.108, "eval_steps_per_second": 3.698, "step": 1350 }, { "epoch": 1.7823042647994907, "grad_norm": 2.3015785217285156, "learning_rate": 1.9628390052298155e-05, "loss": 0.4483, "step": 1400 }, { "epoch": 1.7823042647994907, "eval_loss": 0.8172005414962769, "eval_runtime": 1.9944, "eval_samples_per_second": 78.219, "eval_steps_per_second": 4.011, "step": 1400 }, { "epoch": 1.8459579885423296, "grad_norm": 2.7847931385040283, "learning_rate": 1.9565967972766164e-05, "loss": 0.4551, "step": 1450 }, { "epoch": 1.8459579885423296, "eval_loss": 0.8149409890174866, "eval_runtime": 1.9515, "eval_samples_per_second": 79.937, "eval_steps_per_second": 4.099, "step": 1450 }, { "epoch": 1.9096117122851686, "grad_norm": 2.573996067047119, "learning_rate": 1.9498817359912973e-05, "loss": 0.4526, "step": 1500 }, { "epoch": 1.9096117122851686, "eval_loss": 0.8210792541503906, "eval_runtime": 1.957, "eval_samples_per_second": 79.715, "eval_steps_per_second": 4.088, "step": 1500 }, { "epoch": 1.9732654360280075, "grad_norm": 2.4847145080566406, "learning_rate": 1.9426971406815464e-05, "loss": 0.436, "step": 1550 }, { "epoch": 1.9732654360280075, "eval_loss": 0.8161709904670715, "eval_runtime": 1.9565, "eval_samples_per_second": 79.732, "eval_steps_per_second": 4.089, "step": 1550 }, { "epoch": 2.0369191597708465, "grad_norm": 2.272801637649536, "learning_rate": 1.9350465627494196e-05, "loss": 0.3299, "step": 1600 }, { "epoch": 2.0369191597708465, "eval_loss": 0.8792819380760193, "eval_runtime": 1.9865, "eval_samples_per_second": 78.531, "eval_steps_per_second": 4.027, "step": 1600 }, { "epoch": 2.1005728835136854, "grad_norm": 2.545703172683716, "learning_rate": 1.926933783935853e-05, "loss": 0.2247, "step": 1650 }, { "epoch": 2.1005728835136854, "eval_loss": 0.89618319272995, "eval_runtime": 1.9719, "eval_samples_per_second": 79.113, "eval_steps_per_second": 4.057, "step": 1650 }, { "epoch": 2.1642266072565244, "grad_norm": 1.6922868490219116, "learning_rate": 1.9183628144513196e-05, "loss": 0.2404, "step": 1700 }, { "epoch": 2.1642266072565244, "eval_loss": 0.8865496516227722, "eval_runtime": 1.9548, "eval_samples_per_second": 79.805, "eval_steps_per_second": 4.093, "step": 1700 }, { "epoch": 2.2278803309993633, "grad_norm": 1.7290230989456177, "learning_rate": 1.9093378909935503e-05, "loss": 0.226, "step": 1750 }, { "epoch": 2.2278803309993633, "eval_loss": 0.9072224497795105, "eval_runtime": 1.979, "eval_samples_per_second": 78.828, "eval_steps_per_second": 4.042, "step": 1750 }, { "epoch": 2.2915340547422023, "grad_norm": 1.6177765130996704, "learning_rate": 1.8998634746533004e-05, "loss": 0.2395, "step": 1800 }, { "epoch": 2.2915340547422023, "eval_loss": 0.9168246388435364, "eval_runtime": 1.956, "eval_samples_per_second": 79.753, "eval_steps_per_second": 4.09, "step": 1800 }, { "epoch": 2.3551877784850412, "grad_norm": 2.4058384895324707, "learning_rate": 1.889944248709198e-05, "loss": 0.2425, "step": 1850 }, { "epoch": 2.3551877784850412, "eval_loss": 0.9104910492897034, "eval_runtime": 1.9546, "eval_samples_per_second": 79.811, "eval_steps_per_second": 4.093, "step": 1850 }, { "epoch": 2.41884150222788, "grad_norm": 1.7686213254928589, "learning_rate": 1.8795851163127626e-05, "loss": 0.2456, "step": 1900 }, { "epoch": 2.41884150222788, "eval_loss": 0.9102558493614197, "eval_runtime": 2.0139, "eval_samples_per_second": 77.463, "eval_steps_per_second": 3.972, "step": 1900 }, { "epoch": 2.482495225970719, "grad_norm": 2.483212947845459, "learning_rate": 1.8687911980647375e-05, "loss": 0.241, "step": 1950 }, { "epoch": 2.482495225970719, "eval_loss": 0.930452287197113, "eval_runtime": 1.964, "eval_samples_per_second": 79.428, "eval_steps_per_second": 4.073, "step": 1950 }, { "epoch": 2.546148949713558, "grad_norm": 2.0306715965270996, "learning_rate": 1.857567829483937e-05, "loss": 0.2529, "step": 2000 }, { "epoch": 2.546148949713558, "eval_loss": 0.9052493572235107, "eval_runtime": 1.9617, "eval_samples_per_second": 79.522, "eval_steps_per_second": 4.078, "step": 2000 }, { "epoch": 2.609802673456397, "grad_norm": 2.4319934844970703, "learning_rate": 1.8459205583698598e-05, "loss": 0.2479, "step": 2050 }, { "epoch": 2.609802673456397, "eval_loss": 0.909498929977417, "eval_runtime": 1.9526, "eval_samples_per_second": 79.894, "eval_steps_per_second": 4.097, "step": 2050 }, { "epoch": 2.673456397199236, "grad_norm": 1.3272260427474976, "learning_rate": 1.833855142060364e-05, "loss": 0.2411, "step": 2100 }, { "epoch": 2.673456397199236, "eval_loss": 0.9212460517883301, "eval_runtime": 1.9521, "eval_samples_per_second": 79.913, "eval_steps_per_second": 4.098, "step": 2100 }, { "epoch": 2.737110120942075, "grad_norm": 1.639654278755188, "learning_rate": 1.8213775445857716e-05, "loss": 0.2392, "step": 2150 }, { "epoch": 2.737110120942075, "eval_loss": 0.911665678024292, "eval_runtime": 1.9821, "eval_samples_per_second": 78.704, "eval_steps_per_second": 4.036, "step": 2150 }, { "epoch": 2.800763844684914, "grad_norm": 1.5815492868423462, "learning_rate": 1.808493933720802e-05, "loss": 0.2568, "step": 2200 }, { "epoch": 2.800763844684914, "eval_loss": 0.910538375377655, "eval_runtime": 1.9947, "eval_samples_per_second": 78.207, "eval_steps_per_second": 4.011, "step": 2200 }, { "epoch": 2.864417568427753, "grad_norm": 1.519374132156372, "learning_rate": 1.7952106779357922e-05, "loss": 0.2459, "step": 2250 }, { "epoch": 2.864417568427753, "eval_loss": 0.9174743890762329, "eval_runtime": 1.9547, "eval_samples_per_second": 79.81, "eval_steps_per_second": 4.093, "step": 2250 }, { "epoch": 2.928071292170592, "grad_norm": 2.186408042907715, "learning_rate": 1.7815343432487094e-05, "loss": 0.2521, "step": 2300 }, { "epoch": 2.928071292170592, "eval_loss": 0.9191064238548279, "eval_runtime": 1.972, "eval_samples_per_second": 79.107, "eval_steps_per_second": 4.057, "step": 2300 }, { "epoch": 2.991725015913431, "grad_norm": 1.9706707000732422, "learning_rate": 1.7674716899795205e-05, "loss": 0.2462, "step": 2350 }, { "epoch": 2.991725015913431, "eval_loss": 0.9158605933189392, "eval_runtime": 1.9566, "eval_samples_per_second": 79.73, "eval_steps_per_second": 4.089, "step": 2350 }, { "epoch": 3.0553787396562697, "grad_norm": 2.1943814754486084, "learning_rate": 1.753029669408509e-05, "loss": 0.1712, "step": 2400 }, { "epoch": 3.0553787396562697, "eval_loss": 0.9773328900337219, "eval_runtime": 1.9608, "eval_samples_per_second": 79.559, "eval_steps_per_second": 4.08, "step": 2400 }, { "epoch": 3.1190324633991087, "grad_norm": 1.6751331090927124, "learning_rate": 1.738215420340205e-05, "loss": 0.1609, "step": 2450 }, { "epoch": 3.1190324633991087, "eval_loss": 0.9757582545280457, "eval_runtime": 1.9922, "eval_samples_per_second": 78.306, "eval_steps_per_second": 4.016, "step": 2450 }, { "epoch": 3.1826861871419476, "grad_norm": 1.4819265604019165, "learning_rate": 1.723036265574616e-05, "loss": 0.1684, "step": 2500 }, { "epoch": 3.1826861871419476, "eval_loss": 0.9819521903991699, "eval_runtime": 1.9611, "eval_samples_per_second": 79.546, "eval_steps_per_second": 4.079, "step": 2500 }, { "epoch": 3.2463399108847866, "grad_norm": 1.6287983655929565, "learning_rate": 1.7074997082875113e-05, "loss": 0.1677, "step": 2550 }, { "epoch": 3.2463399108847866, "eval_loss": 0.9783702492713928, "eval_runtime": 1.9577, "eval_samples_per_second": 79.684, "eval_steps_per_second": 4.086, "step": 2550 }, { "epoch": 3.3099936346276255, "grad_norm": 0.7324376106262207, "learning_rate": 1.6916134283215412e-05, "loss": 0.1705, "step": 2600 }, { "epoch": 3.3099936346276255, "eval_loss": 0.9991599917411804, "eval_runtime": 1.9593, "eval_samples_per_second": 79.622, "eval_steps_per_second": 4.083, "step": 2600 }, { "epoch": 3.373647358370465, "grad_norm": 2.1236824989318848, "learning_rate": 1.6753852783900306e-05, "loss": 0.1679, "step": 2650 }, { "epoch": 3.373647358370465, "eval_loss": 0.9751440286636353, "eval_runtime": 1.9544, "eval_samples_per_second": 79.821, "eval_steps_per_second": 4.093, "step": 2650 }, { "epoch": 3.4373010821133034, "grad_norm": 1.1708719730377197, "learning_rate": 1.6588232801953223e-05, "loss": 0.1668, "step": 2700 }, { "epoch": 3.4373010821133034, "eval_loss": 0.9651870727539062, "eval_runtime": 1.9795, "eval_samples_per_second": 78.809, "eval_steps_per_second": 4.041, "step": 2700 }, { "epoch": 3.500954805856143, "grad_norm": 1.5355838537216187, "learning_rate": 1.641935620463584e-05, "loss": 0.1673, "step": 2750 }, { "epoch": 3.500954805856143, "eval_loss": 0.9585145115852356, "eval_runtime": 1.9555, "eval_samples_per_second": 79.775, "eval_steps_per_second": 4.091, "step": 2750 }, { "epoch": 3.5646085295989813, "grad_norm": 1.21344792842865, "learning_rate": 1.6247306468980475e-05, "loss": 0.1691, "step": 2800 }, { "epoch": 3.5646085295989813, "eval_loss": 0.9902192950248718, "eval_runtime": 1.9655, "eval_samples_per_second": 79.368, "eval_steps_per_second": 4.07, "step": 2800 }, { "epoch": 3.6282622533418207, "grad_norm": 1.3404573202133179, "learning_rate": 1.607216864052672e-05, "loss": 0.1707, "step": 2850 }, { "epoch": 3.6282622533418207, "eval_loss": 0.9820557832717896, "eval_runtime": 1.957, "eval_samples_per_second": 79.716, "eval_steps_per_second": 4.088, "step": 2850 }, { "epoch": 3.6919159770846592, "grad_norm": 1.7005574703216553, "learning_rate": 1.589402929128276e-05, "loss": 0.1694, "step": 2900 }, { "epoch": 3.6919159770846592, "eval_loss": 0.9860473871231079, "eval_runtime": 1.9577, "eval_samples_per_second": 79.685, "eval_steps_per_second": 4.086, "step": 2900 }, { "epoch": 3.7555697008274986, "grad_norm": 3.1085927486419678, "learning_rate": 1.571297647693215e-05, "loss": 0.1693, "step": 2950 }, { "epoch": 3.7555697008274986, "eval_loss": 0.9597901701927185, "eval_runtime": 1.9583, "eval_samples_per_second": 79.66, "eval_steps_per_second": 4.085, "step": 2950 }, { "epoch": 3.819223424570337, "grad_norm": 1.1406304836273193, "learning_rate": 1.5529099693307205e-05, "loss": 0.1708, "step": 3000 }, { "epoch": 3.819223424570337, "eval_loss": 0.9565241932868958, "eval_runtime": 1.9592, "eval_samples_per_second": 79.623, "eval_steps_per_second": 4.083, "step": 3000 }, { "epoch": 3.8828771483131765, "grad_norm": 1.1178061962127686, "learning_rate": 1.534248983215048e-05, "loss": 0.1671, "step": 3050 }, { "epoch": 3.8828771483131765, "eval_loss": 0.9801331758499146, "eval_runtime": 1.9614, "eval_samples_per_second": 79.536, "eval_steps_per_second": 4.079, "step": 3050 }, { "epoch": 3.946530872056015, "grad_norm": 1.7261744737625122, "learning_rate": 1.5153239136186297e-05, "loss": 0.1603, "step": 3100 }, { "epoch": 3.946530872056015, "eval_loss": 0.9833694100379944, "eval_runtime": 1.9582, "eval_samples_per_second": 79.664, "eval_steps_per_second": 4.085, "step": 3100 }, { "epoch": 4.0101845957988544, "grad_norm": 1.4014726877212524, "learning_rate": 1.4961441153524421e-05, "loss": 0.1604, "step": 3150 }, { "epoch": 4.0101845957988544, "eval_loss": 1.0256284475326538, "eval_runtime": 1.9881, "eval_samples_per_second": 78.467, "eval_steps_per_second": 4.024, "step": 3150 }, { "epoch": 4.073838319541693, "grad_norm": 0.9033142924308777, "learning_rate": 1.47671906914185e-05, "loss": 0.1046, "step": 3200 }, { "epoch": 4.073838319541693, "eval_loss": 1.0567926168441772, "eval_runtime": 1.9551, "eval_samples_per_second": 79.79, "eval_steps_per_second": 4.092, "step": 3200 }, { "epoch": 4.137492043284532, "grad_norm": 0.9893561005592346, "learning_rate": 1.4570583769402085e-05, "loss": 0.1069, "step": 3250 }, { "epoch": 4.137492043284532, "eval_loss": 1.0297176837921143, "eval_runtime": 1.9549, "eval_samples_per_second": 79.801, "eval_steps_per_second": 4.092, "step": 3250 }, { "epoch": 4.201145767027371, "grad_norm": 1.3346303701400757, "learning_rate": 1.437171757182542e-05, "loss": 0.1032, "step": 3300 }, { "epoch": 4.201145767027371, "eval_loss": 1.0205018520355225, "eval_runtime": 1.9586, "eval_samples_per_second": 79.648, "eval_steps_per_second": 4.084, "step": 3300 }, { "epoch": 4.26479949077021, "grad_norm": 0.8845686316490173, "learning_rate": 1.4170690399816469e-05, "loss": 0.1038, "step": 3350 }, { "epoch": 4.26479949077021, "eval_loss": 1.027697205543518, "eval_runtime": 1.9567, "eval_samples_per_second": 79.727, "eval_steps_per_second": 4.089, "step": 3350 }, { "epoch": 4.328453214513049, "grad_norm": 0.8891485333442688, "learning_rate": 1.3967601622689864e-05, "loss": 0.1122, "step": 3400 }, { "epoch": 4.328453214513049, "eval_loss": 1.0382416248321533, "eval_runtime": 1.9585, "eval_samples_per_second": 79.654, "eval_steps_per_second": 4.085, "step": 3400 }, { "epoch": 4.392106938255888, "grad_norm": 1.3972492218017578, "learning_rate": 1.3762551628827892e-05, "loss": 0.1086, "step": 3450 }, { "epoch": 4.392106938255888, "eval_loss": 1.0479986667633057, "eval_runtime": 1.9854, "eval_samples_per_second": 78.575, "eval_steps_per_second": 4.03, "step": 3450 }, { "epoch": 4.455760661998727, "grad_norm": 0.9529390335083008, "learning_rate": 1.3555641776057729e-05, "loss": 0.1086, "step": 3500 }, { "epoch": 4.455760661998727, "eval_loss": 1.026903748512268, "eval_runtime": 1.9801, "eval_samples_per_second": 78.783, "eval_steps_per_second": 4.04, "step": 3500 }, { "epoch": 4.519414385741566, "grad_norm": 0.9916590452194214, "learning_rate": 1.3346974341549448e-05, "loss": 0.1082, "step": 3550 }, { "epoch": 4.519414385741566, "eval_loss": 1.0392361879348755, "eval_runtime": 1.9611, "eval_samples_per_second": 79.546, "eval_steps_per_second": 4.079, "step": 3550 }, { "epoch": 4.583068109484405, "grad_norm": 1.4986087083816528, "learning_rate": 1.3136652471259624e-05, "loss": 0.1067, "step": 3600 }, { "epoch": 4.583068109484405, "eval_loss": 1.014196753501892, "eval_runtime": 1.9787, "eval_samples_per_second": 78.839, "eval_steps_per_second": 4.043, "step": 3600 }, { "epoch": 4.646721833227244, "grad_norm": 1.2164666652679443, "learning_rate": 1.2924780128945473e-05, "loss": 0.1124, "step": 3650 }, { "epoch": 4.646721833227244, "eval_loss": 1.0109295845031738, "eval_runtime": 1.9575, "eval_samples_per_second": 79.695, "eval_steps_per_second": 4.087, "step": 3650 }, { "epoch": 4.7103755569700825, "grad_norm": 0.980490505695343, "learning_rate": 1.2711462044774747e-05, "loss": 0.1079, "step": 3700 }, { "epoch": 4.7103755569700825, "eval_loss": 1.025646686553955, "eval_runtime": 1.9514, "eval_samples_per_second": 79.941, "eval_steps_per_second": 4.1, "step": 3700 }, { "epoch": 4.774029280712922, "grad_norm": 1.684493064880371, "learning_rate": 1.249680366355678e-05, "loss": 0.1106, "step": 3750 }, { "epoch": 4.774029280712922, "eval_loss": 1.0292022228240967, "eval_runtime": 1.9487, "eval_samples_per_second": 80.051, "eval_steps_per_second": 4.105, "step": 3750 }, { "epoch": 4.83768300445576, "grad_norm": 0.6794301867485046, "learning_rate": 1.2280911092620298e-05, "loss": 0.108, "step": 3800 }, { "epoch": 4.83768300445576, "eval_loss": 1.012285828590393, "eval_runtime": 1.9548, "eval_samples_per_second": 79.806, "eval_steps_per_second": 4.093, "step": 3800 }, { "epoch": 4.9013367281986, "grad_norm": 1.613420009613037, "learning_rate": 1.2063891049363725e-05, "loss": 0.1078, "step": 3850 }, { "epoch": 4.9013367281986, "eval_loss": 1.029428482055664, "eval_runtime": 1.9459, "eval_samples_per_second": 80.167, "eval_steps_per_second": 4.111, "step": 3850 }, { "epoch": 4.964990451941438, "grad_norm": 0.8637195825576782, "learning_rate": 1.1845850808503939e-05, "loss": 0.1012, "step": 3900 }, { "epoch": 4.964990451941438, "eval_loss": 1.0551037788391113, "eval_runtime": 1.9584, "eval_samples_per_second": 79.656, "eval_steps_per_second": 4.085, "step": 3900 }, { "epoch": 5.028644175684278, "grad_norm": 1.0889546871185303, "learning_rate": 1.1626898149049523e-05, "loss": 0.0913, "step": 3950 }, { "epoch": 5.028644175684278, "eval_loss": 1.0936988592147827, "eval_runtime": 1.9524, "eval_samples_per_second": 79.901, "eval_steps_per_second": 4.097, "step": 3950 }, { "epoch": 5.092297899427116, "grad_norm": 1.0244091749191284, "learning_rate": 1.1407141301024762e-05, "loss": 0.0699, "step": 4000 }, { "epoch": 5.092297899427116, "eval_loss": 1.0808786153793335, "eval_runtime": 1.9799, "eval_samples_per_second": 78.793, "eval_steps_per_second": 4.041, "step": 4000 }, { "epoch": 5.155951623169956, "grad_norm": 0.9700921773910522, "learning_rate": 1.1186688891970686e-05, "loss": 0.0685, "step": 4050 }, { "epoch": 5.155951623169956, "eval_loss": 1.0664215087890625, "eval_runtime": 1.9478, "eval_samples_per_second": 80.091, "eval_steps_per_second": 4.107, "step": 4050 }, { "epoch": 5.219605346912794, "grad_norm": 0.8142929077148438, "learning_rate": 1.0965649893249619e-05, "loss": 0.0667, "step": 4100 }, { "epoch": 5.219605346912794, "eval_loss": 1.0777896642684937, "eval_runtime": 1.9828, "eval_samples_per_second": 78.676, "eval_steps_per_second": 4.035, "step": 4100 }, { "epoch": 5.2832590706556335, "grad_norm": 0.47455140948295593, "learning_rate": 1.074413356617978e-05, "loss": 0.0689, "step": 4150 }, { "epoch": 5.2832590706556335, "eval_loss": 1.0918174982070923, "eval_runtime": 1.9937, "eval_samples_per_second": 78.247, "eval_steps_per_second": 4.013, "step": 4150 }, { "epoch": 5.346912794398472, "grad_norm": 0.5227736234664917, "learning_rate": 1.0522249408026553e-05, "loss": 0.0662, "step": 4200 }, { "epoch": 5.346912794398472, "eval_loss": 1.0910652875900269, "eval_runtime": 1.9443, "eval_samples_per_second": 80.236, "eval_steps_per_second": 4.115, "step": 4200 }, { "epoch": 5.410566518141311, "grad_norm": 0.9572489261627197, "learning_rate": 1.0300107097877114e-05, "loss": 0.066, "step": 4250 }, { "epoch": 5.410566518141311, "eval_loss": 1.0842642784118652, "eval_runtime": 1.9976, "eval_samples_per_second": 78.093, "eval_steps_per_second": 4.005, "step": 4250 }, { "epoch": 5.47422024188415, "grad_norm": 1.3733848333358765, "learning_rate": 1.0077816442425216e-05, "loss": 0.0717, "step": 4300 }, { "epoch": 5.47422024188415, "eval_loss": 1.0852998495101929, "eval_runtime": 1.9568, "eval_samples_per_second": 79.722, "eval_steps_per_second": 4.088, "step": 4300 }, { "epoch": 5.537873965626989, "grad_norm": 0.3845078945159912, "learning_rate": 9.85548732169286e-06, "loss": 0.0668, "step": 4350 }, { "epoch": 5.537873965626989, "eval_loss": 1.0928053855895996, "eval_runtime": 1.9824, "eval_samples_per_second": 78.692, "eval_steps_per_second": 4.036, "step": 4350 }, { "epoch": 5.601527689369828, "grad_norm": 0.6611254215240479, "learning_rate": 9.633229634715734e-06, "loss": 0.0683, "step": 4400 }, { "epoch": 5.601527689369828, "eval_loss": 1.08696448802948, "eval_runtime": 1.948, "eval_samples_per_second": 80.082, "eval_steps_per_second": 4.107, "step": 4400 }, { "epoch": 5.665181413112667, "grad_norm": 0.2818908095359802, "learning_rate": 9.411153245219262e-06, "loss": 0.0672, "step": 4450 }, { "epoch": 5.665181413112667, "eval_loss": 1.0971331596374512, "eval_runtime": 1.9492, "eval_samples_per_second": 80.034, "eval_steps_per_second": 4.104, "step": 4450 }, { "epoch": 5.728835136855506, "grad_norm": 0.5571719408035278, "learning_rate": 9.18936792731209e-06, "loss": 0.0716, "step": 4500 }, { "epoch": 5.728835136855506, "eval_loss": 1.084871530532837, "eval_runtime": 1.9919, "eval_samples_per_second": 78.318, "eval_steps_per_second": 4.016, "step": 4500 }, { "epoch": 5.792488860598345, "grad_norm": 0.5682917237281799, "learning_rate": 8.967983311223898e-06, "loss": 0.0699, "step": 4550 }, { "epoch": 5.792488860598345, "eval_loss": 1.092268943786621, "eval_runtime": 2.0128, "eval_samples_per_second": 77.502, "eval_steps_per_second": 3.974, "step": 4550 }, { "epoch": 5.856142584341184, "grad_norm": 0.7702981233596802, "learning_rate": 8.747108829114284e-06, "loss": 0.0731, "step": 4600 }, { "epoch": 5.856142584341184, "eval_loss": 1.0890579223632812, "eval_runtime": 1.9571, "eval_samples_per_second": 79.711, "eval_steps_per_second": 4.088, "step": 4600 }, { "epoch": 5.919796308084023, "grad_norm": 0.8630861043930054, "learning_rate": 8.526853660979609e-06, "loss": 0.0664, "step": 4650 }, { "epoch": 5.919796308084023, "eval_loss": 1.0908620357513428, "eval_runtime": 1.962, "eval_samples_per_second": 79.511, "eval_steps_per_second": 4.077, "step": 4650 }, { "epoch": 5.9834500318268615, "grad_norm": 1.2592061758041382, "learning_rate": 8.30732668068446e-06, "loss": 0.0723, "step": 4700 }, { "epoch": 5.9834500318268615, "eval_loss": 1.0808950662612915, "eval_runtime": 1.9536, "eval_samples_per_second": 79.853, "eval_steps_per_second": 4.095, "step": 4700 }, { "epoch": 6.047103755569701, "grad_norm": 0.3191361725330353, "learning_rate": 8.088636402144442e-06, "loss": 0.0584, "step": 4750 }, { "epoch": 6.047103755569701, "eval_loss": 1.1153998374938965, "eval_runtime": 1.9605, "eval_samples_per_second": 79.571, "eval_steps_per_second": 4.081, "step": 4750 }, { "epoch": 6.110757479312539, "grad_norm": 0.5758917927742004, "learning_rate": 7.870890925686875e-06, "loss": 0.0528, "step": 4800 }, { "epoch": 6.110757479312539, "eval_loss": 1.1246304512023926, "eval_runtime": 1.9862, "eval_samples_per_second": 78.541, "eval_steps_per_second": 4.028, "step": 4800 }, { "epoch": 6.174411203055379, "grad_norm": 0.8011472821235657, "learning_rate": 7.654197884615991e-06, "loss": 0.0549, "step": 4850 }, { "epoch": 6.174411203055379, "eval_loss": 1.1302942037582397, "eval_runtime": 1.9621, "eval_samples_per_second": 79.507, "eval_steps_per_second": 4.077, "step": 4850 }, { "epoch": 6.238064926798217, "grad_norm": 0.45742979645729065, "learning_rate": 7.438664392008903e-06, "loss": 0.0548, "step": 4900 }, { "epoch": 6.238064926798217, "eval_loss": 1.1371703147888184, "eval_runtime": 1.9865, "eval_samples_per_second": 78.531, "eval_steps_per_second": 4.027, "step": 4900 }, { "epoch": 6.301718650541057, "grad_norm": 1.6437746286392212, "learning_rate": 7.224396987768785e-06, "loss": 0.0552, "step": 4950 }, { "epoch": 6.301718650541057, "eval_loss": 1.1244136095046997, "eval_runtime": 1.9966, "eval_samples_per_second": 78.133, "eval_steps_per_second": 4.007, "step": 4950 }, { "epoch": 6.365372374283895, "grad_norm": 0.2734375298023224, "learning_rate": 7.011501585961369e-06, "loss": 0.053, "step": 5000 }, { "epoch": 6.365372374283895, "eval_loss": 1.1277772188186646, "eval_runtime": 1.9932, "eval_samples_per_second": 78.265, "eval_steps_per_second": 4.014, "step": 5000 }, { "epoch": 6.429026098026735, "grad_norm": 0.3379392623901367, "learning_rate": 6.800083422460766e-06, "loss": 0.0556, "step": 5050 }, { "epoch": 6.429026098026735, "eval_loss": 1.1319537162780762, "eval_runtime": 1.9815, "eval_samples_per_second": 78.728, "eval_steps_per_second": 4.037, "step": 5050 }, { "epoch": 6.492679821769573, "grad_norm": 0.34678924083709717, "learning_rate": 6.590247002930567e-06, "loss": 0.0574, "step": 5100 }, { "epoch": 6.492679821769573, "eval_loss": 1.112044334411621, "eval_runtime": 1.9568, "eval_samples_per_second": 79.721, "eval_steps_per_second": 4.088, "step": 5100 }, { "epoch": 6.5563335455124125, "grad_norm": 0.6663870811462402, "learning_rate": 6.382096051165847e-06, "loss": 0.0536, "step": 5150 }, { "epoch": 6.5563335455124125, "eval_loss": 1.135048747062683, "eval_runtime": 1.955, "eval_samples_per_second": 79.795, "eval_steps_per_second": 4.092, "step": 5150 }, { "epoch": 6.619987269255251, "grad_norm": 0.5493341088294983, "learning_rate": 6.175733457821691e-06, "loss": 0.0548, "step": 5200 }, { "epoch": 6.619987269255251, "eval_loss": 1.1426846981048584, "eval_runtime": 1.9543, "eval_samples_per_second": 79.823, "eval_steps_per_second": 4.094, "step": 5200 } ], "logging_steps": 50, "max_steps": 7850, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 400, "total_flos": 2.0478057288984166e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }